diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..b257477a25c1efb2327a6e89111ed6b30f93b962 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +peft/examples/hra_dreambooth/a_purple_qwe_backpack.png filter=lfs diff=lfs merge=lfs -text diff --git a/peft/.github/ISSUE_TEMPLATE/bug-report.yml b/peft/.github/ISSUE_TEMPLATE/bug-report.yml new file mode 100644 index 0000000000000000000000000000000000000000..82ad94acbca3dbeb54bf360c3b93462d75b22b82 --- /dev/null +++ b/peft/.github/ISSUE_TEMPLATE/bug-report.yml @@ -0,0 +1,54 @@ +name: "\U0001F41B Bug Report" +description: Submit a bug report to help us improve the library +body: + - type: textarea + id: system-info + attributes: + label: System Info + description: Please share your relevant system information with us + placeholder: peft & accelerate & transformers version, platform, python version, ... + validations: + required: true + + - type: textarea + id: who-can-help + attributes: + label: Who can help? + description: | + Your issue will be replied to more quickly if you can figure out the right person to tag with @. + If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**. + + All issues are read by one of the core maintainers, so if you don't know who to tag, just leave this blank and + a core maintainer will ping the right person. + + Please tag fewer than 3 people. + + Library: @benjaminbossan @githubnemo + + diffusers integration: @benjaminbossan @sayakpaul + + Documentation: @stevhliu + + placeholder: "@Username ..." + + - type: textarea + id: reproduction + validations: + required: true + attributes: + label: Reproduction + description: | + Please provide a code sample that reproduces the problem you ran into. 
It can be a Colab link or just a code snippet. + Please provide the simplest reproducer possible so that we can quickly fix the issue. When you paste + the error message, please include the full traceback. + + placeholder: | + Reproducer: + + - type: textarea + id: expected-behavior + validations: + required: true + attributes: + label: Expected behavior + description: "A clear and concise description of what you would expect to happen." diff --git a/peft/.github/ISSUE_TEMPLATE/feature-request.yml b/peft/.github/ISSUE_TEMPLATE/feature-request.yml new file mode 100644 index 0000000000000000000000000000000000000000..5e0b73e1f3afa9a865220346bc0ff2586c542ec8 --- /dev/null +++ b/peft/.github/ISSUE_TEMPLATE/feature-request.yml @@ -0,0 +1,21 @@ +name: "\U0001F680 Feature request" +description: Submit a proposal/request for a new feature +labels: [ "feature" ] +body: + - type: textarea + id: feature-request + validations: + required: true + attributes: + label: Feature request + description: | + A clear and concise description of the feature proposal. Please provide a link to the paper and code if they exist. + + - type: textarea + id: contribution + validations: + required: true + attributes: + label: Your contribution + description: | + Is there any way that you could help, e.g. by submitting a PR? 
diff --git a/peft/.github/workflows/build_docker_images.yml b/peft/.github/workflows/build_docker_images.yml new file mode 100644 index 0000000000000000000000000000000000000000..98f5ee7dc9f31ce34bb5425f5d6748ae3c1a6d54 --- /dev/null +++ b/peft/.github/workflows/build_docker_images.yml @@ -0,0 +1,150 @@ +name: Build Docker images (scheduled) + +on: + workflow_dispatch: + workflow_call: + schedule: + - cron: "0 1 * * *" + +concurrency: + group: docker-image-builds + cancel-in-progress: false + +permissions: {} + +env: + CI_SLACK_CHANNEL: ${{ secrets.CI_DOCKER_CHANNEL }} + +jobs: + latest-cpu: + name: "Latest Peft CPU [dev]" + runs-on: + group: aws-general-8-plus + steps: + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 + - name: Check out code + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + persist-credentials: false + - name: Login to DockerHub + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + + - name: Build and Push CPU + uses: docker/build-push-action@14487ce63c7a62a4a324b0bfb37086795e31c6c1 # v6.16.0 + with: + context: ./docker/peft-cpu + push: true + tags: huggingface/peft-cpu + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24 + with: + slack_channel: ${{ env.CI_SLACK_CHANNEL }} + title: 🤗 Results of the PEFT-CPU docker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + latest-cuda: + name: "Latest Peft GPU [dev]" + runs-on: + group: aws-general-8-plus + steps: + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 + - name: Check out code + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # 
v5.0.0 + with: + persist-credentials: false + - name: Login to DockerHub + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + + - name: Build and Push GPU + uses: docker/build-push-action@14487ce63c7a62a4a324b0bfb37086795e31c6c1 # v6.16.0 + with: + context: ./docker/peft-gpu + push: true + tags: huggingface/peft-gpu + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24 + with: + slack_channel: ${{ env.CI_SLACK_CHANNEL }} + title: 🤗 Results of the PEFT-GPU docker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + latest-cuda-bnb-source: + name: "Latest Peft GPU + bnb source [dev]" + runs-on: + group: aws-general-8-plus + steps: + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 + - name: Check out code + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + persist-credentials: false + - name: Login to DockerHub + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + + - name: Build and Push GPU + uses: docker/build-push-action@14487ce63c7a62a4a324b0bfb37086795e31c6c1 # v6.16.0 + with: + context: ./docker/peft-gpu-bnb-source + push: true + tags: huggingface/peft-gpu-bnb-source + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24 + with: + slack_channel: ${{ env.CI_SLACK_CHANNEL }} + title: 🤗 Results of the PEFT-GPU (bnb source / HF latest) docker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + 
latest-cuda-bnb-source-latest: + name: "Latest Peft GPU + bnb source [accelerate / peft / transformers latest]" + runs-on: + group: aws-general-8-plus + steps: + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 + - name: Check out code + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + persist-credentials: false + - name: Login to DockerHub + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + + - name: Build and Push GPU + uses: docker/build-push-action@14487ce63c7a62a4a324b0bfb37086795e31c6c1 # v6.16.0 + with: + context: ./docker/peft-gpu-bnb-latest + push: true + tags: huggingface/peft-gpu-bnb-latest + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24 + with: + slack_channel: ${{ env.CI_SLACK_CHANNEL }} + title: 🤗 Results of the PEFT-GPU (bnb source / HF source) docker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + diff --git a/peft/.github/workflows/build_documentation.yml b/peft/.github/workflows/build_documentation.yml new file mode 100644 index 0000000000000000000000000000000000000000..650febf8568a90f8fa5a826b514a872d78606b64 --- /dev/null +++ b/peft/.github/workflows/build_documentation.yml @@ -0,0 +1,22 @@ +name: Build documentation + +on: + push: + branches: + - main + - doc-builder* + - v*-release + +permissions: {} + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@ba4b74d11c46d884a4cf6497687c090f55f027d9 # main from 2025-09-05 + with: + commit_sha: ${{ github.sha }} + package: peft + notebook_folder: peft_docs + custom_container: huggingface/transformers-doc-builder + secrets: + token: ${{ secrets.HUGGINGFACE_PUSH }} + 
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} diff --git a/peft/.github/workflows/build_pr_documentation.yml b/peft/.github/workflows/build_pr_documentation.yml new file mode 100644 index 0000000000000000000000000000000000000000..c406681db25ef3321d2b1de3604c4d58d8f9818a --- /dev/null +++ b/peft/.github/workflows/build_pr_documentation.yml @@ -0,0 +1,19 @@ +name: Build PR Documentation + +on: + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: {} + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@ba4b74d11c46d884a4cf6497687c090f55f027d9 # main from 2025-09-05 + with: + commit_sha: ${{ github.event.pull_request.head.sha }} + pr_number: ${{ github.event.number }} + package: peft + custom_container: huggingface/transformers-doc-builder diff --git a/peft/.github/workflows/deploy_method_comparison_app.yml b/peft/.github/workflows/deploy_method_comparison_app.yml new file mode 100644 index 0000000000000000000000000000000000000000..e1f8b3dd1dc4dc0e5cf424603382900e63f8e6cc --- /dev/null +++ b/peft/.github/workflows/deploy_method_comparison_app.yml @@ -0,0 +1,41 @@ +name: Deploy "method_comparison" Gradio to Spaces + +on: + push: + branches: [ main ] + paths: + - "method_comparison/**" + workflow_dispatch: + +permissions: {} + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 # full history needed for subtree + persist-credentials: false + + - name: Authenticate via ~/.netrc + env: + HF_TOKEN: ${{ secrets.PEFT_INTERNAL_REPO_READ_WRITE }} + run: | + # netrc needs BOTH login and password entries + printf "machine huggingface.co\nlogin hf\npassword ${HF_TOKEN}\n" >> ~/.netrc + chmod 600 ~/.netrc + + - name: Deploy method_comparison app to HF Spaces + run: | + cd method_comparison + git init + # Spaces expect 
requirements.txt + mv requirements-app.txt requirements.txt + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git remote add gradio-app https://huggingface.co/spaces/peft-internal-testing/PEFT-method-comparison + git add . + git commit -m "🚀 Deploy method comparison app from GH action" + git push -f gradio-app HEAD:main diff --git a/peft/.github/workflows/integrations_tests.yml b/peft/.github/workflows/integrations_tests.yml new file mode 100644 index 0000000000000000000000000000000000000000..a1f5dd0e41a12d2a67ab327dd4c590961c4b774c --- /dev/null +++ b/peft/.github/workflows/integrations_tests.yml @@ -0,0 +1,86 @@ +name: integration tests + +on: + workflow_dispatch: + inputs: + branch: + description: 'Branch to test on' + required: true + +permissions: {} + +jobs: + run_transformers_integration_tests: + strategy: + fail-fast: false + matrix: + transformers-version: ['main', 'latest'] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + ref: ${{ github.event.inputs.branch }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + persist-credentials: false + - name: Set up Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + with: + python-version: "3.10" + cache: "pip" + cache-dependency-path: "setup.py" + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${CI_BRANCH}" + echo "env.CI_SHA = ${CI_SHA}" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install .[test] + if [ "${{ matrix.transformers-version }}" == "main" ]; then + pip install -U git+https://github.com/huggingface/transformers.git + else + echo "Nothing to do as transformers latest already installed" + fi + + - name: Test transformers integration + run: | + cd .. 
&& git clone https://github.com/huggingface/transformers.git && cd transformers/ && git rev-parse HEAD + RUN_SLOW=1 pytest tests/peft_integration/test_peft_integration.py + run_diffusers_integration_tests: + strategy: + fail-fast: false + matrix: + # For now diffusers integration is not on PyPI + diffusers-version: ['main'] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + ref: ${{ github.event.inputs.branch }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + persist-credentials: false + - name: Set up Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + with: + python-version: "3.10" + cache: "pip" + cache-dependency-path: "setup.py" + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${CI_BRANCH}" + echo "env.CI_SHA = ${CI_SHA}" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install .[test] + + if [ "${{ matrix.diffusers-version }}" == "main" ]; then + pip install -U git+https://github.com/huggingface/diffusers.git + else + echo "Nothing to do as diffusers latest already installed" + fi + + - name: Test diffusers integration + run: | + cd .. 
&& git clone https://github.com/huggingface/diffusers.git && cd diffusers/ && git rev-parse HEAD + pytest tests/lora/test_lora_layers_peft.py diff --git a/peft/.github/workflows/nightly-bnb.yml b/peft/.github/workflows/nightly-bnb.yml new file mode 100644 index 0000000000000000000000000000000000000000..c2944f330491734a02480891edfe58b510353429 --- /dev/null +++ b/peft/.github/workflows/nightly-bnb.yml @@ -0,0 +1,249 @@ +name: BNB from source self-hosted runner with slow tests (scheduled) + +on: + workflow_dispatch: + schedule: + - cron: "0 2 * * *" + +env: + RUN_SLOW: "yes" + IS_GITHUB_CI: "1" + # To be able to run tests on CUDA 12.2 + NVIDIA_DISABLE_REQUIRE: "1" + SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + +permissions: {} + +jobs: + run_all_tests_single_gpu: + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + docker-image-name: ["huggingface/peft-gpu-bnb-source:latest", "huggingface/peft-gpu-bnb-latest:latest"] + runs-on: + group: aws-g6-4xlarge-plus + env: + CUDA_VISIBLE_DEVICES: "0" + TEST_TYPE: "single_gpu_${{ matrix.docker-image-name }}" + container: + image: ${{ matrix.docker-image-name }} + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + persist-credentials: false + - name: Pip install + run: | + source activate peft + pip install -e . 
--no-deps + pip install pytest-reportlog pytest-cov parameterized datasets scipy einops + pip install "pytest>=7.2.0,<8.0.0" # see: https://github.com/huggingface/transformers/blob/ce4fff0be7f6464d713f7ac3e0bbaafbc6959ae5/setup.py#L148C6-L148C26 + mkdir transformers-clone && git clone https://github.com/huggingface/transformers.git transformers-clone # rename to transformers clone to avoid modules conflict + if [ "${{ matrix.docker-image-name }}" == "huggingface/peft-gpu-bnb-latest:latest" ]; then + cd transformers-clone + transformers_version=$(pip show transformers | grep '^Version:' | cut -d ' ' -f2 | sed 's/\.dev0//') + echo "Checking out tag for Transformers version: v$transformers_version" + git fetch --tags + git checkout tags/v$transformers_version + cd .. + fi + + - name: Test bnb import + id: import + if: always() + run: | + source activate peft + python3 -m bitsandbytes + python3 -c "import bitsandbytes as bnb" + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24 + with: + slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }} + title: 🤗 Results of bitsandbytes import + status: ${{ steps.import.outcome }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + - name: Run examples on single GPU + id: examples_tests + if: always() + run: | + source activate peft + make tests_examples_single_gpu_bnb + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24 + with: + slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }} + title: 🤗 Results of bitsandbytes examples tests - single GPU + status: ${{ steps.examples_tests.outcome }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + - name: Run core tests on single GPU + id: core_tests + if: always() + run: | + source activate peft + make tests_core_single_gpu_bnb + + - name: Post 
to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24 + with: + slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }} + title: 🤗 Results of bitsandbytes core tests - single GPU + status: ${{ steps.core_tests.outcome }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + # TODO: this is a test to see if BNB multi-backend single-GPU tests succeed w/o regression tests + # - name: Run BNB regression tests on single GPU + # id: regression_tests + # if: always() + # run: | + # source activate peft + # make tests_gpu_bnb_regression + + # - name: Post to Slack + # if: always() + # uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24 + # with: + # slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }} + # title: 🤗 Results of bitsandbytes regression tests - single GPU + # status: ${{ steps.regression_tests.outcome }} + # slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + - name: Run transformers tests on single GPU + id: transformers_tests + if: always() + run: | + source activate peft + make transformers_tests + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24 + with: + slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }} + title: 🤗 Results of bitsandbytes transformers tests - single GPU + status: ${{ steps.transformers_tests.outcome }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + - name: Generate Report + if: always() + run: | + pip install slack_sdk tabulate + python scripts/log_reports.py --slack_channel_name bnb-daily-ci-collab >> $GITHUB_STEP_SUMMARY + + run_all_tests_multi_gpu: + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + docker-image-name: ["huggingface/peft-gpu-bnb-source:latest", "huggingface/peft-gpu-bnb-latest:latest"] + runs-on: + 
group: aws-g6-12xlarge-plus + env: + CUDA_VISIBLE_DEVICES: "0,1" + TEST_TYPE: "multi_gpu_${{ matrix.docker-image-name }}" + container: + image: ${{ matrix.docker-image-name }} + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + persist-credentials: false + - name: Pip install + run: | + source activate peft + pip install -e . --no-deps + pip install pytest-reportlog pytest-cov parameterized datasets scipy einops + pip install "pytest>=7.2.0,<8.0.0" # see: https://github.com/huggingface/transformers/blob/ce4fff0be7f6464d713f7ac3e0bbaafbc6959ae5/setup.py#L148C6-L148C26 + mkdir transformers-clone && git clone https://github.com/huggingface/transformers.git transformers-clone + if [ "${{ matrix.docker-image-name }}" == "huggingface/peft-gpu-bnb-latest:latest" ]; then + cd transformers-clone + transformers_version=$(pip show transformers | grep '^Version:' | cut -d ' ' -f2 | sed 's/\.dev0//') + echo "Checking out tag for Transformers version: v$transformers_version" + git fetch --tags + git checkout tags/v$transformers_version + cd .. 
+ fi + + - name: Test bnb import + id: import + if: always() + run: | + source activate peft + python3 -m bitsandbytes + python3 -c "import bitsandbytes as bnb" + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24 + with: + slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }} + title: 🤗 Results of bitsandbytes import + status: ${{ steps.import.outcome }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + - name: Run examples on multi GPU + id: examples_tests + if: always() + run: | + source activate peft + make tests_examples_multi_gpu_bnb + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24 + with: + slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }} + title: 🤗 Results of bitsandbytes examples tests - multi GPU + status: ${{ steps.examples_tests.outcome }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + - name: Run core tests on multi GPU + id: core_tests + if: always() + run: | + source activate peft + make tests_core_multi_gpu_bnb + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24 + with: + slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }} + title: 🤗 Results of bitsandbytes core tests - multi GPU + status: ${{ steps.core_tests.outcome }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + - name: Run transformers tests on multi GPU + id: transformers_tests + if: always() + run: | + source activate peft + make transformers_tests + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24 + with: + slack_channel: ${{ secrets.BNB_SLACK_CHANNEL_ID }} + title: 🤗 Results of bitsandbytes 
transformers tests - multi GPU + status: ${{ steps.transformers_tests.outcome }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + - name: Generate Report + if: always() + run: | + pip install slack_sdk tabulate + python scripts/log_reports.py --slack_channel_name bnb-daily-ci-collab >> $GITHUB_STEP_SUMMARY diff --git a/peft/.github/workflows/nightly.yml b/peft/.github/workflows/nightly.yml new file mode 100644 index 0000000000000000000000000000000000000000..dabfea90359afafa128767e10d1a8193ea572058 --- /dev/null +++ b/peft/.github/workflows/nightly.yml @@ -0,0 +1,115 @@ +name: Self-hosted runner with slow tests (scheduled) + +on: + workflow_dispatch: + schedule: + - cron: "0 2 * * *" + +env: + RUN_SLOW: "yes" + IS_GITHUB_CI: "1" + # To be able to run tests on CUDA 12.2 + NVIDIA_DISABLE_REQUIRE: "1" + SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + +permissions: {} + +jobs: + run_all_tests_single_gpu: + strategy: + fail-fast: false + runs-on: + group: aws-g6-4xlarge-plus + env: + CUDA_VISIBLE_DEVICES: "0" + TEST_TYPE: "single_gpu" + container: + image: huggingface/peft-gpu:latest + options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + persist-credentials: false + - name: Pip install + run: | + source activate peft + pip install -e . 
--no-deps + pip install pytest-reportlog + + - name: Run common tests on single GPU + run: | + source activate peft + make tests_common_gpu + + - name: Run examples on single GPU + run: | + source activate peft + make tests_examples_single_gpu + + - name: Run core tests on single GPU + run: | + source activate peft + make tests_core_single_gpu + + - name: Run regression tests on single GPU + run: | + source activate peft + make tests_regression + + - name: Generate Report + if: always() + run: | + pip install slack_sdk tabulate + python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY + + run_all_tests_multi_gpu: + strategy: + fail-fast: false + runs-on: + group: aws-g6-12xlarge-plus + env: + CUDA_VISIBLE_DEVICES: "0,1" + TEST_TYPE: "multi_gpu" + container: + image: huggingface/peft-gpu:latest + options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + persist-credentials: false + - name: Pip install + run: | + source activate peft + pip install -e . 
--no-deps + pip install pytest-reportlog + + - name: Run core GPU tests on multi-gpu + run: | + source activate peft + + - name: Run common tests on multi GPU + run: | + source activate peft + make tests_common_gpu + + - name: Run examples on multi GPU + run: | + source activate peft + make tests_examples_multi_gpu + + - name: Run core tests on multi GPU + run: | + source activate peft + make tests_core_multi_gpu + + - name: Generate Report + if: always() + run: | + pip install slack_sdk tabulate + python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY diff --git a/peft/.github/workflows/stale.yml b/peft/.github/workflows/stale.yml new file mode 100644 index 0000000000000000000000000000000000000000..8c8398b2e06f929a444ed3b48ea076f99927675d --- /dev/null +++ b/peft/.github/workflows/stale.yml @@ -0,0 +1,34 @@ +name: Stale Bot + +on: + schedule: + - cron: "0 15 * * *" + +permissions: {} + +jobs: + close_stale_issues: + name: Close Stale Issues + if: github.repository == 'huggingface/peft' + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + persist-credentials: false + + - name: Setup Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + with: + python-version: 3.11 + + - name: Install requirements + run: | + pip install PyGithub + - name: Close stale issues + run: | + python scripts/stale.py diff --git a/peft/.github/workflows/test-docker-build.yml b/peft/.github/workflows/test-docker-build.yml new file mode 100644 index 0000000000000000000000000000000000000000..5b3e4f58f8c95bd454207c7ca8df7693a34a4b0c --- /dev/null +++ b/peft/.github/workflows/test-docker-build.yml @@ -0,0 +1,66 @@ +name: Test Docker images (on PR) + +on: + pull_request: + paths: + # Run only when DockerFile files are modified + - "docker/*/Dockerfile" + +permissions: {} + +jobs: + 
get_changed_files: + name: "Build all modified docker images" + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - name: Check out code + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + persist-credentials: false + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c #v42 + with: + files: docker/*/Dockerfile + json: "true" + - name: Run step if only the files listed above change + if: steps.changed-files.outputs.any_changed == 'true' + id: set-matrix + env: + ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} + run: | + echo "matrix=${ALL_CHANGED_FILES}" >> $GITHUB_OUTPUT + build_modified_files: + needs: get_changed_files + name: Build Docker images on modified files + runs-on: ubuntu-latest + if: ${{ needs.get_changed_files.outputs.matrix != '[]' }} + strategy: + fail-fast: false + matrix: + docker-file: ${{ fromJson(needs.get_changed_files.outputs.matrix) }} + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 + - name: Check out code + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + persist-credentials: false + - name: Build Docker image + uses: docker/build-push-action@14487ce63c7a62a4a324b0bfb37086795e31c6c1 # v6.16.0 + with: + file: ${{ matrix.docker-file }} + context: . 
+ push: False diff --git a/peft/.github/workflows/tests-main.yml b/peft/.github/workflows/tests-main.yml new file mode 100644 index 0000000000000000000000000000000000000000..5f0806a2257959f15d484167333f6f9991a533f8 --- /dev/null +++ b/peft/.github/workflows/tests-main.yml @@ -0,0 +1,43 @@ +name: tests on transformers main + +on: + push: + branches: [main] + paths-ignore: + - 'docs/**' + +permissions: {} + +jobs: + tests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + persist-credentials: false + - name: Set up Python 3.11 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + with: + python-version: 3.11 + cache: "pip" + cache-dependency-path: "setup.py" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + # cpu version of pytorch + pip install -U git+https://github.com/huggingface/transformers.git + pip install -e .[test] + - name: Test with pytest + env: + TRANSFORMERS_IS_CI: 1 + HF_TOKEN: ${{ secrets.HF_TOKEN }} + run: | + make test + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@3f88d63d3761558a32e8e46fc2a8536e04bb2aea # main from Feb 2025-02-24 + with: + slack_channel: ${{ secrets.SLACK_CHANNEL_ID }} + title: 🤗 Results of transformers main tests + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} diff --git a/peft/.github/workflows/tests.yml b/peft/.github/workflows/tests.yml new file mode 100644 index 0000000000000000000000000000000000000000..655189c7c38498d5f03654f01cb732107bc558bc --- /dev/null +++ b/peft/.github/workflows/tests.yml @@ -0,0 +1,133 @@ +name: tests + +on: + push: + branches: [main] + paths-ignore: + - 'docs/**' + pull_request: + paths-ignore: + - 'docs/**' + +env: + HF_HOME: .cache/huggingface + +permissions: {} + +jobs: + check_code_quality: + runs-on: ubuntu-latest + steps: + - uses: 
actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + persist-credentials: false + - name: Set up Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + with: + python-version: "3.11" + cache: "pip" + cache-dependency-path: "setup.py" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install .[dev] + - name: Check quality + run: | + make quality + + tests: + needs: check_code_quality + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.11", "3.12", "3.13"] + os: ["ubuntu-latest", "macos-13", "windows-latest"] + exclude: + - os: macos-13 + python-version: "3.13" + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + persist-credentials: false + - name: Model cache + uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 + with: + # Avoid caching HF_HOME/modules and Python cache files to prevent interoperability + # issues and potential cache poisoning. We also avoid lock files to prevent runs + # avoiding re-download because they see a lock file. 
+ path: | + ${{ env.HF_HOME }}/hub/** + !${{ env.HF_HOME }}/**/*.pyc + key: model-cache-${{ github.run_id }} + restore-keys: model-cache- + enableCrossOsArchive: true + - name: Dump cache content + # TODO: remove this step after 2025-02-15 + if: matrix.os != 'windows-latest' + run: | + SHASUM=sha256sum + [ -f "$(which shasum)" ] && SHASUM=shasum + find "${{ env.HF_HOME }}/hub" -type f -exec "$SHASUM" {} \; > cache_content_initial || true + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + cache-dependency-path: "setup.py" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools + # cpu version of pytorch + pip install -e .[test] + - name: Test with pytest + # MacOS tests are currently too flaky and will fail almost each time. Thus, continue (green checkmark) even if + # they fail, but add a notice so that the failure is not completely silent + continue-on-error: ${{ matrix.os == 'macos-13' }} + shell: bash + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + TRANSFORMERS_IS_CI: 1 + run: | + set +e + make test + status=$? + # Post a notice only if this is macOS AND tests failed + if [ "$status" -ne 0 ] && [ "${{ matrix.os }}" = "macos-13" ]; then + { + echo "## ⚠️ macOS tests failed" + echo "" + echo "- OS: ${{ matrix.os }}" + echo "- Python: ${{ matrix.python-version }}" + echo "" + echo "Check the logs from this step for details." + } >> "$GITHUB_STEP_SUMMARY" + fi + # Return the real status. On macOS this won't fail the job because of continue-on-error. + exit $status + - name: Dump cache content and diff + # This is just debug info so that we can monitor if the model cache diverges substantially + # over time and what the diverging model is. 
+ # TODO: remove after 2025-02-15 + if: matrix.os != 'windows-latest' + run: | + SHASUM=sha256sum + [ -f "$(which shasum)" ] && SHASUM=shasum + find "${{ env.HF_HOME }}/hub" -type f -exec "$SHASUM" {} \; > cache_content_after || true + diff -udp cache_content_initial cache_content_after || true + - name: Delete old model cache entries + run: | + # make sure that cache cleaning doesn't break the pipeline + python scripts/ci_clean_cache.py -d || true + - name: Update model cache + uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 + # Only let one runner (preferably the one that covers most tests) update the model cache + # after *every* run. This way we make sure that our cache is never outdated and we don't + # have to keep track of hashes. + if: always() && matrix.os == 'ubuntu-latest' && matrix.python-version == '3.10' + with: + path: | + ${{ env.HF_HOME }}/hub/** + !${{ env.HF_HOME }}/**/*.pyc + key: model-cache-${{ github.run_id }} diff --git a/peft/.github/workflows/torch_compile_tests.yml b/peft/.github/workflows/torch_compile_tests.yml new file mode 100644 index 0000000000000000000000000000000000000000..5f5b0565889dd38c7e4a3232b0062ee551947e3b --- /dev/null +++ b/peft/.github/workflows/torch_compile_tests.yml @@ -0,0 +1,56 @@ +name: torch compile tests + +on: + workflow_dispatch: + inputs: + branch: + description: 'Branch to test on' + required: true + pytorch_nightly: + description: 'Whether to use PyTorch nightly (true/false)' + required: false + default: false + +env: + RUN_SLOW: "yes" + IS_GITHUB_CI: "1" + # To be able to run tests on CUDA 12.2 + NVIDIA_DISABLE_REQUIRE: "1" + +permissions: {} + +jobs: + run_tests_with_compile: + runs-on: + group: aws-g6-4xlarge-plus + env: + PEFT_DEBUG_WITH_TORCH_COMPILE: 1 + CUDA_VISIBLE_DEVICES: "0" + TEST_TYPE: "single_gpu_huggingface/peft-gpu-bnb-latest:latest" + USE_PYTORCH_NIGHTLY: "${{ github.event.inputs.pytorch_nightly }}" + container: + image: "huggingface/peft-gpu-bnb-latest:latest" + 
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + ref: ${{ github.event.inputs.branch }} + repository: ${{ github.event.pull_request.head.repo.full_name }} + persist-credentials: false + - name: Pip install + run: | + source activate peft + pip install -e . --no-deps + pip install pytest-cov pytest-reportlog parameterized datasets scipy einops + pip install "pytest>=7.2.0,<8.0.0" # see: https://github.com/huggingface/transformers/blob/ce4fff0be7f6464d713f7ac3e0bbaafbc6959ae5/setup.py#L148C6-L148C26 + if [ "${USE_PYTORCH_NIGHTLY}" = "true" ]; then + python -m pip install --upgrade --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu + fi + - name: Test compile with pytest + run: | + source activate peft + echo "PEFT_DEBUG_WITH_TORCH_COMPILE=$PEFT_DEBUG_WITH_TORCH_COMPILE" + make tests_torch_compile diff --git a/peft/.github/workflows/trufflehog.yml b/peft/.github/workflows/trufflehog.yml new file mode 100644 index 0000000000000000000000000000000000000000..099b77662be04aa0f83d16d649c6199c230aed87 --- /dev/null +++ b/peft/.github/workflows/trufflehog.yml @@ -0,0 +1,18 @@ +on: + push: + +name: Secret Leaks + +permissions: {} + +jobs: + trufflehog: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 + persist-credentials: false + - name: Secret Scanning + uses: trufflesecurity/trufflehog@0f58ae7c5036094a1e3e750d18772af92821b503 # v3.90.5 diff --git a/peft/.github/workflows/upload_pr_documentation.yml b/peft/.github/workflows/upload_pr_documentation.yml new file mode 100644 index 0000000000000000000000000000000000000000..0d9eadf82b7fd2b4ecbfe37a20650acfd7a7aca9 --- /dev/null +++ b/peft/.github/workflows/upload_pr_documentation.yml @@ -0,0 +1,18 @@ +name: Upload PR Documentation + 
+on: + workflow_run: + workflows: ["Build PR Documentation"] + types: + - completed + +permissions: {} + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@ba4b74d11c46d884a4cf6497687c090f55f027d9 # main from 2025-09-05 + with: + package_name: peft + secrets: + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} + comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} diff --git a/peft/.github/workflows/zizmor.yaml b/peft/.github/workflows/zizmor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5d7d8b4a1aa00f8603d06a16c7fcc0fcb0ff19d --- /dev/null +++ b/peft/.github/workflows/zizmor.yaml @@ -0,0 +1,28 @@ +name: CI security linting + +on: + push: + branches: ["main"] + pull_request: + branches: ["*"] + paths: + - '.github/**' + +permissions: {} + +jobs: + zizmor: + name: zizmor latest via Cargo + runs-on: ubuntu-latest + permissions: + contents: read + security-events: write + steps: + - name: Checkout repository + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + persist-credentials: false + - name: Install zizmor + run: cargo install --locked zizmor + - name: Run zizmor + run: zizmor .github/workflows diff --git a/peft/.github/zizmor.yml b/peft/.github/zizmor.yml new file mode 100644 index 0000000000000000000000000000000000000000..c3b44d766eaa8219c438c748cadca679a182c3ab --- /dev/null +++ b/peft/.github/zizmor.yml @@ -0,0 +1,24 @@ +rules: + dangerous-triggers: + ignore: + # this workflow is only triggered after maintainer approval + - upload_pr_documentation.yml:3:1 + cache-poisoning: + ignore: + # the docker buildx binary is cached and zizmor warns about a cache poisoning attack. + # OTOH this cache would make us more resilient against an intrusion on docker-buildx' side. + # There is no obvious benefit so we leave it as it is. 
+ - build_docker_images.yml:37:9 + - build_docker_images.yml:70:9 + - build_docker_images.yml:103:9 + - build_docker_images.yml:136:9 + - build_docker_images.yml:169:9 + unpinned-images: + ignore: + # We want to test these images with the latest version and we're not using them + # to deploy anything so we deem it safe to use those, even if they are unpinned. + - nightly-bnb.yml:30:7 + - nightly-bnb.yml:155:7 + - nightly.yml:27:7 + - nightly.yml:77:7 + - torch_compile_tests.yml:32:7 diff --git a/peft/.gitignore b/peft/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..4e3e2ca5fc7e93d32d06e8e1a696925c3eaac60d --- /dev/null +++ b/peft/.gitignore @@ -0,0 +1,145 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# VSCode +.vscode + +# IntelliJ +.idea + +# Mac .DS_Store +.DS_Store + +# More test things +wandb + +# method_comparison logs +method_comparison/MetaMathQA/cancelled_results/ +method_comparison/MetaMathQA/temporary_results/ diff --git a/peft/.pre-commit-config.yaml b/peft/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a430a35609297be7cdb65ff250dcbd4697fbc9be --- /dev/null +++ b/peft/.pre-commit-config.yaml @@ -0,0 +1,13 @@ +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.12.8 + hooks: + - id: ruff + args: + - --fix + - id: ruff-format + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: check-merge-conflict + - id: check-yaml diff --git a/peft/LICENSE b/peft/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/peft/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/peft/Makefile b/peft/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..70ba4b7f8ed560394c341a7e19b1c4238cee6505 --- /dev/null +++ b/peft/Makefile @@ -0,0 +1,66 @@ +.PHONY: quality style test docs + +check_dirs := src tests examples docs scripts docker + +# Check that source code meets quality standards + +# this target runs checks on all files +quality: + ruff check $(check_dirs) + ruff format --check $(check_dirs) + doc-builder style src/peft tests docs/source --max_len 119 --check_only + +# Format source code automatically and check if there are any problems left that need manual fixing +style: + ruff check --fix $(check_dirs) + ruff format $(check_dirs) + doc-builder style src/peft tests docs/source --max_len 119 + +test: + python -m pytest -n 3 tests/ $(if $(IS_GITHUB_CI),--report-log "ci_tests.log",) + +tests_examples_multi_gpu: + python -m pytest -m multi_gpu_tests tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "multi_gpu_examples.log",) + +tests_examples_single_gpu: + python -m pytest -m single_gpu_tests tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "single_gpu_examples.log",) + +tests_core_multi_gpu: + python -m pytest -m multi_gpu_tests tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_multi_gpu.log",) + +tests_core_single_gpu: + python -m pytest -m single_gpu_tests tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_single_gpu.log",) + +# exclude gemma tests, as generation fails with torch.compile, these failures +# trigger side effects that make other tests fail with 'RuntimeError: Offset +# increment outside graph capture encountered unexpectedly.' 
+# TODO re-enable gemma once/if it is fixed +tests_common_gpu: + python -m pytest tests/test_decoder_models.py -k "not gemma" $(if $(IS_GITHUB_CI),--report-log "common_decoder.log",) + python -m pytest tests/test_encoder_decoder_models.py $(if $(IS_GITHUB_CI),--report-log "common_encoder_decoder.log",) + python -m pytest tests/test_gptqmodel.py $(if $(IS_GITHUB_CI),--report-log "gptqmodel_gpu.log",) + +tests_examples_multi_gpu_bnb: + python -m pytest -m "multi_gpu_tests and bitsandbytes" tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "multi_gpu_examples.log",) + +tests_examples_single_gpu_bnb: + python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_gpu_examples.py $(if $(IS_GITHUB_CI),--report-log "single_gpu_examples.log",) + +tests_core_multi_gpu_bnb: + python -m pytest -m "multi_gpu_tests and bitsandbytes" tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_multi_gpu.log",) + +tests_core_single_gpu_bnb: + python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_common_gpu.py $(if $(IS_GITHUB_CI),--report-log "core_single_gpu.log",) + +tests_gpu_bnb_regression: + python -m pytest tests/bnb/test_bnb_regression.py $(if $(IS_GITHUB_CI),--report-log "bnb_regression_gpu.log",) + +# For testing transformers tests for bnb runners +transformers_tests: + RUN_SLOW=1 python -m pytest transformers-clone/tests/quantization/bnb $(if $(IS_GITHUB_CI),--report-log "transformers_tests.log",) + +tests_regression: + python -m pytest -s --regression tests/regression/ $(if $(IS_GITHUB_CI),--report-log "regression_tests.log",) + +tests_torch_compile: + python -m pytest tests/test_torch_compile.py $(if $(IS_GITHUB_CI),--report-log "compile_tests.log",) diff --git a/peft/README.md b/peft/README.md new file mode 100644 index 0000000000000000000000000000000000000000..77a61c68a30323357cd40bec0eff135597c9815c --- /dev/null +++ b/peft/README.md @@ -0,0 +1,189 @@ + + +

🤗 PEFT

+

+

State-of-the-art Parameter-Efficient Fine-Tuning (PEFT) methods

+

+ +Fine-tuning large pretrained models is often prohibitively costly due to their scale. Parameter-Efficient Fine-Tuning (PEFT) methods enable efficient adaptation of large pretrained models to various downstream applications by only fine-tuning a small number of (extra) model parameters instead of all the model's parameters. This significantly decreases the computational and storage costs. Recent state-of-the-art PEFT techniques achieve performance comparable to fully fine-tuned models. + +PEFT is integrated with Transformers for easy model training and inference, Diffusers for conveniently managing different adapters, and Accelerate for distributed training and inference for really big models. + +> [!TIP] +> Visit the [PEFT](https://huggingface.co/PEFT) organization to read about the PEFT methods implemented in the library and to see notebooks demonstrating how to apply these methods to a variety of downstream tasks. Click the "Watch repos" button on the organization page to be notified of newly implemented methods and notebooks! + +Check the PEFT Adapters API Reference section for a list of supported PEFT methods, and read the [Adapters](https://huggingface.co/docs/peft/en/conceptual_guides/adapter), [Soft prompts](https://huggingface.co/docs/peft/en/conceptual_guides/prompting), and [IA3](https://huggingface.co/docs/peft/en/conceptual_guides/ia3) conceptual guides to learn more about how these methods work. + +## Quickstart + +Install PEFT from pip: + +```bash +pip install peft +``` + +Prepare a model for training with a PEFT method such as LoRA by wrapping the base model and PEFT configuration with `get_peft_model`. For the Qwen/Qwen2.5-3B-Instruct model used below, you're only training about 0.12% of the parameters! 
+ +```python +from transformers import AutoModelForCausalLM +from peft import LoraConfig, TaskType, get_peft_model + +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" +model_id = "Qwen/Qwen2.5-3B-Instruct" +model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device) +peft_config = LoraConfig( + r=16, + lora_alpha=32, + task_type=TaskType.CAUSAL_LM, + # target_modules=["q_proj", "v_proj", ...] # optionally indicate target modules +) +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +# prints: trainable params: 3,686,400 || all params: 3,089,625,088 || trainable%: 0.1193 + +# now perform training on your dataset, e.g. using transformers Trainer, then save the model +model.save_pretrained("qwen2.5-3b-lora") +``` + +To load a PEFT model for inference: + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer +from peft import PeftModel + +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" +model_id = "Qwen/Qwen2.5-3B-Instruct" +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device) +model = PeftModel.from_pretrained(model, "qwen2.5-3b-lora") + +inputs = tokenizer("Preheat the oven to 350 degrees and place the cookie dough", return_tensors="pt") +outputs = model.generate(**inputs.to(device), max_new_tokens=50) +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) + +# prints something like: Preheat the oven to 350 degrees and place the cookie dough in a baking dish [...] +``` + +## Why you should use PEFT + +There are many benefits of using PEFT but the main one is the huge savings in compute and storage, making PEFT applicable to many different use cases. 
+ +### High performance on consumer hardware + +Consider the memory requirements for training the following models on the [ought/raft/twitter_complaints](https://huggingface.co/datasets/ought/raft/viewer/twitter_complaints) dataset with an A100 80GB GPU with more than 64GB of CPU RAM. + +| Model | Full Finetuning | PEFT-LoRA PyTorch | PEFT-LoRA DeepSpeed with CPU Offloading | +| --------- | ---- | ---- | ---- | +| bigscience/T0_3B (3B params) | 47.14GB GPU / 2.96GB CPU | 14.4GB GPU / 2.96GB CPU | 9.8GB GPU / 17.8GB CPU | +| bigscience/mt0-xxl (12B params) | OOM GPU | 56GB GPU / 3GB CPU | 22GB GPU / 52GB CPU | +| bigscience/bloomz-7b1 (7B params) | OOM GPU | 32GB GPU / 3.8GB CPU | 18.1GB GPU / 35GB CPU | + +With LoRA you can fully finetune a 12B parameter model that would've otherwise run out of memory on the 80GB GPU, and comfortably fit and train a 3B parameter model. When you look at the 3B parameter model's performance, it is comparable to a fully finetuned model at a fraction of the GPU memory. + +| Submission Name | Accuracy | +| --------- | ---- | +| Human baseline (crowdsourced) | 0.897 | +| Flan-T5 | 0.892 | +| lora-t0-3b | 0.863 | + +> [!TIP] +> The bigscience/T0_3B model performance isn't optimized in the table above. You can squeeze even more performance out of it by playing around with the input instruction templates, LoRA hyperparameters, and other training related hyperparameters. The final checkpoint size of this model is just 19MB compared to 11GB of the full bigscience/T0_3B model. Learn more about the advantages of finetuning with PEFT in this [blog post](https://www.philschmid.de/fine-tune-flan-t5-peft). + +### Quantization + +Quantization is another method for reducing the memory requirements of a model by representing the data in a lower precision. It can be combined with PEFT methods to make it even easier to train and load LLMs for inference. 
+ +* Learn how to finetune [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) with QLoRA and the [TRL](https://huggingface.co/docs/trl/index) library on a 16GB GPU in the [Finetune LLMs on your own consumer hardware using tools from PyTorch and Hugging Face ecosystem](https://pytorch.org/blog/finetune-llms/) blog post. +* Learn how to finetune an [openai/whisper-large-v2](https://huggingface.co/openai/whisper-large-v2) model for multilingual automatic speech recognition with LoRA and 8-bit quantization in this [notebook](https://colab.research.google.com/drive/1DOkD_5OUjFa0r5Ik3SgywJLJtEo2qLxO?usp=sharing) (see this [notebook](https://colab.research.google.com/drive/1vhF8yueFqha3Y3CpTHN6q9EVcII9EYzs?usp=sharing) instead for an example of streaming a dataset). + +### Save compute and storage + +PEFT can help you save storage by avoiding full finetuning of models on each downstream task or dataset. In many cases, you're only finetuning a very small fraction of a model's parameters and each checkpoint is only a few MBs in size (instead of GBs). These smaller PEFT adapters demonstrate performance comparable to a fully finetuned model. If you have many datasets, you can save a lot of storage with a PEFT model and not have to worry about catastrophic forgetting or overfitting the backbone or base model. + +## PEFT integrations + +PEFT is widely supported across the Hugging Face ecosystem because of the massive efficiency it brings to training and inference. + +### Diffusers + +The iterative diffusion process consumes a lot of memory which can make it difficult to train. PEFT can help reduce the memory requirements and reduce the storage size of the final model checkpoint. For example, consider the memory required for training a Stable Diffusion model with LoRA on an A100 80GB GPU with more than 64GB of CPU RAM. The final model checkpoint size is only 8.8MB! 
+ +| Model | Full Finetuning | PEFT-LoRA | PEFT-LoRA with Gradient Checkpointing | +| --------- | ---- | ---- | ---- | +| CompVis/stable-diffusion-v1-4 | 27.5GB GPU / 3.97GB CPU | 15.5GB GPU / 3.84GB CPU | 8.12GB GPU / 3.77GB CPU | + +> [!TIP] +> Take a look at the [examples/lora_dreambooth/train_dreambooth.py](examples/lora_dreambooth/train_dreambooth.py) training script to try training your own Stable Diffusion model with LoRA, and play around with the [smangrul/peft-lora-sd-dreambooth](https://huggingface.co/spaces/smangrul/peft-lora-sd-dreambooth) Space which is running on a T4 instance. Learn more about the PEFT integration in Diffusers in this [tutorial](https://huggingface.co/docs/peft/main/en/tutorial/peft_integrations#diffusers). + +### Transformers + +PEFT is directly integrated with [Transformers](https://huggingface.co/docs/transformers/main/en/peft). After loading a model, call `add_adapter` to add a new PEFT adapter to the model: + +```python +from peft import LoraConfig +model = ... # transformers model +peft_config = LoraConfig(...) +model.add_adapter(peft_config, adapter_name="lora_1") +``` + +To load a trained PEFT adapter, call `load_adapter`: + +```python +model = ... # transformers model +model.load_adapter("path/to/adapter", adapter_name="lora_1") +``` + +And to switch between different adapters, call `set_adapter`: + +```python +model.set_adapter("lora_2") +``` + +The Transformers integration doesn't include all the functionalities offered in PEFT, such as methods for merging the adapter into the base model. + +### Accelerate + +[Accelerate](https://huggingface.co/docs/accelerate/index) is a library for distributed training and inference on various training setups and hardware (GPUs, TPUs, Apple Silicon, etc.). PEFT models work with Accelerate out of the box, making it really convenient to train really large models or use them for inference on consumer hardware with limited resources. 
+ +### TRL + +PEFT can also be applied to training LLMs with RLHF components such as the ranker and policy. Get started by reading: + +* [Fine-tune a Mistral-7b model with Direct Preference Optimization](https://towardsdatascience.com/fine-tune-a-mistral-7b-model-with-direct-preference-optimization-708042745aac) with PEFT and the [TRL](https://huggingface.co/docs/trl/index) library to learn more about the Direct Preference Optimization (DPO) method and how to apply it to a LLM. +* [Fine-tuning 20B LLMs with RLHF on a 24GB consumer GPU](https://huggingface.co/blog/trl-peft) with PEFT and the [TRL](https://huggingface.co/docs/trl/index) library, and then try out the [gpt2-sentiment_peft.ipynb](https://github.com/huggingface/trl/blob/main/examples/notebooks/gpt2-sentiment.ipynb) notebook to optimize GPT2 to generate positive movie reviews. +* [StackLLaMA: A hands-on guide to train LLaMA with RLHF](https://huggingface.co/blog/stackllama) with PEFT, and then try out the [stack_llama/scripts](https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama/scripts) for supervised finetuning, reward modeling, and RL finetuning. + +## Model support + +Use this [Space](https://stevhliu-peft-methods.hf.space) or check out the [docs](https://huggingface.co/docs/peft/main/en/index) to find which models officially support a PEFT method out of the box. Even if you don't see a model listed below, you can manually configure the model config to enable PEFT for a model. Read the [New transformers architecture](https://huggingface.co/docs/peft/main/en/developer_guides/custom_models#new-transformers-architectures) guide to learn how. + +## Contribute + +If you would like to contribute to PEFT, please check out our [contribution guide](https://huggingface.co/docs/peft/developer_guides/contributing). + +## Citing 🤗 PEFT + +To use 🤗 PEFT in your publication, please cite it by using the following BibTeX entry. 
+ +```bibtex +@Misc{peft, + title = {{PEFT}: State-of-the-art Parameter-Efficient Fine-Tuning methods}, + author = {Sourab Mangrulkar and Sylvain Gugger and Lysandre Debut and Younes Belkada and Sayak Paul and Benjamin Bossan}, + howpublished = {\url{https://github.com/huggingface/peft}}, + year = {2022} +} +``` diff --git a/peft/docker/README.md b/peft/docker/README.md new file mode 100644 index 0000000000000000000000000000000000000000..193f11b75694e4272ed32434d6f0d0d7fba79c93 --- /dev/null +++ b/peft/docker/README.md @@ -0,0 +1,8 @@ +# PEFT Docker images + +Here we store all PEFT Docker images used in our testing infrastructure. We use Python 3.11 for now on all our images. + +- `peft-cpu`: PEFT compiled on CPU with all other HF libraries installed from main branch +- `peft-gpu`: PEFT compiled for NVIDIA GPUs with all other HF libraries installed from main branch +- `peft-gpu-bnb-source`: PEFT compiled for NVIDIA GPUs with `bitsandbytes` and all other HF libraries installed from main branch +- `peft-gpu-bnb-latest`: PEFT compiled for NVIDIA GPUs with `bitsandbytes` compiled from main and all other HF libraries installed from latest PyPI diff --git a/peft/docker/peft-cpu/Dockerfile b/peft/docker/peft-cpu/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..117c03e724e02bb235e123db2207341c8b5e47db --- /dev/null +++ b/peft/docker/peft-cpu/Dockerfile @@ -0,0 +1,52 @@ +# Builds CPU docker image of PyTorch +# Uses multi-staged approach to reduce size +# Stage 1 +# Use base conda image to reduce time +FROM continuumio/miniconda3:latest AS compile-image +# Specify py version +ENV PYTHON_VERSION=3.11 +# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +RUN apt-get update && \ + apt-get install -y curl git wget software-properties-common git-lfs && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + + +# Install audio-related libraries +RUN apt-get update && \ + apt install -y 
ffmpeg + +RUN apt install -y libsndfile1-dev +RUN git lfs install + +# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +RUN conda create --name peft python=${PYTHON_VERSION} ipython jupyter pip +RUN python3 -m pip install --no-cache-dir --upgrade pip + +# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +# NOTE(review): the original CUDA/torch-wheel remark came from the GPU Dockerfile; this CPU image has no CUDA stage +# Put the peft conda env first on PATH so python3/pip below resolve to that env +ENV PATH /opt/conda/envs/peft/bin:$PATH +# Make bash the default login shell and the shell used by the following RUN instructions +RUN chsh -s /bin/bash +SHELL ["/bin/bash", "-c"] +# Activate the conda env and install transformers, accelerate and peft[test] from source +RUN source activate peft && \ + python3 -m pip install --no-cache-dir \ + librosa \ + "soundfile>=0.12.1" \ + scipy \ + git+https://github.com/huggingface/transformers \ + git+https://github.com/huggingface/accelerate \ + peft[test]@git+https://github.com/huggingface/peft + +# Install apt libs +RUN apt-get update && \ + apt-get install -y curl git wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +RUN echo "source activate peft" >> ~/.profile + +# Default command: start a bash shell +CMD ["/bin/bash"] diff --git a/peft/docker/peft-gpu-bnb-latest/Dockerfile b/peft/docker/peft-gpu-bnb-latest/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..c8265ac5e2094be91c51585f64660a6062504a9f --- /dev/null +++ b/peft/docker/peft-gpu-bnb-latest/Dockerfile @@ -0,0 +1,68 @@ +# Builds GPU docker image of PEFT with bitsandbytes built from source +# Uses multi-staged approach to reduce size +# Stage 1 +# Use base conda image to reduce time +FROM continuumio/miniconda3:latest AS compile-image +# Specify py version +ENV PYTHON_VERSION=3.11 +# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +RUN apt-get update && \ + apt-get install -y curl git wget software-properties-common git-lfs && \ + apt-get clean 
&& \ + rm -rf /var/lib/apt/lists* + +# Install audio-related libraries +RUN apt-get update && \ + apt install -y ffmpeg + +RUN apt install -y libsndfile1-dev +RUN git lfs install + +# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +RUN conda create --name peft python=${PYTHON_VERSION} ipython jupyter pip +RUN python3 -m pip install --no-cache-dir --upgrade pip + +# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +# We don't install pytorch here yet since CUDA isn't available +# instead we use the direct torch wheel +ENV PATH /opt/conda/envs/peft/bin:$PATH +# Make bash the default login shell and the shell used by the following RUN instructions +RUN chsh -s /bin/bash +SHELL ["/bin/bash", "-c"] + +# Stage 2: CUDA devel image that reuses the conda installation copied from stage 1 +FROM nvidia/cuda:12.6.3-devel-ubuntu22.04 AS build-image +COPY --from=compile-image /opt/conda /opt/conda +ENV PATH /opt/conda/bin:$PATH + +RUN chsh -s /bin/bash +SHELL ["/bin/bash", "-c"] + +# Install apt libs (cmake is used by the bitsandbytes build below) +RUN apt-get update && \ + apt-get install -y curl git wget cmake && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +# Activate the conda env and install transformers, accelerate, peft, optimum and auto-gptq from PyPI (-U upgrades them) +# Also clone BNB and build it from source with the CUDA backend. +RUN source activate peft && \ + python3 -m pip install -U --no-cache-dir \ + librosa \ + "soundfile>=0.12.1" \ + scipy \ + transformers \ + accelerate \ + peft \ + optimum \ + auto-gptq && \ + git clone https://github.com/bitsandbytes-foundation/bitsandbytes && cd bitsandbytes && \ + cmake -B . -DCOMPUTE_BACKEND=cuda -S . && \ + cmake --build . && \ + pip install -e . 
&& \ + pip freeze | grep bitsandbytes + +RUN echo "source activate peft" >> ~/.profile + +# Default command: start a bash shell +CMD ["/bin/bash"] diff --git a/peft/docker/peft-gpu-bnb-source/Dockerfile b/peft/docker/peft-gpu-bnb-source/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..afaa619387d846b480ec26317a49f52b70571a09 --- /dev/null +++ b/peft/docker/peft-gpu-bnb-source/Dockerfile @@ -0,0 +1,68 @@ +# Builds GPU docker image of PEFT with bitsandbytes built from source +# Uses multi-staged approach to reduce size +# Stage 1 +# Use base conda image to reduce time +FROM continuumio/miniconda3:latest AS compile-image +# Specify py version +ENV PYTHON_VERSION=3.11 +# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +RUN apt-get update && \ + apt-get install -y curl git wget software-properties-common git-lfs && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +# Install audio-related libraries +RUN apt-get update && \ + apt install -y ffmpeg + +RUN apt install -y libsndfile1-dev +RUN git lfs install + +# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +RUN conda create --name peft python=${PYTHON_VERSION} ipython jupyter pip +RUN python3 -m pip install --no-cache-dir --upgrade pip + +# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +# We don't install pytorch here yet since CUDA isn't available +# instead we use the direct torch wheel +ENV PATH /opt/conda/envs/peft/bin:$PATH +# Make bash the default login shell and the shell used by the following RUN instructions +RUN chsh -s /bin/bash +SHELL ["/bin/bash", "-c"] + +# Stage 2: CUDA devel image that reuses the conda installation copied from stage 1 +FROM nvidia/cuda:12.6.3-devel-ubuntu22.04 AS build-image +COPY --from=compile-image /opt/conda /opt/conda +ENV PATH /opt/conda/bin:$PATH + +RUN chsh -s /bin/bash +SHELL ["/bin/bash", "-c"] + +# Install apt libs (cmake is used by the bitsandbytes build below) +RUN apt-get update && \ + apt-get install -y curl git wget cmake && \ + apt-get clean && \ + rm -rf 
/var/lib/apt/lists* + +# Activate the conda env and install transformers, accelerate and peft[test] from source (optimum and auto-gptq from PyPI) +# Also clone BNB and build it from source with the CUDA backend. +RUN source activate peft && \ + python3 -m pip install -U --no-cache-dir \ + librosa \ + "soundfile>=0.12.1" \ + scipy \ + git+https://github.com/huggingface/transformers \ + git+https://github.com/huggingface/accelerate \ + peft[test]@git+https://github.com/huggingface/peft \ + optimum \ + auto-gptq && \ + git clone https://github.com/bitsandbytes-foundation/bitsandbytes && cd bitsandbytes && \ + cmake -B . -DCOMPUTE_BACKEND=cuda -S . && \ + cmake --build . && \ + pip install -e . && \ + pip freeze | grep bitsandbytes + +RUN echo "source activate peft" >> ~/.profile + +# Default command: start a bash shell +CMD ["/bin/bash"] diff --git a/peft/docker/peft-gpu/Dockerfile b/peft/docker/peft-gpu/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..1a8230e68a8f325a110a99fbc8b78c0476db5d08 --- /dev/null +++ b/peft/docker/peft-gpu/Dockerfile @@ -0,0 +1,70 @@ +# Builds GPU docker image of PyTorch +# Uses multi-staged approach to reduce size +# Stage 1 +# Use base conda image to reduce time +FROM continuumio/miniconda3:latest AS compile-image +# Specify py version +ENV PYTHON_VERSION=3.11 +# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +# Install audio-related libraries +RUN apt-get update && \ + apt-get install -y curl git wget software-properties-common git-lfs ffmpeg libsndfile1-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +RUN git lfs install + +# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +RUN conda create --name peft python=${PYTHON_VERSION} ipython jupyter pip + +# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile +# We don't install pytorch here yet since CUDA isn't available +# instead we use 
the direct torch wheel +ENV PATH /opt/conda/envs/peft/bin:$PATH +# Make bash the default login shell and the shell used by the following RUN instructions +RUN chsh -s /bin/bash +SHELL ["/bin/bash", "-c"] + +# Stage 2: CUDA devel image that reuses the conda installation copied from stage 1 +FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS build-image +COPY --from=compile-image /opt/conda /opt/conda +ENV PATH /opt/conda/bin:$PATH + +# Install apt libs +RUN apt-get update && \ + apt-get install -y curl git wget && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +RUN chsh -s /bin/bash +SHELL ["/bin/bash", "-c"] +RUN source activate peft && \ + python3 -m pip install --no-cache-dir bitsandbytes optimum auto-gptq && \ + # Add autoawq for quantization testing + python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.7.post2/autoawq-0.2.7.post2-py3-none-any.whl && \ + python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ_kernels/releases/download/v0.0.9/autoawq_kernels-0.0.9-cp311-cp311-linux_x86_64.whl && \ + # Add eetq for quantization testing + python3 -m pip install --no-cache-dir git+https://github.com/NetEase-FuXi/EETQ.git + +# Activate the conda env and install transformers, accelerate and peft[test] from source +RUN source activate peft && \ + python3 -m pip install -U --no-cache-dir \ + librosa \ + "soundfile>=0.12.1" \ + scipy \ + torchao \ + git+https://github.com/huggingface/transformers \ + git+https://github.com/huggingface/accelerate \ + peft[test]@git+https://github.com/huggingface/peft \ + # Add aqlm for quantization testing (quoted so bash does not parse ">=1.0.2" as an output redirection) + "aqlm[gpu]>=1.0.2" \ + # Add HQQ for quantization testing + hqq + +RUN source activate peft && \ + pip freeze | grep transformers + +RUN echo "source activate peft" >> ~/.profile + +# Default command: start a bash shell +CMD ["/bin/bash"] diff --git a/peft/docs/Makefile b/peft/docs/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..8879933e6cda150267451c9e7d07dd22b7b0d3f1 --- /dev/null +++ b/peft/docs/Makefile @@ -0,0 +1,19 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from 
the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SOURCEDIR = source +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/peft/docs/README.md b/peft/docs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0b76173a66db251756401a35e1dfef5ae69794bb --- /dev/null +++ b/peft/docs/README.md @@ -0,0 +1,267 @@ + + +# Generating the documentation + +To generate the documentation, you first have to build it. Several packages are necessary to build the doc, +you can install them with the following command, at the root of the code repository: + +```bash +pip install -e ".[docs]" +``` + +Then you need to install our special tool that builds the documentation: + +```bash +pip install git+https://github.com/huggingface/doc-builder +``` + +--- +**NOTE** + +You only need to generate the documentation to inspect it locally (if you're planning changes and want to +check how they look before committing for instance). You don't have to commit to the built documentation. + +--- + +## Building the documentation + +Once you have setup the `doc-builder` and additional packages, you can generate the documentation by +typing the following command: + +```bash +doc-builder build peft docs/source/ --build_dir ~/tmp/test-build +``` + +You can adapt the `--build_dir` to set any temporary folder you prefer. This command will create it and generate +the MDX files that will be rendered as the documentation on the main website. You can inspect them in your favorite +Markdown editor. 
+ +## Previewing the documentation + +To preview the docs, first install the `watchdog` module with: + +```bash +pip install watchdog +``` + +Then run the following command: + +```bash +doc-builder preview {package_name} {path_to_docs} +``` + +For example: + +```bash +doc-builder preview peft docs/source +``` + +The docs will be viewable at [http://localhost:3000](http://localhost:3000). You can also preview the docs once you have opened a PR. You will see a bot add a comment with a link to where the documentation with your changes lives. + +--- +**NOTE** + +The `preview` command only works with existing doc files. When you add a completely new file, you need to update `_toctree.yml` and restart the `preview` command (`ctrl-c` to stop it, then call `doc-builder preview ...` again). + +--- + +## Adding a new element to the navigation bar + +Accepted files are Markdown (.md or .mdx). + +Create a file with its extension and put it in the source directory. You can then link it to the toc-tree by putting +the filename without the extension in the [`_toctree.yml`](https://github.com/huggingface/peft/blob/main/docs/source/_toctree.yml) file. + +## Renaming section headers and moving sections + +It helps to keep the old links working when renaming the section header and/or moving sections from one document to another. This is because the old links are likely to be used in Issues, Forums, and Social media and it'd make for a much better user experience if users reading those months later could still easily navigate to the originally intended information. + +Therefore, we simply keep a little map of moved sections at the end of the document where the original section was. The key is to preserve the original anchor. 
+ +So if you renamed a section from: "Section A" to "Section B", then you can add at the end of the file: + +``` +Sections that were moved: + +[ <a href="#section-b">Section A</a><a id="section-a"></a> ] +``` +and of course, if you moved it to another file, then: + +``` +Sections that were moved: + +[ <a href="../new-file#section-b">Section A</a><a id="section-a"></a> ] +``` + +Use the relative style to link to the new file so that the versioned docs continue to work. + + +## Writing Documentation - Specification + +The `huggingface/peft` documentation follows the +[Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style for docstrings, +although we can write them directly in Markdown. + +### Adding a new tutorial + +Adding a new tutorial or section is done in two steps: + +- Add a new file under `./source`. This file must be Markdown (.md or .mdx). +- Link that file in `./source/_toctree.yml` on the correct toc-tree. + +Make sure to put your new file under the proper section. It's unlikely to go in the first section (*Get Started*), so +depending on the intended targets (beginners, more advanced users, or researchers) it should go into sections two, three, or +four. + +### Writing source documentation + +Values that should be put in `code` should be surrounded by backticks: \`like so\`. Note that argument names +and objects like True, None, or any strings should usually be put in `code`. + +When mentioning a class, function, or method, it is recommended to use our syntax for internal links so that our tool +adds a link to its documentation with this syntax: \[\`XXXClass\`\] or \[\`function\`\]. This requires the class or +function to be in the main package. + +If you want to create a link to some internal class or function, you need to +provide its path. For instance: \[\`utils.gather\`\]. This will be converted into a link with +`utils.gather` in the description. 
To get rid of the path and only keep the name of the object you are +linking to in the description, add a ~: \[\`~utils.gather\`\] will generate a link with `gather` in the description. + +The same works for methods so you can either use \[\`XXXClass.method\`\] or \[\`~XXXClass.method\`\]. + +#### Defining arguments in a method + +Arguments should be defined with the `Args:` (or `Arguments:` or `Parameters:`) prefix, followed by a line return and +an indentation. The argument should be followed by its type, with its shape if it is a tensor, a colon, and its +description: + +``` + Args: + n_layers (`int`): The number of layers of the model. +``` + +If the description is too long to fit in one line (more than 119 characters in total), another indentation is necessary +before writing the description after the argument. + +Finally, to maintain uniformity if any *one* description is too long to fit on one line, the +rest of the parameters should follow suit and have an indentation before their description. + +Here's an example showcasing everything so far: + +``` + Args: + gradient_accumulation_steps (`int`, *optional*, defaults to 1): + The number of steps that should pass before gradients are accumulated. A number > 1 should be combined with `Accelerator.accumulate`. + cpu (`bool`, *optional*): + Whether or not to force the script to execute on CPU. Will ignore GPU available if set to `True` and force the execution on one process only. +``` + +For optional arguments or arguments with defaults we follow the following syntax: imagine we have a function with the +following signature: + +``` +def my_function(x: str = None, a: float = 1): +``` + +then its documentation should look like this: + +``` + Args: + x (`str`, *optional*): + This argument controls ... and has a description longer than 119 chars. + a (`float`, *optional*, defaults to 1): + This argument is used to ... and has a description longer than 119 chars. 
+``` + +Note that we always omit the "defaults to \`None\`" when None is the default for any argument. Also note that even +if the first line describing your argument type and its default gets long, you can't break it into several lines. You can +however write as many lines as you want in the indented description (see the `Args:` examples above). + +#### Writing a multi-line code block + +Multi-line code blocks can be useful for displaying examples. They are done between two lines of three backticks as usual in Markdown: + + +```` +```python +# first line of code +# second line +# etc +``` +```` + +#### Writing a return block + +The return block should be introduced with the `Returns:` prefix, followed by a line return and an indentation. +The first line should be the type of the return, followed by a line return. No need to indent further for the elements +building the return. + +Here's an example of a single value return: + +``` + Returns: + `List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token. +``` + +Here's an example of a tuple return, comprising several objects: + +``` + Returns: + `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs: + - **loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` -- + Total loss is the sum of the masked language modeling loss and the next sequence prediction (classification) loss. + - **prediction_scores** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) -- + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
+``` + +## Styling the docstring + +We have an automatic script running with the `make style` command that will make sure that: +- the docstrings fully take advantage of the line width +- all code examples are formatted using black, like the code of the Transformers library + +This script may have some weird failures if you make a syntax mistake or if you uncover a bug. Therefore, it's +recommended to commit your changes before running `make style`, so you can revert the changes done by that script +easily. + +## Writing documentation examples + +The syntax for example docstrings can look as follows: + +``` + Example: + + ```python + >>> import time + >>> from accelerate import Accelerator + >>> accelerator = Accelerator() + >>> if accelerator.is_main_process: + ... time.sleep(2) + ... else: + ... print("I'm waiting for the main process to finish its sleep...") + >>> accelerator.wait_for_everyone() + >>> # Should print on every process at the same time + >>> print("Everyone is here") + ``` +``` + +The docstring should give a minimal, clear example of how the respective function +is to be used in inference and also include the expected (ideally sensible) +output. +Often, readers will try out the example before even going through the function +or class definitions. Therefore, it is of utmost importance that the example +works as expected. diff --git a/peft/docs/source/_config.py b/peft/docs/source/_config.py new file mode 100644 index 0000000000000000000000000000000000000000..2974756ce8d5964a46caf9190b6cd0a207cc2fa0 --- /dev/null +++ b/peft/docs/source/_config.py @@ -0,0 +1,7 @@ +# docstyle-ignore +INSTALL_CONTENT = """ +# PEFT installation +! pip install peft accelerate transformers +# To install from source instead of the last release, comment the command above and uncomment the following one. +# ! 
pip install git+https://github.com/huggingface/peft.git +""" diff --git a/peft/docs/source/_toctree.yml b/peft/docs/source/_toctree.yml new file mode 100644 index 0000000000000000000000000000000000000000..f46a4a67b8879289589658b734948ef16f56b824 --- /dev/null +++ b/peft/docs/source/_toctree.yml @@ -0,0 +1,151 @@ +- title: Get started + sections: + - local: index + title: 🤗 PEFT + - local: quicktour + title: Quicktour + - local: install + title: Installation + +- title: Tutorial + sections: + - local: tutorial/peft_model_config + title: Configurations and models + - local: tutorial/peft_integrations + title: Integrations + +- title: PEFT method guides + sections: + - local: task_guides/prompt_based_methods + title: Prompt-based methods + - local: task_guides/lora_based_methods + title: LoRA methods + - local: task_guides/ia3 + title: IA3 + +- title: Developer guides + sections: + - local: developer_guides/model_merging + title: Model merging + - local: developer_guides/quantization + title: Quantization + - local: developer_guides/lora + title: LoRA + - local: developer_guides/custom_models + title: Custom models + - local: developer_guides/low_level_api + title: Adapter injection + - local: developer_guides/mixed_models + title: Mixed adapter types + - local: developer_guides/torch_compile + title: torch.compile + - local: developer_guides/contributing + title: Contribute to PEFT + - local: developer_guides/troubleshooting + title: Troubleshooting + - local: developer_guides/checkpoint + title: PEFT checkpoint format + +- title: 🤗 Accelerate integrations + sections: + - local: accelerate/deepspeed + title: DeepSpeed + - local: accelerate/fsdp + title: Fully Sharded Data Parallel + +- title: Conceptual guides + sections: + - local: conceptual_guides/adapter + title: Adapters + - local: conceptual_guides/prompting + title: Soft prompts + - local: conceptual_guides/ia3 + title: IA3 + - local: conceptual_guides/oft + title: OFT/BOFT + +- sections: + - sections: + - 
local: package_reference/auto_class + title: AutoPeftModel + - local: package_reference/peft_model + title: PEFT model + - local: package_reference/peft_types + title: PEFT types + - local: package_reference/config + title: Configuration + - local: package_reference/tuners + title: Tuner + title: Main classes + - sections: + - local: package_reference/adalora + title: AdaLoRA + - local: package_reference/ia3 + title: IA3 + - local: package_reference/llama_adapter + title: Llama-Adapter + - local: package_reference/loha + title: LoHa + - local: package_reference/lokr + title: LoKr + - local: package_reference/lora + title: LoRA + - local: package_reference/xlora + title: X-LoRA + - local: package_reference/adapter_utils + title: LyCORIS + - local: package_reference/multitask_prompt_tuning + title: Multitask Prompt Tuning + - local: package_reference/oft + title: OFT + - local: package_reference/boft + title: BOFT + - local: package_reference/poly + title: Polytropon + - local: package_reference/p_tuning + title: P-tuning + - local: package_reference/prefix_tuning + title: Prefix tuning + - local: package_reference/prompt_tuning + title: Prompt tuning + - local: package_reference/layernorm_tuning + title: Layernorm tuning + - local: package_reference/vera + title: VeRA + - local: package_reference/fourierft + title: FourierFT + - local: package_reference/vblora + title: VB-LoRA + - local: package_reference/hra + title: HRA + - local: package_reference/cpt + title: CPT + - local: package_reference/bone + title: Bone + - local: package_reference/trainable_tokens + title: Trainable Tokens + - local: package_reference/randlora + title: RandLora + - local: package_reference/shira + title: SHiRA + - local: package_reference/c3a + title: C3A + - local: package_reference/miss + title: MiSS + - local: package_reference/road + title: RoAd + - local: package_reference/waveft + title: WaveFT + + title: Adapters + - sections: + - local: package_reference/merge_utils + title: 
Model merge + - local: package_reference/helpers + title: Helpers + - local: package_reference/hotswap + title: Hotswapping adapters + - local: package_reference/functional + title: Functions for PEFT integration + title: Utilities + title: API reference diff --git a/peft/docs/source/accelerate/deepspeed.md b/peft/docs/source/accelerate/deepspeed.md new file mode 100644 index 0000000000000000000000000000000000000000..775987a9e9bd600a5b8a7c8354067c80c2ab4fd9 --- /dev/null +++ b/peft/docs/source/accelerate/deepspeed.md @@ -0,0 +1,449 @@ + + +# DeepSpeed + +[DeepSpeed](https://www.deepspeed.ai/) is a library designed for speed and scale for distributed training of large models with billions of parameters. At its core is the Zero Redundancy Optimizer (ZeRO) that shards optimizer states (ZeRO-1), gradients (ZeRO-2), and parameters (ZeRO-3) across data parallel processes. This drastically reduces memory usage, allowing you to scale your training to billion parameter models. To unlock even more memory efficiency, ZeRO-Offload reduces GPU compute and memory by leveraging CPU resources during optimization. + +Both of these features are supported in 🤗 Accelerate, and you can use them with 🤗 PEFT. + +## Compatibility with `bitsandbytes` quantization + LoRA + +Below is a table that summarizes the compatibility between PEFT's LoRA, [`bitsandbytes`](https://github.com/TimDettmers/bitsandbytes) library and DeepSpeed Zero stages with respect to fine-tuning. DeepSpeed Zero-1 and 2 will have no effect at inference as stage 1 shards the optimizer states and stage 2 shards the optimizer states and gradients: + +| DeepSpeed stage | Is compatible? | +|---|---| +| Zero-1 | 🟢 | +| Zero-2 | 🟢 | +| Zero-3 | 🟢 | + +For DeepSpeed Stage 3 + QLoRA, please refer to the section [Use PEFT QLoRA and DeepSpeed with ZeRO3 for finetuning large models on multiple GPUs](#use-peft-qlora-and-deepspeed-with-zero3-for-finetuning-large-models-on-multiple-gpus) below. 
+ +For confirming these observations, we ran the SFT (Supervised Fine-tuning) [official example scripts](https://github.com/huggingface/trl/tree/main/examples) of the [Transformers Reinforcement Learning (TRL) library](https://github.com/huggingface/trl) using QLoRA + PEFT and the accelerate configs available [here](https://github.com/huggingface/trl/tree/main/examples/accelerate_configs). We ran these experiments on 2x NVIDIA T4 GPUs. + +# Use PEFT and DeepSpeed with ZeRO3 for finetuning large models on multiple devices and multiple nodes + +This section of the guide will help you learn how to use our DeepSpeed [training script](https://github.com/huggingface/peft/blob/main/examples/sft/train.py) for performing SFT. You'll configure the script to do SFT (supervised fine-tuning) of the Llama-70B model with LoRA and ZeRO-3 on 8xH100 80GB GPUs on a single machine. You can configure it to scale to multiple machines by changing the accelerate config. + +## Configuration + +Start by running the following command to [create a DeepSpeed configuration file](https://huggingface.co/docs/accelerate/quicktour#launching-your-distributed-script) with 🤗 Accelerate. The `--config_file` flag allows you to save the configuration file to a specific location, otherwise it is saved as a `default_config.yaml` file in the 🤗 Accelerate cache. + +The configuration file is used to set the default options when you launch the training script. + +```bash +accelerate config --config_file deepspeed_config.yaml +``` + +You'll be asked a few questions about your setup, and configure the following arguments. In this example, you'll use ZeRO-3 so make sure you pick those options. + +```bash +`zero_stage`: [0] Disabled, [1] optimizer state partitioning, [2] optimizer+gradient state partitioning and [3] optimizer+gradient+parameter partitioning +`gradient_accumulation_steps`: Number of training steps to accumulate gradients before averaging and applying them. 
Pass the same value as you would pass via cmd argument, otherwise you will encounter a mismatch error. +`gradient_clipping`: Enable gradient clipping with value. Don't set this as you will be passing it via cmd arguments. +`offload_optimizer_device`: [none] Disable optimizer offloading, [cpu] offload optimizer to CPU, [nvme] offload optimizer to NVMe SSD. Only applicable with ZeRO >= Stage-2. Set this as `none` as we don't want to enable offloading. +`offload_param_device`: [none] Disable parameter offloading, [cpu] offload parameters to CPU, [nvme] offload parameters to NVMe SSD. Only applicable with ZeRO Stage-3. Set this as `none` as we don't want to enable offloading. +`zero3_init_flag`: Decides whether to enable `deepspeed.zero.Init` for constructing massive models. Only applicable with ZeRO Stage-3. Set this to `True`. +`zero3_save_16bit_model`: Decides whether to save 16-bit model weights when using ZeRO Stage-3. Set this to `True`. +`mixed_precision`: `no` for FP32 training, `fp16` for FP16 mixed-precision training and `bf16` for BF16 mixed-precision training. Set this to `bf16`. 
+``` + +Once this is done, the corresponding config should look like below and you can find it in config folder at [deepspeed_config.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/deepspeed_config.yaml): + +```yml +compute_environment: LOCAL_MACHINE +debug: false +deepspeed_config: + deepspeed_multinode_launcher: standard + gradient_accumulation_steps: 4 + offload_optimizer_device: none + offload_param_device: none + zero3_init_flag: true + zero3_save_16bit_model: true + zero_stage: 3 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +``` + +## Launch command + +The launch command is available at [run_peft_deepspeed.sh](https://github.com/huggingface/peft/blob/main/examples/sft/run_peft_deepspeed.sh) and it is also shown below: +```bash +accelerate launch --config_file "configs/deepspeed_config.yaml" train.py \ +--seed 100 \ +--model_name_or_path "meta-llama/Llama-2-70b-hf" \ +--dataset_name "smangrul/ultrachat-10k-chatml" \ +--chat_template_format "chatml" \ +--add_special_tokens False \ +--append_concat_token False \ +--splits "train,test" \ +--max_seq_len 2048 \ +--num_train_epochs 1 \ +--logging_steps 5 \ +--log_level "info" \ +--logging_strategy "steps" \ +--eval_strategy "epoch" \ +--save_strategy "epoch" \ +--push_to_hub \ +--hub_private_repo True \ +--hub_strategy "every_save" \ +--bf16 True \ +--packing True \ +--learning_rate 1e-4 \ +--lr_scheduler_type "cosine" \ +--weight_decay 1e-4 \ +--warmup_ratio 0.0 \ +--max_grad_norm 1.0 \ +--output_dir "llama-sft-lora-deepspeed" \ +--per_device_train_batch_size 8 \ +--per_device_eval_batch_size 8 \ +--gradient_accumulation_steps 4 \ +--gradient_checkpointing True \ +--use_reentrant False \ +--dataset_text_field "content" \ +--use_flash_attn True \ +--use_peft_lora True \ 
+--lora_r 8 \ +--lora_alpha 16 \ +--lora_dropout 0.1 \ +--lora_target_modules "all-linear" \ +--use_4bit_quantization False +``` + +Notice that we are using LoRA with rank=8, alpha=16 and targeting all linear layers. We are passing the deepspeed config file and finetuning 70B Llama model on a subset of the ultrachat dataset. + +## The important parts + +Let's dive a little deeper into the script so you can see what's going on, and understand how it works. + +The first thing to know is that the script uses DeepSpeed for distributed training as the DeepSpeed config has been passed. The [`~trl.SFTTrainer`] class handles all the heavy lifting of creating the PEFT model using the peft config that is passed. After that, when you call `trainer.train()`, [`~trl.SFTTrainer`] internally uses 🤗 Accelerate to prepare the model, optimizer and trainer using the DeepSpeed config to create DeepSpeed engine which is then trained. The main code snippet is below: + +```python +# trainer +trainer = SFTTrainer( + model=model, + processing_class=tokenizer, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + peft_config=peft_config, +) +trainer.accelerator.print(f"{trainer.model}") + +# train +checkpoint = None +if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint +trainer.train(resume_from_checkpoint=checkpoint) + +# saving final model +trainer.save_model() +``` + +## Memory usage + +In the above example, the memory consumed per GPU is 64 GB (80%) as seen in the screenshot below: + +
+ +
+GPU memory usage for the training run + +## More resources +You can also refer to this blog post [Falcon 180B Finetuning using 🤗 PEFT and DeepSpeed](https://medium.com/@sourabmangrulkar/falcon-180b-finetuning-using-peft-and-deepspeed-b92643091d99) on how to finetune the 180B Falcon model on 16 A100 GPUs on 2 machines. + + +# Use PEFT QLoRA and DeepSpeed with ZeRO3 for finetuning large models on multiple GPUs + +In this section, we will look at how to use QLoRA and DeepSpeed Stage-3 for finetuning a 70B llama model on 2X40GB GPUs. +For this, we first need `bitsandbytes>=0.43.3`, `accelerate>=1.0.1`, `transformers>4.44.2`, `trl>0.11.4` and `peft>0.13.0`. We need to set `zero3_init_flag` to true when using Accelerate config. Below is the config which can be found at [deepspeed_config_z3_qlora.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/deepspeed_config_z3_qlora.yaml): + +```yml +compute_environment: LOCAL_MACHINE +debug: false +deepspeed_config: + deepspeed_multinode_launcher: standard + offload_optimizer_device: none + offload_param_device: none + zero3_init_flag: true + zero3_save_16bit_model: true + zero_stage: 3 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 2 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +``` + +Launch command is given below which is available at [run_peft_qlora_deepspeed_stage3.sh](https://github.com/huggingface/peft/blob/main/examples/sft/run_peft_qlora_deepspeed_stage3.sh): +``` +accelerate launch --config_file "configs/deepspeed_config_z3_qlora.yaml" train.py \ +--seed 100 \ +--model_name_or_path "meta-llama/Llama-2-70b-hf" \ +--dataset_name "smangrul/ultrachat-10k-chatml" \ +--chat_template_format "chatml" \ +--add_special_tokens False \ +--append_concat_token False \ +--splits "train,test" \ +--max_seq_len 2048 \ +--num_train_epochs 1 \ 
+--logging_steps 5 \ +--log_level "info" \ +--logging_strategy "steps" \ +--eval_strategy "epoch" \ +--save_strategy "epoch" \ +--push_to_hub \ +--hub_private_repo True \ +--hub_strategy "every_save" \ +--bf16 True \ +--packing True \ +--learning_rate 1e-4 \ +--lr_scheduler_type "cosine" \ +--weight_decay 1e-4 \ +--warmup_ratio 0.0 \ +--max_grad_norm 1.0 \ +--output_dir "llama-sft-qlora-dsz3" \ +--per_device_train_batch_size 2 \ +--per_device_eval_batch_size 2 \ +--gradient_accumulation_steps 2 \ +--gradient_checkpointing True \ +--use_reentrant True \ +--dataset_text_field "content" \ +--use_flash_attn True \ +--use_peft_lora True \ +--lora_r 8 \ +--lora_alpha 16 \ +--lora_dropout 0.1 \ +--lora_target_modules "all-linear" \ +--use_4bit_quantization True \ +--use_nested_quant True \ +--bnb_4bit_compute_dtype "bfloat16" \ +--bnb_4bit_quant_storage_dtype "bfloat16" +``` + +Notice the new argument being passed, `bnb_4bit_quant_storage_dtype`, which denotes the data type for packing the 4-bit parameters. For example, when it is set to `bfloat16` (a 16-bit type), **16/4 = 4** 4-bit params are packed together post quantization. + +In terms of training code, the important code changes are: + +```diff +... + +bnb_config = BitsAndBytesConfig( + load_in_4bit=args.use_4bit_quantization, + bnb_4bit_quant_type=args.bnb_4bit_quant_type, + bnb_4bit_compute_dtype=compute_dtype, + bnb_4bit_use_double_quant=args.use_nested_quant, ++ bnb_4bit_quant_storage=quant_storage_dtype, +) + +... + +model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + quantization_config=bnb_config, + trust_remote_code=True, + attn_implementation="flash_attention_2" if args.use_flash_attn else "eager", ++ torch_dtype=quant_storage_dtype or torch.float32, +) +``` + +Notice that `torch_dtype` for `AutoModelForCausalLM` is the same as the `bnb_4bit_quant_storage` data type. That's it. Everything else is handled by Trainer and TRL. 
+ +## Memory usage + +In the above example, the memory consumed per GPU is **36.6 GB**. Therefore, what took 8X80GB GPUs with DeepSpeed Stage 3+LoRA and a couple of 80GB GPUs with DDP+QLoRA now requires 2X40GB GPUs. This makes finetuning of large models more accessible. + +# Use PEFT and DeepSpeed with ZeRO3 and CPU Offloading for finetuning large models on a single GPU +This section of guide will help you learn how to use our DeepSpeed [training script](https://github.com/huggingface/peft/blob/main/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py). You'll configure the script to train a large model for conditional generation with ZeRO-3 and CPU Offload. + +> [!TIP] +> 💡 To help you get started, check out our example training scripts for [causal language modeling](https://github.com/huggingface/peft/blob/main/examples/causal_language_modeling/peft_lora_clm_accelerate_ds_zero3_offload.py) and [conditional generation](https://github.com/huggingface/peft/blob/main/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py). You can adapt these scripts for your own applications or even use them out of the box if your task is similar to the one in the scripts. + +## Configuration + +Start by running the following command to [create a DeepSpeed configuration file](https://huggingface.co/docs/accelerate/quicktour#launching-your-distributed-script) with 🤗 Accelerate. The `--config_file` flag allows you to save the configuration file to a specific location, otherwise it is saved as a `default_config.yaml` file in the 🤗 Accelerate cache. + +The configuration file is used to set the default options when you launch the training script. + +```bash +accelerate config --config_file ds_zero3_cpu.yaml +``` + +You'll be asked a few questions about your setup, and configure the following arguments. In this example, you'll use ZeRO-3 along with CPU-Offload so make sure you pick those options. 
+ +```bash +`zero_stage`: [0] Disabled, [1] optimizer state partitioning, [2] optimizer+gradient state partitioning and [3] optimizer+gradient+parameter partitioning +`gradient_accumulation_steps`: Number of training steps to accumulate gradients before averaging and applying them. +`gradient_clipping`: Enable gradient clipping with value. +`offload_optimizer_device`: [none] Disable optimizer offloading, [cpu] offload optimizer to CPU, [nvme] offload optimizer to NVMe SSD. Only applicable with ZeRO >= Stage-2. +`offload_param_device`: [none] Disable parameter offloading, [cpu] offload parameters to CPU, [nvme] offload parameters to NVMe SSD. Only applicable with ZeRO Stage-3. +`zero3_init_flag`: Decides whether to enable `deepspeed.zero.Init` for constructing massive models. Only applicable with ZeRO Stage-3. +`zero3_save_16bit_model`: Decides whether to save 16-bit model weights when using ZeRO Stage-3. +`mixed_precision`: `no` for FP32 training, `fp16` for FP16 mixed-precision training and `bf16` for BF16 mixed-precision training. +``` + +An example [configuration file](https://github.com/huggingface/peft/blob/main/examples/conditional_generation/accelerate_ds_zero3_cpu_offload_config.yaml) might look like the following. The most important thing to notice is that `zero_stage` is set to `3`, and `offload_optimizer_device` and `offload_param_device` are set to the `cpu`. 
+ +```yml +compute_environment: LOCAL_MACHINE +deepspeed_config: + gradient_accumulation_steps: 1 + gradient_clipping: 1.0 + offload_optimizer_device: cpu + offload_param_device: cpu + zero3_init_flag: true + zero3_save_16bit_model: true + zero_stage: 3 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +dynamo_backend: 'NO' +fsdp_config: {} +machine_rank: 0 +main_training_function: main +megatron_lm_config: {} +mixed_precision: 'no' +num_machines: 1 +num_processes: 1 +rdzv_backend: static +same_network: true +use_cpu: false +``` + +## The important parts + +Let's dive a little deeper into the script so you can see what's going on, and understand how it works. + +Within the [`main`](https://github.com/huggingface/peft/blob/2822398fbe896f25d4dac5e468624dc5fd65a51b/examples/conditional_generation/peft_lora_seq2seq_accelerate_ds_zero3_offload.py#L103) function, the script creates an [`~accelerate.Accelerator`] class to initialize all the necessary requirements for distributed training. + +> [!TIP] +> 💡 Feel free to change the model and dataset inside the `main` function. If your dataset format is different from the one in the script, you may also need to write your own preprocessing function. + +The script also creates a configuration for the 🤗 PEFT method you're using, which in this case, is LoRA. The [`LoraConfig`] specifies the task type and important parameters such as the dimension of the low-rank matrices, the matrices scaling factor, and the dropout probability of the LoRA layers. If you want to use a different 🤗 PEFT method, make sure you replace `LoraConfig` with the appropriate [class](../package_reference/tuners). 
+ +```diff + def main(): ++ accelerator = Accelerator() + model_name_or_path = "facebook/bart-large" + dataset_name = "twitter_complaints" ++ peft_config = LoraConfig( + task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1 + ) +``` + +Throughout the script, you'll see the [`~accelerate.Accelerator.main_process_first`] and [`~accelerate.Accelerator.wait_for_everyone`] functions which help control and synchronize when processes are executed. + +The [`get_peft_model`] function takes a base model and the [`peft_config`] you prepared earlier to create a [`PeftModel`]: + +```diff + model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path) ++ model = get_peft_model(model, peft_config) +``` + +Pass all the relevant training objects to 🤗 Accelerate's [`~accelerate.Accelerator.prepare`] which makes sure everything is ready for training: + +```py +model, train_dataloader, eval_dataloader, test_dataloader, optimizer, lr_scheduler = accelerator.prepare( + model, train_dataloader, eval_dataloader, test_dataloader, optimizer, lr_scheduler +) +``` + +The next bit of code checks whether the DeepSpeed plugin is used in the `Accelerator`, and if the plugin exists, then we check if we are using ZeRO-3. 
This conditional flag is used when calling the `generate` function during inference to sync GPUs when the model parameters are sharded: + +```py +is_ds_zero_3 = False +if getattr(accelerator.state, "deepspeed_plugin", None): + is_ds_zero_3 = accelerator.state.deepspeed_plugin.zero_stage == 3 +``` + +Inside the training loop, the usual `loss.backward()` is replaced by 🤗 Accelerate's [`~accelerate.Accelerator.backward`] which uses the correct `backward()` method based on your configuration: + +```diff + for epoch in range(num_epochs): + with TorchTracemalloc() as tracemalloc: + model.train() + total_loss = 0 + for step, batch in enumerate(tqdm(train_dataloader)): + outputs = model(**batch) + loss = outputs.loss + total_loss += loss.detach().float() ++ accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() +``` + +That is all! The rest of the script handles the training loop, evaluation, and even pushes it to the Hub for you. + +## Train + +Run the following command to launch the training script. 
Earlier, you saved the configuration file to `ds_zero3_cpu.yaml`, so you'll need to pass the path to the launcher with the `--config_file` argument like this: + +```bash +accelerate launch --config_file ds_zero3_cpu.yaml examples/peft_lora_seq2seq_accelerate_ds_zero3_offload.py +``` + +You'll see some output logs that track memory usage during training, and once it's completed, the script returns the accuracy and compares the predictions to the labels: + +```bash +GPU Memory before entering the train : 1916 +GPU Memory consumed at the end of the train (end-begin): 66 +GPU Peak Memory consumed during the train (max-begin): 7488 +GPU Total Peak Memory consumed during the train (max): 9404 +CPU Memory before entering the train : 19411 +CPU Memory consumed at the end of the train (end-begin): 0 +CPU Peak Memory consumed during the train (max-begin): 0 +CPU Total Peak Memory consumed during the train (max): 19411 +epoch=4: train_ppl=tensor(1.0705, device='cuda:0') train_epoch_loss=tensor(0.0681, device='cuda:0') +100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:27<00:00, 3.92s/it] +GPU Memory before entering the eval : 1982 +GPU Memory consumed at the end of the eval (end-begin): -66 +GPU Peak Memory consumed during the eval (max-begin): 672 +GPU Total Peak Memory consumed during the eval (max): 2654 +CPU Memory before entering the eval : 19411 +CPU Memory consumed at the end of the eval (end-begin): 0 +CPU Peak Memory consumed during the eval (max-begin): 0 +CPU Total Peak Memory consumed during the eval (max): 19411 +accuracy=100.0 +eval_preds[:10]=['no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint', 'no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint'] +dataset['train'][label_column][:10]=['no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint', 'no complaint', 'no complaint', 'complaint', 'complaint', 'no complaint'] +``` + +# Caveats +1. 
Merging when using PEFT and DeepSpeed is currently unsupported and will raise an error. +2. When using CPU offloading, the major gains from using PEFT to shrink the optimizer states and gradients to that of the adapter weights would be realized on CPU RAM and there won't be savings with respect to GPU memory. +3. DeepSpeed Stage 3 and QLoRA, when used with CPU offloading, lead to more GPU memory usage when compared to disabling CPU offloading. + +> [!TIP] +> 💡 When you have code that requires merging (and unmerging) of weights, try to manually collect the parameters with DeepSpeed Zero-3 beforehand: +> +> ```python +> import deepspeed +> +> is_ds_zero_3 = ... # check if Zero-3 +> +> with deepspeed.zero.GatheredParameters(list(model.parameters()), enabled= is_ds_zero_3): +> model.merge_adapter() +> # do whatever is needed, then unmerge in the same context if unmerging is required +> ... +> model.unmerge_adapter() +> ``` diff --git a/peft/docs/source/accelerate/fsdp.md b/peft/docs/source/accelerate/fsdp.md new file mode 100644 index 0000000000000000000000000000000000000000..5ccec0c12c9d7bd767fca98bb5ce330a0d771108 --- /dev/null +++ b/peft/docs/source/accelerate/fsdp.md @@ -0,0 +1,285 @@ + + +# Fully Sharded Data Parallel + +[Fully sharded data parallel](https://pytorch.org/docs/stable/fsdp.html) (FSDP) is developed for distributed training of large pretrained models up to 1T parameters. FSDP achieves this by sharding the model parameters, gradients, and optimizer states across data parallel processes and it can also offload sharded model parameters to a CPU. The memory efficiency afforded by FSDP allows you to scale training to larger batch or model sizes. + +Both of these features are supported in 🤗 Accelerate, and you can use them with 🤗 PEFT. + +# Use PEFT and FSDP +This section of the guide will help you learn how to use our FSDP [training script](https://github.com/huggingface/peft/blob/main/examples/sft/train.py) for performing SFT. 
You'll configure the script to do SFT (supervised fine-tuning) of Llama-70B model with LoRA and FSDP on 8xH100 80GB GPUs on a single machine. You can configure it to scale to multiple machines by changing the accelerate config. + +## Configuration + +Start by running the following command to [create a FSDP configuration file](https://huggingface.co/docs/accelerate/quicktour#launching-your-distributed-script) with 🤗 Accelerate. The `--config_file` flag allows you to save the configuration file to a specific location, otherwise it is saved as a `default_config.yaml` file in the 🤗 Accelerate cache. + +The configuration file is used to set the default options when you launch the training script. + +```bash +accelerate config --config_file fsdp_config.yaml +``` + +You'll be asked a few questions about your setup, and configure the following arguments. In this example, you'll answer the questionnaire as shown in the image below. +
+ +
+Creating Accelerate's config to use FSDP + +Once this is done, the corresponding config should look like below and you can find it in config folder at [fsdp_config.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/fsdp_config.yaml): + +```yml +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +fsdp_config: + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch: BACKWARD_PRE + fsdp_cpu_ram_efficient_loading: true + fsdp_forward_prefetch: false + fsdp_offload_params: false + fsdp_sharding_strategy: FULL_SHARD + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true + fsdp_use_orig_params: false +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +``` + +## Launch command + +The launch command is available at [run_peft_fsdp.sh](https://github.com/huggingface/peft/blob/main/examples/sft/run_peft_fsdp.sh) and it is also shown below: +```bash +accelerate launch --config_file "configs/fsdp_config.yaml" train.py \ +--seed 100 \ +--model_name_or_path "meta-llama/Llama-2-70b-hf" \ +--dataset_name "smangrul/ultrachat-10k-chatml" \ +--chat_template_format "chatml" \ +--add_special_tokens False \ +--append_concat_token False \ +--splits "train,test" \ +--max_seq_len 2048 \ +--num_train_epochs 1 \ +--logging_steps 5 \ +--log_level "info" \ +--logging_strategy "steps" \ +--eval_strategy "epoch" \ +--save_strategy "epoch" \ +--push_to_hub \ +--hub_private_repo True \ +--hub_strategy "every_save" \ +--bf16 True \ +--packing True \ +--learning_rate 1e-4 \ +--lr_scheduler_type "cosine" \ +--weight_decay 1e-4 \ +--warmup_ratio 0.0 \ +--max_grad_norm 1.0 \ +--output_dir "llama-sft-lora-fsdp" \ +--per_device_train_batch_size 8 \ +--per_device_eval_batch_size 8 \ +--gradient_accumulation_steps 4 \ 
+--gradient_checkpointing True \ +--use_reentrant False \ +--dataset_text_field "content" \ +--use_flash_attn True \ +--use_peft_lora True \ +--lora_r 8 \ +--lora_alpha 16 \ +--lora_dropout 0.1 \ +--lora_target_modules "all-linear" \ +--use_4bit_quantization False +``` + +Notice that we are using LoRA with rank=8, alpha=16 and targeting all linear layers. We are passing the FSDP config file and finetuning the 70B Llama model on a subset of the [ultrachat dataset](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k). + +## The important parts + +Let's dive a little deeper into the script so you can see what's going on, and understand how it works. + +The first thing to know is that the script uses FSDP for distributed training as the FSDP config has been passed. The [`~trl.SFTTrainer`] class handles all the heavy lifting of creating PEFT model using the peft config that is passed. After that when you call `trainer.train()`, Trainer internally uses 🤗 Accelerate to prepare model, optimizer and trainer using the FSDP config to create FSDP wrapped model which is then trained. 
The main code snippet is below: + +```python +# trainer +trainer = SFTTrainer( + model=model, + processing_class=tokenizer, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + peft_config=peft_config, +) +trainer.accelerator.print(f"{trainer.model}") +if model_args.use_peft_lora: + # handle PEFT+FSDP case + trainer.model.print_trainable_parameters() + if getattr(trainer.accelerator.state, "fsdp_plugin", None): + from peft.utils.other import fsdp_auto_wrap_policy + + fsdp_plugin = trainer.accelerator.state.fsdp_plugin + fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(trainer.model) + +# train +checkpoint = None +if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint +trainer.train(resume_from_checkpoint=checkpoint) + +# saving final model +if trainer.is_fsdp_enabled: + trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT") +trainer.save_model() +``` + + +Here, one main thing to note currently when using FSDP with PEFT is that `use_orig_params` needs to be `False` to realize GPU memory savings. Due to `use_orig_params=False`, the auto wrap policy for FSDP needs to change so that trainable and non-trainable parameters are wrapped separately. This is done by the code snippet below, which uses the utility function `fsdp_auto_wrap_policy` from PEFT: + +``` +if getattr(trainer.accelerator.state, "fsdp_plugin", None): + from peft.utils.other import fsdp_auto_wrap_policy + + fsdp_plugin = trainer.accelerator.state.fsdp_plugin + fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(trainer.model) +``` + +## Memory usage + +In the above example, the memory consumed per GPU is 72-80 GB (90-98%) as seen in the screenshot below. 
The slight increase in GPU memory at the end is when saving the model using `FULL_STATE_DICT` state dict type instead of the `SHARDED_STATE_DICT` so that the model has adapter weights that can be loaded normally with `from_pretrained` method during inference: + +
+ +
+GPU memory usage for the training run + +# Use PEFT QLoRA and FSDP for finetuning large models on multiple GPUs + +In this section, we will look at how to use QLoRA and FSDP for finetuning 70B llama model on 2X24GB GPUs. [Answer.AI](https://www.answer.ai/) in collaboration with bitsandbytes and Hugging Face 🤗 open sourced code enabling the usage of FSDP+QLoRA and explained the whole process in their insightful blogpost [You can now train a 70b language model at home](https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html). This is now integrated in Hugging Face ecosystem. + +For this, we first need `bitsandbytes>=0.43.3`, `accelerate>=1.0.1`, `transformers>4.44.2`, `trl>0.11.4` and `peft>0.13.0`. We need to set `fsdp_cpu_ram_efficient_loading=true`, `fsdp_use_orig_params=false` and `fsdp_offload_params=true`(cpu offloading) when using Accelerate config. When not using accelerate launcher, you can alternately set the environment variable `export FSDP_CPU_RAM_EFFICIENT_LOADING=true`. Here, we will be using accelerate config and below is the config which can be found at [fsdp_config_qlora.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/fsdp_config_qlora.yaml): + +```yml +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +fsdp_config: + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch: BACKWARD_PRE + fsdp_cpu_ram_efficient_loading: true + fsdp_forward_prefetch: false + fsdp_offload_params: true + fsdp_sharding_strategy: FULL_SHARD + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true + fsdp_use_orig_params: false +machine_rank: 0 +main_training_function: main +mixed_precision: 'no' +num_machines: 1 +num_processes: 2 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +``` + +Launch command is given below which is available at 
[run_peft_qlora_fsdp.sh](https://github.com/huggingface/peft/blob/main/examples/sft/run_peft_qlora_fsdp.sh): +``` +accelerate launch --config_file "configs/fsdp_config_qlora.yaml" train.py \ +--seed 100 \ +--model_name_or_path "meta-llama/Llama-2-70b-hf" \ +--dataset_name "smangrul/ultrachat-10k-chatml" \ +--chat_template_format "chatml" \ +--add_special_tokens False \ +--append_concat_token False \ +--splits "train,test" \ +--max_seq_len 2048 \ +--num_train_epochs 1 \ +--logging_steps 5 \ +--log_level "info" \ +--logging_strategy "steps" \ +--eval_strategy "epoch" \ +--save_strategy "epoch" \ +--push_to_hub \ +--hub_private_repo True \ +--hub_strategy "every_save" \ +--bf16 True \ +--packing True \ +--learning_rate 1e-4 \ +--lr_scheduler_type "cosine" \ +--weight_decay 1e-4 \ +--warmup_ratio 0.0 \ +--max_grad_norm 1.0 \ +--output_dir "llama-sft-qlora-fsdp" \ +--per_device_train_batch_size 2 \ +--per_device_eval_batch_size 2 \ +--gradient_accumulation_steps 2 \ +--gradient_checkpointing True \ +--use_reentrant True \ +--dataset_text_field "content" \ +--use_flash_attn True \ +--use_peft_lora True \ +--lora_r 8 \ +--lora_alpha 16 \ +--lora_dropout 0.1 \ +--lora_target_modules "all-linear" \ +--use_4bit_quantization True \ +--use_nested_quant True \ +--bnb_4bit_compute_dtype "bfloat16" \ +--bnb_4bit_quant_storage_dtype "bfloat16" +``` + +Notice the new argument being passed, `bnb_4bit_quant_storage_dtype`, which denotes the data type for packing the 4-bit parameters. For example, when it is set to `bfloat16`, **16/4 = 4** 4-bit params are packed together post quantization. When using mixed precision training with `bfloat16`, `bnb_4bit_quant_storage_dtype` can be either `bfloat16` for pure `bfloat16` finetuning, or `float32` for automatic mixed precision (this consumes more GPU memory). When using mixed precision training with `float16`, `bnb_4bit_quant_storage_dtype` should be set to `float32` for stable automatic mixed precision training. 
+ +In terms of training code, the important code changes are: + +```diff +... + +bnb_config = BitsAndBytesConfig( + load_in_4bit=args.use_4bit_quantization, + bnb_4bit_quant_type=args.bnb_4bit_quant_type, + bnb_4bit_compute_dtype=compute_dtype, + bnb_4bit_use_double_quant=args.use_nested_quant, ++ bnb_4bit_quant_storage=quant_storage_dtype, +) + +... + +model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + quantization_config=bnb_config, + trust_remote_code=True, + attn_implementation="flash_attention_2" if args.use_flash_attn else "eager", ++ torch_dtype=quant_storage_dtype or torch.float32, +) +``` + +Notice that `torch_dtype` for `AutoModelForCausalLM` is the same as the `bnb_4bit_quant_storage` data type. That's it. Everything else is handled by Trainer and TRL. + +## Memory usage + +In the above example, the memory consumed per GPU is **19.6 GB** while CPU RAM usage is around **107 GB**. When disabling CPU offloading, the GPU memory usage is **35.6 GB/GPU**. Therefore, what took 16X80GB GPUs for full finetuning, 8X80GB GPUs with FSDP+LoRA, and a couple of 80GB GPUs with DDP+QLoRA, now requires 2X24GB GPUs. This makes finetuning of large models more accessible. + +## More resources +You can also refer to the [llama-recipes](https://github.com/facebookresearch/llama-recipes/?tab=readme-ov-file#fine-tuning) repo and [Getting started with Llama](https://llama.meta.com/get-started/#fine-tuning) guide on how to finetune using FSDP and PEFT. + +## Caveats +1. Merging when using PEFT and FSDP is currently unsupported and will raise an error. +2. Passing the `modules_to_save` config parameter is untested at present. +3. GPU Memory saving when using CPU Offloading is untested at present. +4. When using FSDP+QLoRA, `paged_adamw_8bit` currently results in an error when saving a checkpoint. +5. DoRA training with FSDP should work (albeit at lower speed than LoRA). 
If combined with bitsandbytes (QDoRA), 4-bit quantization should also work, but 8-bit quantization has known issues and is not recommended. diff --git a/peft/docs/source/conceptual_guides/adapter.md b/peft/docs/source/conceptual_guides/adapter.md new file mode 100644 index 0000000000000000000000000000000000000000..ef905a6b975c57f2d308dc6f39892f4f2261bdec --- /dev/null +++ b/peft/docs/source/conceptual_guides/adapter.md @@ -0,0 +1,136 @@ + + +# Adapters + +Adapter-based methods add extra trainable parameters after the attention and fully-connected layers of a frozen pretrained model to reduce memory-usage and speed up training. The method varies depending on the adapter, it could simply be an extra added layer or it could be expressing the weight updates ∆W as a low-rank decomposition of the weight matrix. Either way, the adapters are typically small but demonstrate comparable performance to a fully finetuned model and enable training larger models with fewer resources. + +This guide will give you a brief overview of the adapter methods supported by PEFT (if you're interested in learning more details about a specific method, take a look at the linked paper). + +## Low-Rank Adaptation (LoRA) + +> [!TIP] +> LoRA is one of the most popular PEFT methods and a good starting point if you're just getting started with PEFT. It was originally developed for large language models but it is a tremendously popular training method for diffusion models because of its efficiency and effectiveness. + +As mentioned briefly earlier, [LoRA](https://hf.co/papers/2106.09685) is a technique that accelerates finetuning large models while consuming less memory. + +LoRA represents the weight updates ∆W with two smaller matrices (called *update matrices*) through low-rank decomposition. These new matrices can be trained to adapt to the new data while keeping the overall number of parameters low. The original weight matrix remains frozen and doesn't receive any further updates. 
To produce the final results, the original and extra adapted weights are combined. You could also merge the adapter weights with the base model to eliminate inference latency. + +
+ +
+ +This approach has a number of advantages: + +* LoRA makes finetuning more efficient by drastically reducing the number of trainable parameters. +* The original pretrained weights are kept frozen, which means you can have multiple lightweight and portable LoRA models for various downstream tasks built on top of them. +* LoRA is orthogonal to other parameter-efficient methods and can be combined with many of them. +* Performance of models finetuned using LoRA is comparable to the performance of fully finetuned models. + +In principle, LoRA can be applied to any subset of weight matrices in a neural network to reduce the number of trainable parameters. However, for simplicity and further parameter efficiency, LoRA is typically only applied to the attention blocks in Transformer models. The resulting number of trainable parameters in a LoRA model depends on the size of the update matrices, which is determined mainly by the rank `r` and the shape of the original weight matrix. + +
+ +
+Navigating Text-To-Image Customization: From LyCORIS Fine-Tuning to Model Evaluation + +## Mixture of LoRA Experts (X-LoRA) + +[X-LoRA](https://huggingface.co/papers/2402.07148) is a mixture of experts method for LoRA which works by using dense or sparse gating to dynamically activate LoRA experts. The LoRA experts as well as the base model are frozen during training, resulting in a low parameter count as only the gating layers must be trained. In particular, the gating layers output scalings which (depending on config) are granular on the layer and token level. Additionally, during inference, X-LoRA dynamically activates LoRA adapters to recall knowledge and effectively mix them: + +The below graphic demonstrates how the scalings change for different prompts for each token. This highlights the activation of different adapters as the generation progresses and the sequence creates new context. + +![Token-by-token scalings](https://github.com/EricLBuehler/xlora/raw/master/res/token_by_token_scalings.gif) + +For each step, X-LoRA requires the base model to be run twice: first, to get hidden states without any LoRA adapters, and secondly, the hidden states are used to calculate scalings which are applied to the LoRA adapters and the model is run a second time. The output of the second run is the result of the model step. + +Ultimately, X-LoRA allows the model to reflect upon its knowledge because of the dual forward pass scheme, and dynamically reconfigure the architecture. + +## Low-Rank Hadamard Product (LoHa) + +Low-rank decomposition can impact performance because the weight updates are limited to the low-rank space, which can constrain a model's expressiveness. However, you don't necessarily want to use a larger rank because it increases the number of trainable parameters. 
To address this, [LoHa](https://huggingface.co/papers/2108.06098) (a method originally developed for computer vision) was applied to diffusion models where the ability to generate diverse images is an important consideration. LoHa should also work with general model types, but the embedding layers aren't currently implemented in PEFT. + +LoHa uses the [Hadamard product](https://en.wikipedia.org/wiki/Hadamard_product_(matrices)) (element-wise product) instead of the matrix product. ∆W is represented by four smaller matrices instead of two - like in LoRA - and each pair of these low-rank matrices are combined with the Hadamard product. As a result, ∆W can have the same number of trainable parameters but a higher rank and expressivity. + +## Low-Rank Kronecker Product (LoKr) + +[LoKr](https://hf.co/papers/2309.14859) is very similar to LoRA and LoHa, and it is also mainly applied to diffusion models, though you could also use it with other model types. LoKr replaces the matrix product with the [Kronecker product](https://en.wikipedia.org/wiki/Kronecker_product) instead. The Kronecker product decomposition creates a block matrix which preserves the rank of the original weight matrix. Another benefit of the Kronecker product is that it can be vectorized by stacking the matrix columns. This can speed up the process because you're avoiding fully reconstructing ∆W. + +## Orthogonal Finetuning (OFT) + +
+ +
+Controlling Text-to-Image Diffusion by Orthogonal Finetuning + +[OFT](https://hf.co/papers/2306.07280) is a method that primarily focuses on preserving a pretrained model's generative performance in the finetuned model. It tries to maintain the same cosine similarity (hyperspherical energy) between all pairwise neurons in a layer because this better captures the semantic information among neurons. This means OFT is more capable at preserving the subject and it is better for controllable generation (similar to [ControlNet](https://huggingface.co/docs/diffusers/using-diffusers/controlnet)). + +OFT preserves the hyperspherical energy by learning an orthogonal transformation for neurons to keep the cosine similarity between them unchanged. In practice, this means taking the matrix product of an orthogonal matrix with the pretrained weight matrix. However, to be parameter-efficient, the orthogonal matrix is represented as a block-diagonal matrix with rank `r` blocks. Whereas LoRA reduces the number of trainable parameters with low-rank structures, OFT reduces the number of trainable parameters with a sparse block-diagonal matrix structure. + +## Orthogonal Butterfly (BOFT) + +[BOFT](https://hf.co/papers/2311.06243) is an improved orthogonal finetuning method that focuses on preserving a pretrained model's generative capabilities while being significantly more parameter-efficient than standard OFT. Like OFT, BOFT maintains the same cosine similarity (hyperspherical energy) between all pairwise neurons in a layer by applying an orthogonal transformation to the pretrained weight matrix, ensuring the semantic relationships among neurons are preserved. + +Instead of using a block-diagonal orthogonal matrix, BOFT factorizes the orthogonal transformation into a product of **sparse butterfly matrices** (originally introduced in the [Cooley–Tukey FFT](https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm)). 
Unlike OFT's block-diagonal rotations, which only mix inputs within each block, the butterfly structure guarantees that every input can influence every output, producing a **dense connectivity** with just `O(d log d)` parameters. This factorization preserves expressivity while drastically reducing the parameter count compared to OFT (at the expense of computation time). + +In practice, BOFT multiplies each pretrained weight matrix by a sequence of butterfly-structured orthogonal factors, enabling efficient and expressive neuron rotations. This makes BOFT well-suited for controllable generation and tasks where maintaining the pretrained model's subject representation is critical, while also scaling to larger models with lower memory and compute overhead. + +## Adaptive Low-Rank Adaptation (AdaLoRA) + +[AdaLoRA](https://hf.co/papers/2303.10512) manages the parameter budget introduced from LoRA by allocating more parameters - in other words, a higher rank `r` - for important weight matrices that are better adapted for a task and pruning less important ones. The rank is controlled by a method similar to singular value decomposition (SVD). The ∆W is parameterized with two orthogonal matrices and a diagonal matrix which contains singular values. This parametrization method avoids iteratively applying SVD which is computationally expensive. Based on this method, the rank of ∆W is adjusted according to an importance score. ∆W is divided into triplets and each triplet is scored according to its contribution to model performance. Triplets with low importance scores are pruned and triplets with high importance scores are kept for finetuning. + +Training with AdaLoRA has three phases: the init phase, the budgeting phase and the final phase. In the initial phase, no budgeting is applied, therefore the ranks are not touched. 
During the budgeting phase the process described above is applied and the rank is redistributed according to a budget, aiming to give more important adapters more rank and less important layers less. When reaching the final phase, budgeting has ended, the ranks are redistributed but we may continue training for a while with the redistributed ranks to further improve performance. + +## Llama-Adapter + +[Llama-Adapter](https://hf.co/papers/2303.16199) is a method for adapting Llama into an instruction-following model. To help adapt the model for instruction-following, the adapter is trained with a 52K instruction-output dataset. + +A set of learnable adaption prompts are prefixed to the input instruction tokens. These are inserted into the upper layers of the model because it is better to learn with the higher-level semantics of the pretrained model. The instruction-output tokens prefixed to the input guide the adaption prompt to generate a contextual response. + +
+ +
+LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention + +To avoid adding noise to the tokens, the adapter uses zero-initialized attention. On top of this, the adapter adds a learnable gating factor (initialized with zeros) to progressively add information to the model during training. This prevents overwhelming the model's pretrained knowledge with the newly learned instructions. + +## Householder Reflection Adaptation (HRA) + +[HRA](https://huggingface.co/papers/2405.17484) provides a new perspective connecting LoRA to OFT, which means it can harness the advantages of both strategies, reduce parameters and computation costs while penalizing the loss of pre-training knowledge. + +
+ +
+Bridging The Gap between Low-rank and Orthogonal Adaptation via Householder Reflection Adaptation + +HRA constructs a chain of `r` trainable Householder reflections (HRs). Because the Householder reflection matrix is an orthogonal matrix and the product of orthogonal matrices is also an orthogonal matrix, HRA satisfies the theoretical guarantee of Orthogonal Finetuning (OFT). Meanwhile, HRA can also be viewed as a low-rank fine-tuning adapter by rewriting the formula. + +The higher `r`, the more trainable parameters, resulting in a larger model capacity and better performance. Besides, due to the chain structure, the orthogonality of HR planes impacts the capacity and regularity of HRA. To achieve a trade-off between the model capacity and regularity, an orthogonality regularizer of the HR planes is added to the loss function. The weight \\(\lambda\\) can control the strength of the regularizer. + +## Bone +See [MiSS](https://huggingface.co/papers/2409.15371), the new version of the paper (MiSS: Balancing LoRA Performance and Efficiency with Simple Shard Sharing). +If you already have a Bone checkpoint, you can use `/scripts/convert-bone-to-miss.py` to convert it into a MiSS checkpoint and proceed with training using MiSS. + +## MiSS +[MiSS](https://huggingface.co/papers/2409.15371) (Matrix Shard Sharing) is a novel Parameter-Efficient Fine-Tuning (PEFT) method designed to address the trade-off between adaptability and efficiency in Large Language Models. The core approach of MiSS involves a simple shard-sharing mechanism. It achieves low-rank adaptation by decomposing a weight matrix into multiple fragments and then utilizing a shared, trainable "common fragment." The final low-rank update matrix is constructed by replicating these shared, partitioned shards.
(MiSS is a novel PEFT method that adopts a low-rank structure, requires only a single trainable matrix, and introduces a new update mechanism distinct from LoRA, achieving an excellent balance between performance and efficiency.) + +MiSS: Balancing LoRA Performance and Efficiency with Simple Shard Sharing + +Intuitively, the shape of a single trainable matrix in MiSS is consistent with `lora_B`, so for the same `r`, MiSS has (`in_features * r`) fewer trainable parameters than LoRA. + +Note: Bat's r (b) is special and requires that weight W satisfies the conditions `in_features % r == 0` and `out_features % r == 0`. Additionally, when `in_features == out_features` and MiSS-r equals LoRA-r, MiSS's number of trainable parameters is only half that of LoRA. + +Although the nonlinear updates of Bat bring some performance improvements, they also increase computational overhead. Its main purpose is to provide researchers with a direction for improvement. Therefore, we recommend fine-tuning the comprehensive MiSS model instead. \ No newline at end of file diff --git a/peft/docs/source/conceptual_guides/ia3.md b/peft/docs/source/conceptual_guides/ia3.md new file mode 100644 index 0000000000000000000000000000000000000000..92daaac1053f3e2dec01b140f4f053bccece5805 --- /dev/null +++ b/peft/docs/source/conceptual_guides/ia3.md @@ -0,0 +1,68 @@ + + +# IA3 + +This conceptual guide gives a brief overview of [IA3](https://huggingface.co/papers/2205.05638), a parameter-efficient fine tuning technique that is +intended to improve over [LoRA](./lora). + +To make fine-tuning more efficient, IA3 (Infused Adapter by Inhibiting and Amplifying Inner Activations) +rescales inner activations with learned vectors. These learned vectors are injected in the attention and feedforward modules +in a typical transformer-based architecture. These learned vectors are the only trainable parameters during fine-tuning, and thus the original +weights remain frozen.
Dealing with learned vectors (as opposed to learned low-rank updates to a weight matrix like LoRA) +keeps the number of trainable parameters much smaller. + +Being similar to LoRA, IA3 carries many of the same advantages: + +* IA3 makes fine-tuning more efficient by drastically reducing the number of trainable parameters. (For T0, an IA3 model only has about 0.01% trainable parameters, while even LoRA has > 0.1%) +* The original pre-trained weights are kept frozen, which means you can have multiple lightweight and portable IA3 models for various downstream tasks built on top of them. +* Performance of models fine-tuned using IA3 is comparable to the performance of fully fine-tuned models. +* IA3 does not add any inference latency because adapter weights can be merged with the base model. + +In principle, IA3 can be applied to any subset of weight matrices in a neural network to reduce the number of trainable +parameters. Following the authors' implementation, IA3 weights are added to the key, value and feedforward layers +of a Transformer model. To be specific, for transformer models, IA3 weights are added to the outputs of key and value layers, and to the input of the second feedforward layer +in each transformer block. + +Given the target layers for injecting IA3 parameters, the number of trainable parameters +can be determined based on the size of the weight matrices. + + +## Common IA3 parameters in PEFT + +As with other methods supported by PEFT, to fine-tune a model using IA3, you need to: + +1. Instantiate a base model. +2. Create a configuration (`IA3Config`) where you define IA3-specific parameters. +3. Wrap the base model with `get_peft_model()` to get a trainable `PeftModel`. +4. Train the `PeftModel` as you normally would train the base model. + +`IA3Config` allows you to control how IA3 is applied to the base model through the following parameters: + +- `target_modules`: The modules (for example, attention blocks) to apply the IA3 vectors. 
+- `feedforward_modules`: The list of modules to be treated as feedforward layers in `target_modules`. While learned vectors are multiplied with +the output activation for attention blocks, the vectors are multiplied with the input for classic feedforward layers. Note that `feedforward_modules` must be a subset of `target_modules`. +- `modules_to_save`: List of modules apart from IA3 layers to be set as trainable and saved in the final checkpoint. These typically include model's custom head that is randomly initialized for the fine-tuning task. + +## Example Usage + +For the task of sequence classification, one can initialize the IA3 config for a Llama model as follows: + +```py +peft_config = IA3Config( + task_type=TaskType.SEQ_CLS, target_modules=["k_proj", "v_proj", "down_proj"], feedforward_modules=["down_proj"] +) +``` \ No newline at end of file diff --git a/peft/docs/source/conceptual_guides/oft.md b/peft/docs/source/conceptual_guides/oft.md new file mode 100644 index 0000000000000000000000000000000000000000..14e96db828e4e9025552aa840da03f8a3946025a --- /dev/null +++ b/peft/docs/source/conceptual_guides/oft.md @@ -0,0 +1,165 @@ + + +# Orthogonal Finetuning (OFT and BOFT) + +This conceptual guide gives a brief overview of [OFT](https://huggingface.co/papers/2306.07280), [OFTv2](https://www.arxiv.org/abs/2506.19847) and [BOFT](https://huggingface.co/papers/2311.06243), a parameter-efficient fine-tuning technique that utilizes orthogonal matrix to multiplicatively transform the pretrained weight matrices. + +To achieve efficient fine-tuning, OFT represents the weight updates with an orthogonal transformation. The orthogonal transformation is parameterized by an orthogonal matrix multiplied to the pretrained weight matrix. These new matrices can be trained to adapt to the new data while keeping the overall number of changes low. The original weight matrix remains frozen and doesn't receive any further adjustments. 
To produce the final results, both the original and the adapted weights are multiplied together. + +Orthogonal Butterfly (BOFT) generalizes OFT with Butterfly factorization and further improves its parameter efficiency and finetuning flexibility. In short, OFT can be viewed as a special case of BOFT. Unlike LoRA, which uses additive low-rank weight updates, BOFT uses multiplicative orthogonal weight updates. The comparison is shown below. + +
+ +
+ + +BOFT has some advantages compared to LoRA: + +* BOFT proposes a simple yet generic way to finetune pretrained models to downstream tasks, yielding a better preservation of pretraining knowledge and a better parameter efficiency. +* Through the orthogonality, BOFT introduces a structural constraint, i.e., keeping the [hyperspherical energy](https://huggingface.co/papers/1805.09298) unchanged during finetuning. This can effectively reduce the forgetting of pretraining knowledge. +* BOFT uses the butterfly factorization to efficiently parameterize the orthogonal matrix, which yields a compact yet expressive learning space (i.e., hypothesis class). +* The sparse matrix decomposition in BOFT brings in additional inductive biases that are beneficial to generalization. + +In principle, BOFT can be applied to any subset of weight matrices in a neural network to reduce the number of trainable parameters. Given the target layers for injecting BOFT parameters, the number of trainable parameters can be determined based on the size of the weight matrices. + +## Merge OFT/BOFT weights into the base model + +Similar to LoRA, the weights learned by OFT/BOFT can be integrated into the pretrained weight matrices using the merge_and_unload() function. This function merges the adapter weights with the base model which allows you to effectively use the newly merged model as a standalone model. + +
+ +
+ +This works because during training, the orthogonal weight matrix (R in the diagram above) and the pretrained weight matrices are separate. But once training is complete, these weights can actually be merged (multiplied) into a new weight matrix that is equivalent. + +## Utils for OFT / BOFT + +### Common OFT / BOFT parameters in PEFT + +As with other methods supported by PEFT, to fine-tune a model using OFT or BOFT, you need to: + +1. Instantiate a base model. +2. Create a configuration (`OFTConfig` or `BOFTConfig`) where you define OFT/BOFT-specific parameters. +3. Wrap the base model with `get_peft_model()` to get a trainable `PeftModel`. +4. Train the `PeftModel` as you normally would train the base model. + + +### OFT-specific parameters + +`OFTConfig` allows you to control how OFT is applied to the base model through the following parameters: + +- `r`: OFT rank, number of OFT blocks per injected layer. **Bigger** `r` results in more sparse update matrices with **fewer** trainable parameters. **Note**: You can only specify either `r` or `oft_block_size`, but not both simultaneously, because `r` × `oft_block_size` = layer dimension. For simplicity, we let the user specify either `r` or `oft_block_size` and infer the other one. The default is `r = 0`; the user is advised to set `oft_block_size` instead for better clarity. +- `oft_block_size`: OFT block size across different layers. **Bigger** `oft_block_size` results in more dense update matrices with **more** trainable parameters. **Note**: Please choose `oft_block_size` so that it evenly divides the layer's input dimension (`in_features`), e.g., 4, 8, 16. You can only specify either `r` or `oft_block_size`, but not both simultaneously, because `r` × `oft_block_size` = layer dimension. For simplicity, we let the user specify either `r` or `oft_block_size` and infer the other one. The default is `oft_block_size = 32`.
+- `use_cayley_neumann`: Specifies whether to use the Cayley-Neumann parameterization (efficient but approximate) or the vanilla Cayley parameterization (exact but computationally expensive because of matrix inverse). We recommend setting it to `True` for better efficiency, but performance may be slightly worse because of the approximation error. Please test both settings (`True` and `False`) depending on your needs. Default is `False`. +- `module_dropout`: The multiplicative dropout probability, by setting OFT blocks to identity during training, similar to the dropout layer in LoRA. +- `bias`: specify if the `bias` parameters should be trained. Can be `"none"`, `"all"` or `"oft_only"`. +- `target_modules`: The modules (for example, attention blocks) to inject the OFT matrices. +- `modules_to_save`: List of modules apart from OFT matrices to be set as trainable and saved in the final checkpoint. These typically include model's custom head that is randomly initialized for the fine-tuning task. + +### BOFT-specific parameters + +`BOFTConfig` allows you to control how BOFT is applied to the base model through the following parameters: + +- `boft_block_size`: the BOFT matrix block size across different layers, expressed in `int`. **Bigger** `boft_block_size` results in more dense update matrices with **more** trainable parameters. **Note**, please choose `boft_block_size` so that it evenly divides most layers' input dimension (`in_features`), e.g., 4, 8, 16. Also, please only +specify either `boft_block_size` or `boft_block_num`; do not set both simultaneously or leave both at 0, because `boft_block_size` x `boft_block_num` must equal the layer's input dimension. +- `boft_block_num`: the number of BOFT matrix blocks across different layers, expressed in `int`. **Bigger** `boft_block_num` results in sparser update matrices with **fewer** trainable parameters. **Note**, please choose `boft_block_num` so that it evenly divides most layers' input dimension (`in_features`), e.g., 4, 8, 16.
Also, please only +specify either `boft_block_size` or `boft_block_num`, but not both simultaneously or leaving both to 0, because `boft_block_size` x `boft_block_num` must equal the layer's input dimension. +- `boft_n_butterfly_factor`: the number of butterfly factors. **Note**, for `boft_n_butterfly_factor=1`, BOFT is the same as vanilla OFT, for `boft_n_butterfly_factor=2`, the effective block size of OFT becomes twice as big and the number of blocks become half. +- `bias`: specify if the `bias` parameters should be trained. Can be `"none"`, `"all"` or `"boft_only"`. +- `boft_dropout`: specify the probability of multiplicative dropout. +- `target_modules`: The modules (for example, attention blocks) to inject the OFT/BOFT matrices. +- `modules_to_save`: List of modules apart from OFT/BOFT matrices to be set as trainable and saved in the final checkpoint. These typically include model's custom head that is randomly initialized for the fine-tuning task. + + + +## OFT Example Usage + +For using OFT for quantized finetuning with [TRL](https://github.com/huggingface/trl) for `SFT`, `PPO`, or `DPO` fine-tuning, follow the following outline: + +```py +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig +from trl import SFTTrainer +from peft import OFTConfig + +if use_quantization: + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_storage=torch.bfloat16, + ) + +model = AutoModelForCausalLM.from_pretrained( + "model_name", + quantization_config=bnb_config +) +tokenizer = AutoTokenizer.from_pretrained("model_name") + +# Configure OFT +peft_config = OFTConfig( + oft_block_size=32, + use_cayley_neumann=True, + target_modules="all-linear", + bias="none", + task_type="CAUSAL_LM" +) + +trainer = SFTTrainer( + model=model, + train_dataset=ds['train'], + peft_config=peft_config, + processing_class=tokenizer, + 
args=training_arguments, + data_collator=collator, +) + +trainer.train() +``` + + +## BOFT Example Usage + +For an example of the BOFT method application to various downstream tasks, please refer to the following guides: + +Take a look at the following step-by-step guides on how to finetune a model with BOFT: +- [Dreambooth finetuning with BOFT](https://github.com/huggingface/peft/blob/main/examples/boft_dreambooth/boft_dreambooth.md) +- [Controllable generation finetuning with BOFT (ControlNet)](https://github.com/huggingface/peft/blob/main/examples/boft_controlnet/boft_controlnet.md) + +For the task of image classification, one can initialize the BOFT config for a DinoV2 model as follows: + +```py +import transformers +from transformers import AutoModelForSeq2SeqLM, BOFTConfig +from peft import BOFTConfig, get_peft_model + +config = BOFTConfig( + boft_block_size=4, + boft_n_butterfly_factor=2, + target_modules=["query", "value", "key", "output.dense", "mlp.fc1", "mlp.fc2"], + boft_dropout=0.1, + bias="boft_only", + modules_to_save=["classifier"], +) + +model = transformers.Dinov2ForImageClassification.from_pretrained( + "facebook/dinov2-large", + num_labels=100, +) + +boft_model = get_peft_model(model, config) +``` diff --git a/peft/docs/source/conceptual_guides/prompting.md b/peft/docs/source/conceptual_guides/prompting.md new file mode 100644 index 0000000000000000000000000000000000000000..733ffbf461e695adb9d2eaf19ab89d894ee54cd0 --- /dev/null +++ b/peft/docs/source/conceptual_guides/prompting.md @@ -0,0 +1,93 @@ + + +# Soft prompts + +Training large pretrained language models is very time-consuming and compute-intensive. As they continue to grow in size, there is increasing interest in more efficient training methods such as *prompting*. Prompting primes a frozen pretrained model for a specific downstream task by including a text prompt that describes the task or even demonstrates an example of the task. 
With prompting, you can avoid fully training a separate model for each downstream task, and use the same frozen pretrained model instead. This is a lot easier because you can use the same model for several different tasks, and it is significantly more efficient to train and store a smaller set of prompt parameters than to train all the model's parameters. + +There are two categories of prompting methods: + +- hard prompts are manually handcrafted text prompts with discrete input tokens; the downside is that it requires a lot of effort to create a good prompt +- soft prompts are learnable tensors concatenated with the input embeddings that can be optimized to a dataset; the downside is that they aren't human readable because you aren't matching these "virtual tokens" to the embeddings of a real word + +This conceptual guide provides a brief overview of the soft prompt methods included in 🤗 PEFT: prompt tuning, prefix tuning, P-tuning, and multitask prompt tuning. + +## Prompt tuning + +
+ +
+Only train and store a significantly smaller set of task-specific prompt parameters (image source). + +[Prompt tuning](https://hf.co/papers/2104.08691) was developed for text classification tasks on T5 models, and all downstream tasks are cast as a text generation task. For example, sequence classification usually assigns a single class label to a sequence of text. By casting it as a text generation task, the tokens that make up the class label are *generated*. Prompts are added to the input as a series of tokens. Typically, the model parameters are fixed which means the prompt tokens are also fixed by the model parameters. + +The key idea behind prompt tuning is that prompt tokens have their own parameters that are updated independently. This means you can keep the pretrained model's parameters frozen, and only update the gradients of the prompt token embeddings. The results are comparable to the traditional method of training the entire model, and prompt tuning performance scales as model size increases. + +Take a look at [Prompt tuning for causal language modeling](../task_guides/clm-prompt-tuning) for a step-by-step guide on how to train a model with prompt tuning. + +## Prefix tuning + +
+ +
+Optimize the prefix parameters for each task (image source). + +[Prefix tuning](https://hf.co/papers/2101.00190) was designed for natural language generation (NLG) tasks on GPT models. It is very similar to prompt tuning; prefix tuning also prepends a sequence of task-specific vectors to the input that can be trained and updated while keeping the rest of the pretrained model's parameters frozen. + +The main difference is that the prefix parameters are inserted in **all** of the model layers, whereas prompt tuning only adds the prompt parameters to the model input embeddings. The prefix parameters are also optimized by a separate feed-forward network (FFN) instead of training directly on the soft prompts because it causes instability and hurts performance. The FFN is discarded after updating the soft prompts. + +As a result, the authors found that prefix tuning demonstrates comparable performance to fully finetuning a model, despite having 1000x fewer parameters, and it performs even better in low-data settings. + +Take a look at [Prefix tuning for conditional generation](../task_guides/seq2seq-prefix-tuning) for a step-by-step guide on how to train a model with prefix tuning. + +## P-tuning + +
+ +
+Prompt tokens can be inserted anywhere in the input sequence, and they are optimized by a prompt encoder (image source). + +[P-tuning](https://hf.co/papers/2103.10385) is designed for natural language understanding (NLU) tasks and all language models. +It is another variation of a soft prompt method; P-tuning also adds a trainable embedding tensor that can be optimized to find better prompts, and it uses a prompt encoder (a bidirectional long-short term memory network or LSTM) to optimize the prompt parameters. Unlike prefix tuning though: + +- the prompt tokens can be inserted anywhere in the input sequence, and it isn't restricted to only the beginning +- the prompt tokens are only added to the input instead of adding them to every layer of the model +- introducing *anchor* tokens can improve performance because they indicate characteristics of a component in the input sequence + +The results suggest that P-tuning is more efficient than manually crafting prompts, and it enables GPT-like models to compete with BERT-like models on NLU tasks. + +Take a look at [P-tuning for sequence classification](../task_guides/ptuning-seq-classification) for a step-by-step guide on how to train a model with P-tuning. + +## Multitask prompt tuning + +
+ +
+Multitask prompt tuning enables parameter-efficient transfer learning. + +[Multitask prompt tuning (MPT)](https://hf.co/papers/2303.02861) learns a single prompt from data for multiple task types that can be shared for different target tasks. Other existing approaches learn a separate soft prompt for each task that need to be retrieved or aggregated for adaptation to target tasks. MPT consists of two stages: + +1. source training - for each task, its soft prompt is decomposed into task-specific vectors. The task-specific vectors are multiplied together to form another matrix W, and the Hadamard product is used between W and a shared prompt matrix P to generate a task-specific prompt matrix. The task-specific prompts are distilled into a single prompt matrix that is shared across all tasks. This prompt is trained with multitask training. +2. target adaptation - to adapt the single prompt for a target task, a target prompt is initialized and expressed as the Hadamard product of the shared prompt matrix and the task-specific low-rank prompt matrix. + +
+ +
+Prompt decomposition. + + +## Context-Aware Prompt Tuning (CPT) + +
+ +
+CPT optimizing only specific token embeddings while keeping the rest of the model frozen (image source). + +[Context-Aware Prompt Tuning (CPT)](https://huggingface.co/papers/2410.17222) is designed to enhance few-shot classification by refining only context embeddings. +This approach combines ideas from In-Context Learning (ICL), Prompt Tuning (PT), and adversarial optimization, focusing on making model adaptation both parameter-efficient and effective. +In CPT, only specific context token embeddings are optimized, while the rest of the model remains frozen. +To prevent overfitting and maintain stability, CPT uses controlled perturbations to limit the allowed changes to context embeddings within a defined range. +Additionally, to address the phenomenon of recency bias—where examples near the end of the context tend to be prioritized over earlier ones—CPT applies a decay loss factor. + +Take a look at [Example](https://github.com/huggingface/peft/blob/main/examples/cpt_finetuning/README.md) for a step-by-step guide on how to train a model with CPT. diff --git a/peft/docs/source/developer_guides/checkpoint.md b/peft/docs/source/developer_guides/checkpoint.md new file mode 100644 index 0000000000000000000000000000000000000000..ea05731e56f0ad3582ab4831584ea7fe1204c422 --- /dev/null +++ b/peft/docs/source/developer_guides/checkpoint.md @@ -0,0 +1,244 @@ + + +# PEFT checkpoint format + +This document describes how PEFT's checkpoint files are structured and how to convert between the PEFT format and other formats. + +## PEFT files + +PEFT (parameter-efficient fine-tuning) methods only update a small subset of a model's parameters rather than all of them. This is nice because checkpoint files can generally be much smaller than the original model files and are easier to store and share. However, this also means that to load a PEFT model, you need to have the original model available as well. 
+ +When you call [`~PeftModel.save_pretrained`] on a PEFT model, the PEFT model saves three files, described below: + +1. `adapter_model.safetensors` or `adapter_model.bin` + +By default, the model is saved in the `safetensors` format, a secure alternative to the `bin` format, which is known to be susceptible to [security vulnerabilities](https://huggingface.co/docs/hub/security-pickle) because it uses the pickle utility under the hood. Both formats store the same `state_dict` though, and are interchangeable. + +The `state_dict` only contains the parameters of the adapter module, not the base model. To illustrate the difference in size, a normal BERT model requires ~420MB of disk space, whereas an IA³ adapter on top of this BERT model only requires ~260KB. + +2. `adapter_config.json` + +The `adapter_config.json` file contains the configuration of the adapter module, which is necessary to load the model. Below is an example of an `adapter_config.json` for an IA³ adapter with standard settings applied to a BERT model: + +```json +{ + "auto_mapping": { + "base_model_class": "BertModel", + "parent_library": "transformers.models.bert.modeling_bert" + }, + "base_model_name_or_path": "bert-base-uncased", + "fan_in_fan_out": false, + "feedforward_modules": [ + "output.dense" + ], + "inference_mode": true, + "init_ia3_weights": true, + "modules_to_save": null, + "peft_type": "IA3", + "revision": null, + "target_modules": [ + "key", + "value", + "output.dense" + ], + "task_type": null +} +``` + +The configuration file contains: + +- the adapter module type stored, `"peft_type": "IA3"` +- information about the base model like `"base_model_name_or_path": "bert-base-uncased"` +- the revision of the model (if any), `"revision": null` + +If the base model is not a pretrained Transformers model, the latter two entries will be `null`. Other than that, the settings are all related to the specific IA³ adapter that was used to fine-tune the model. + +3. 
`README.md` + +The generated `README.md` is the model card of a PEFT model and contains a few pre-filled entries. The intent of this is to make it easier to share the model with others and to provide some basic information about the model. This file is not needed to load the model. + +## Convert to PEFT format + +When converting from another format to the PEFT format, we require both the `adapter_model.safetensors` (or `adapter_model.bin`) file and the `adapter_config.json` file. + +### adapter_model + +For the model weights, it is important to use the correct mapping from parameter name to value for PEFT to load the file. Getting this mapping right is an exercise in checking the implementation details, as there is no generally agreed upon format for PEFT adapters. + +Fortunately, figuring out this mapping is not overly complicated for common base cases. Let's look at a concrete example, the [`LoraLayer`](https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/layer.py): + +```python +# showing only part of the code + +class LoraLayer(BaseTunerLayer): + # All names of layers that may contain (trainable) adapter weights + adapter_layer_names = ("lora_A", "lora_B", "lora_embedding_A", "lora_embedding_B") + # All names of other parameters that may contain adapter-related parameters + other_param_names = ("r", "lora_alpha", "scaling", "lora_dropout") + + def __init__(self, base_layer: nn.Module, **kwargs) -> None: + self.base_layer = base_layer + self.r = {} + self.lora_alpha = {} + self.scaling = {} + self.lora_dropout = nn.ModuleDict({}) + self.lora_A = nn.ModuleDict({}) + self.lora_B = nn.ModuleDict({}) + # For Embedding layer + self.lora_embedding_A = nn.ParameterDict({}) + self.lora_embedding_B = nn.ParameterDict({}) + # Mark the weight as unmerged + self._disable_adapters = False + self.merged_adapters = [] + self.use_dora: dict[str, bool] = {} + self.lora_magnitude_vector: Optional[torch.nn.ParameterDict] = None # for DoRA + self._caches: dict[str, 
Any] = {} + self.kwargs = kwargs +``` + +In the `__init__` code used by all `LoraLayer` classes in PEFT, there are a bunch of parameters used to initialize the model, but only a few are relevant for the checkpoint file: `lora_A`, `lora_B`, `lora_embedding_A`, and `lora_embedding_B`. These parameters are listed in the class attribute `adapter_layer_names` and contain the learnable parameters, so they must be included in the checkpoint file. All the other parameters, like the rank `r`, are derived from the `adapter_config.json` and must be included there (unless the default value is used). + +Let's check the `state_dict` of a PEFT LoRA model applied to BERT. When printing the first five keys using the default LoRA settings (the remaining keys are the same, just with different layer numbers), we get: + +- `base_model.model.encoder.layer.0.attention.self.query.lora_A.weight` +- `base_model.model.encoder.layer.0.attention.self.query.lora_B.weight` +- `base_model.model.encoder.layer.0.attention.self.value.lora_A.weight` +- `base_model.model.encoder.layer.0.attention.self.value.lora_B.weight` +- `base_model.model.encoder.layer.1.attention.self.query.lora_A.weight` +- etc. + +Let's break this down: + +- By default, for BERT models, LoRA is applied to the `query` and `value` layers of the attention module. This is why you see `attention.self.query` and `attention.self.value` in the key names for each layer. +- LoRA decomposes the weights into two low-rank matrices, `lora_A` and `lora_B`. This is where `lora_A` and `lora_B` come from in the key names. +- These LoRA matrices are implemented as `nn.Linear` layers, so the parameters are stored in the `.weight` attribute (`lora_A.weight`, `lora_B.weight`). +- By default, LoRA isn't applied to BERT's embedding layer, so there are _no entries_ for `lora_embedding_A` and `lora_embedding_B`. +- The keys of the `state_dict` always start with `"base_model.model."`. 
The reason is that, in PEFT, we wrap the base model inside a tuner-specific model (`LoraModel` in this case), which itself is wrapped in a general PEFT model (`PeftModel`). For this reason, these two prefixes are added to the keys. When converting to the PEFT format, it is required to add these prefixes. + +> [!TIP] +> This last point is not true for prompt learning techniques like prompt tuning. There, the extra embeddings are directly stored in the `state_dict` without any prefixes added to the keys. + +When inspecting the parameter names in the loaded model, you might be surprised to find that they look a bit different, e.g. `base_model.model.encoder.layer.0.attention.self.query.lora_A.default.weight`. The difference is the *`.default`* part in the second to last segment. This part exists because PEFT generally allows the addition of multiple adapters at once (using an `nn.ModuleDict` or `nn.ParameterDict` to store them). For example, if you add another adapter called "other", the key for that adapter would be `base_model.model.encoder.layer.0.attention.self.query.lora_A.other.weight`. + +When you call [`~PeftModel.save_pretrained`], the adapter name is stripped from the keys. The reason is that the adapter name is not an important part of the model architecture; it is just an arbitrary name. When loading the adapter, you could choose a totally different name, and the model would still work the same way. This is why the adapter name is not stored in the checkpoint file. + +> [!TIP] +> If you call `save_pretrained("some/path")` and the adapter name is not `"default"`, the adapter is stored in a sub-directory with the same name as the adapter. So if the name is "other", it would be stored inside of `some/path/other`. + +In some circumstances, deciding which values to add to the checkpoint file can become a bit more complicated. For example, in PEFT, DoRA is implemented as a special case of LoRA. 
If you want to convert a DoRA model to PEFT, you should create a LoRA checkpoint with extra entries for DoRA. You can see this in the `__init__` of the previous `LoraLayer` code: + +```python +self.lora_magnitude_vector: Optional[torch.nn.ParameterDict] = None # for DoRA +``` + +This indicates that there is an optional extra parameter per layer for DoRA. + +### adapter_config + +All the other information needed to load a PEFT model is contained in the `adapter_config.json` file. Let's check this file for a LoRA model applied to BERT: + +```json +{ + "alpha_pattern": {}, + "auto_mapping": { + "base_model_class": "BertModel", + "parent_library": "transformers.models.bert.modeling_bert" + }, + "base_model_name_or_path": "bert-base-uncased", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 8, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query", + "value" + ], + "task_type": null, + "use_dora": false, + "use_rslora": false +} +``` + +This contains a lot of entries, and at first glance, it could feel overwhelming to figure out all the right values to put in there. However, most of the entries are not necessary to load the model. This is either because they use the default values and don't need to be added or because they only affect the initialization of the LoRA weights, which is irrelevant when it comes to loading the model. If you find that you don't know what a specific parameter does, e.g., `"use_rslora",` don't add it, and you should be fine. Also note that as more options are added, this file will get more entries in the future, but it should be backward compatible. 
+ +At the minimum, you should include the following entries: + +```json +{ + "target_modules": ["query", "value"], + "peft_type": "LORA" +} +``` + +However, adding as many entries as possible, like the rank `r` or the `base_model_name_or_path` (if it's a Transformers model), is recommended. This information can help others understand the model better and share it more easily. To check which keys and values are expected, check out the [config.py](https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py) file (as an example, this is the config file for LoRA) in the PEFT source code. + +## Model storage + +In some circumstances, you might want to store the whole PEFT model, including the base weights. This can be necessary if, for instance, the base model is not available to the users trying to load the PEFT model. You can merge the weights first or convert it into a Transformers model. + +### Merge the weights + +The most straightforward way to store the whole PEFT model is to merge the adapter weights into the base weights: + +```python +merged_model = model.merge_and_unload() +merged_model.save_pretrained(...) +``` + +There are some disadvantages to this approach, though: + +- Once [`~LoraModel.merge_and_unload`] is called, you get a basic model without any PEFT-specific functionality. This means you can't use any of the PEFT-specific methods anymore. +- You cannot unmerge the weights, load multiple adapters at once, disable the adapter, etc. +- Not all PEFT methods support merging weights. +- Some PEFT methods may generally allow merging, but not with specific settings (e.g. when using certain quantization techniques). +- The whole model will be much larger than the PEFT model, as it will contain all the base weights as well. + +But inference with a merged model should be a bit faster. 
+ +### Convert to a Transformers model + +Another way to save the whole model, assuming the base model is a Transformers model, is to use this hacky approach to directly insert the PEFT weights into the base model and save it, which only works if you "trick" Transformers into believing the PEFT model is not a PEFT model. This only works with LoRA because other adapters are not implemented in Transformers. + +```python +model = ... # the PEFT model +... +# after you finish training the model, save it in a temporary location +model.save_pretrained() +# now load this model directly into a transformers model, without the PEFT wrapper +# the PEFT weights are directly injected into the base model +model_loaded = AutoModel.from_pretrained() +# now make the loaded model believe that it is _not_ a PEFT model +model_loaded._hf_peft_config_loaded = False +# now when we save it, it will save the whole model +model_loaded.save_pretrained() +# or upload to Hugging Face Hub +model_loaded.push_to_hub() +``` + diff --git a/peft/docs/source/developer_guides/contributing.md b/peft/docs/source/developer_guides/contributing.md new file mode 100644 index 0000000000000000000000000000000000000000..40f8582290034e29f7065469f857500c775dc0b7 --- /dev/null +++ b/peft/docs/source/developer_guides/contributing.md @@ -0,0 +1,96 @@ + + +# Contribute to PEFT + +We are happy to accept contributions to PEFT. If you plan to contribute, please read this to make the process as smooth as possible. + +## Installation + +For code contributions to PEFT, you should choose the ["source"](../install#source) installation method. + +If you are new to creating a pull request, follow the [Creating a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) guide by GitHub. 
+ +## Tests and code quality checks + +Regardless of the contribution type (unless it’s only about the docs), you should run tests and code quality checks before creating a PR to ensure your contribution doesn’t break anything and follows the project standards. + +We provide a Makefile to execute the necessary tests. Run the code below for the unit test: + +```sh +make test +``` + +Run one of the following to either only check or check and fix code quality and style: + +```sh +make quality # just check +make style # check and fix +``` + +You can also set up [`pre-commit`](https://pre-commit.com/) to run these fixes +automatically as Git commit hooks. + +```bash +$ pip install pre-commit +$ pre-commit install +``` + +Running all the tests can take a while, so during development it can be more efficient to only [run tests specific to your change](https://docs.pytest.org/en/6.2.x/usage.html#specifying-tests-selecting-tests), e.g. via: + +```sh +pytest tests/ -k +``` + +This should finish much quicker and allow for faster iteration. + +If your change is specific to a hardware setting (e.g., it requires CUDA), take a look at [tests/test_gpu_examples.py](https://github.com/huggingface/peft/blob/1c1c7fdaa6e6abaa53939b865dee1eded82ad032/tests/test_gpu_examples.py) and [tests/test_common_gpu.py](https://github.com/huggingface/peft/blob/1c1c7fdaa6e6abaa53939b865dee1eded82ad032/tests/test_common_gpu.py) to see if it makes sense to add tests there. If your change could have an effect on saving and loading models, please run the tests with the `--regression` flag to trigger regression tests. + +It can happen that while you’re working on your PR, the underlying code base changes due to other changes being merged. If that happens – especially when there is a merge conflict – please update your branch with the latest changes. This can be a merge or a rebase, and we'll squash and merge the PR once it’s ready. If possible, avoid force pushes to make reviews easier. 
+ +## PR description + +When opening a PR, please provide a nice description of the change you're proposing. If it relates to other issues or PRs, please reference them. Providing a good description not only helps the reviewers review your code better and faster, it can also be used later (as a basis) for the commit message which helps with long term maintenance of the project. + +If your code makes some non-trivial changes, it may also be a good idea to add comments to the code to explain those changes. For example, if you had to iterate on your implementation multiple times because the most obvious way didn’t work, it’s a good indication that a code comment is needed. + +## Bugfixes + +Please give a description of the circumstances that led to the bug. If there is an existing issue, please link to it (e.g., “Resolves #12345”). + +Ideally when a bugfix is provided, it should be accompanied by a test for the bug. The test should fail with the current code and pass with the bugfix. Add a comment to the test that references the issue or PR. Without a test, it is more difficult to prevent regressions in the future. + +## Add a new fine-tuning method + +New parameter-efficient fine-tuning methods are developed all the time. If you would like to add a new and promising method to PEFT, please follow these steps. + +1. Before you start to implement the new method, please open a [GitHub issue](https://github.com/huggingface/peft/issues) with your proposal. This way, the maintainers can give you some early feedback. +2. Please add a link to the source (usually a paper) of the method. The paper should be in a final state to avoid changing requirements during development (e.g. due to reviewer feedback). +3. When implementing the method, it makes sense to look for existing implementations as a guide. Moreover, when you structure your code, please take inspiration from the other PEFT methods. 
For example, if your method is similar to LoRA, it makes sense to structure your code similarly or even reuse some functions or classes where it makes sense (some code duplication is okay, but don’t overdo it). +4. Ideally, in addition to the implementation of the new method, there should also be + - [examples](https://github.com/huggingface/peft/tree/main/examples) (notebooks, scripts) + - [documentation](https://github.com/huggingface/peft/tree/main/docs/source) + - [extensive test suite](https://github.com/huggingface/peft/tree/main/tests) that proves the method correctly integrates with PEFT + - [experimental setup](https://github.com/huggingface/peft/tree/main/method_comparison#creating-new-experiments) to run benchmarks +5. Once you have something that seems to be working, don’t hesitate to create a draft PR even if it’s not in a mergeable state yet. The maintainers are happy to give you feedback and guidance along the way. + +## Add other features + +It is best if you first open an issue on GitHub with a proposal to add the new feature. This way, you can discuss with the maintainers if it makes sense to add the feature before spending too much time on implementing it. + +New features should generally be accompanied by tests and documentation or examples. Without the latter, users will have a hard time discovering your cool new feature. + +Changes to the code should be implemented in a backward-compatible way. For example, existing code should continue to work the same way after the feature is merged. diff --git a/peft/docs/source/developer_guides/custom_models.md b/peft/docs/source/developer_guides/custom_models.md new file mode 100644 index 0000000000000000000000000000000000000000..b31f3be91705293bc596997d04925eef2e50ea62 --- /dev/null +++ b/peft/docs/source/developer_guides/custom_models.md @@ -0,0 +1,304 @@ + + +# Custom models + +Some fine-tuning techniques, such as prompt tuning, are specific to language models. 
That means in 🤗 PEFT, it is +assumed a 🤗 Transformers model is being used. However, other fine-tuning techniques - like +[LoRA](../conceptual_guides/lora) - are not restricted to specific model types. + +In this guide, we will see how LoRA can be applied to a multilayer perceptron, a computer vision model from the [timm](https://huggingface.co/docs/timm/index) library, or a new 🤗 Transformers architecture. + +## Multilayer perceptron + +Let's assume that we want to fine-tune a multilayer perceptron with LoRA. Here is the definition: + +```python +from torch import nn + + +class MLP(nn.Module): + def __init__(self, num_units_hidden=2000): + super().__init__() + self.seq = nn.Sequential( + nn.Linear(20, num_units_hidden), + nn.ReLU(), + nn.Linear(num_units_hidden, num_units_hidden), + nn.ReLU(), + nn.Linear(num_units_hidden, 2), + nn.LogSoftmax(dim=-1), + ) + + def forward(self, X): + return self.seq(X) +``` + +This is a straightforward multilayer perceptron with an input layer, a hidden layer, and an output layer. + +> [!TIP] +> For this toy example, we choose an exceedingly large number of hidden units to highlight the efficiency gains +> from PEFT, but those gains are in line with more realistic examples. + +There are a few linear layers in this model that could be tuned with LoRA. When working with common 🤗 Transformers +models, PEFT will know which layers to apply LoRA to, but in this case, it is up to us as a user to choose the layers. 
+To determine the names of the layers to tune: + +```python +print([(n, type(m)) for n, m in MLP().named_modules()]) +``` + +This should print: + +``` +[('', __main__.MLP), + ('seq', torch.nn.modules.container.Sequential), + ('seq.0', torch.nn.modules.linear.Linear), + ('seq.1', torch.nn.modules.activation.ReLU), + ('seq.2', torch.nn.modules.linear.Linear), + ('seq.3', torch.nn.modules.activation.ReLU), + ('seq.4', torch.nn.modules.linear.Linear), + ('seq.5', torch.nn.modules.activation.LogSoftmax)] +``` + +Let's say we want to apply LoRA to the input layer and to the hidden layer, those are `'seq.0'` and `'seq.2'`. Moreover, +let's assume we want to update the output layer without LoRA, that would be `'seq.4'`. The corresponding config would +be: + +```python +from peft import LoraConfig + +config = LoraConfig( + target_modules=["seq.0", "seq.2"], + modules_to_save=["seq.4"], +) +``` + +With that, we can create our PEFT model and check the fraction of parameters trained: + +```python +from peft import get_peft_model + +model = MLP() +peft_model = get_peft_model(model, config) +peft_model.print_trainable_parameters() +# prints trainable params: 56,164 || all params: 4,100,164 || trainable%: 1.369798866581922 +``` + +Finally, we can use any training framework we like, or write our own fit loop, to train the `peft_model`. + +For a complete example, check out [this notebook](https://github.com/huggingface/peft/blob/main/examples/multilayer_perceptron/multilayer_perceptron_lora.ipynb). + +## timm models + +The [timm](https://huggingface.co/docs/timm/index) library contains a large number of pretrained computer vision models. +Those can also be fine-tuned with PEFT. Let's check out how this works in practice. + +To start, ensure that timm is installed in the Python environment: + +```bash +python -m pip install -U timm +``` + +Next we load a timm model for an image classification task: + +```python +import timm + +num_classes = ... 
+model_id = "timm/poolformer_m36.sail_in1k" +model = timm.create_model(model_id, pretrained=True, num_classes=num_classes) +``` + +Again, we need to make a decision about what layers to apply LoRA to. Since LoRA supports 2D conv layers, and since +those are a major building block of this model, we should apply LoRA to the 2D conv layers. To identify the names of +those layers, let's look at all the layer names: + +```python +print([(n, type(m)) for n, m in model.named_modules()]) +``` + +This will print a very long list, we'll only show the first few: + +``` +[('', timm.models.metaformer.MetaFormer), + ('stem', timm.models.metaformer.Stem), + ('stem.conv', torch.nn.modules.conv.Conv2d), + ('stem.norm', torch.nn.modules.linear.Identity), + ('stages', torch.nn.modules.container.Sequential), + ('stages.0', timm.models.metaformer.MetaFormerStage), + ('stages.0.downsample', torch.nn.modules.linear.Identity), + ('stages.0.blocks', torch.nn.modules.container.Sequential), + ('stages.0.blocks.0', timm.models.metaformer.MetaFormerBlock), + ('stages.0.blocks.0.norm1', timm.layers.norm.GroupNorm1), + ('stages.0.blocks.0.token_mixer', timm.models.metaformer.Pooling), + ('stages.0.blocks.0.token_mixer.pool', torch.nn.modules.pooling.AvgPool2d), + ('stages.0.blocks.0.drop_path1', torch.nn.modules.linear.Identity), + ('stages.0.blocks.0.layer_scale1', timm.models.metaformer.Scale), + ('stages.0.blocks.0.res_scale1', torch.nn.modules.linear.Identity), + ('stages.0.blocks.0.norm2', timm.layers.norm.GroupNorm1), + ('stages.0.blocks.0.mlp', timm.layers.mlp.Mlp), + ('stages.0.blocks.0.mlp.fc1', torch.nn.modules.conv.Conv2d), + ('stages.0.blocks.0.mlp.act', torch.nn.modules.activation.GELU), + ('stages.0.blocks.0.mlp.drop1', torch.nn.modules.dropout.Dropout), + ('stages.0.blocks.0.mlp.norm', torch.nn.modules.linear.Identity), + ('stages.0.blocks.0.mlp.fc2', torch.nn.modules.conv.Conv2d), + ('stages.0.blocks.0.mlp.drop2', torch.nn.modules.dropout.Dropout), + 
('stages.0.blocks.0.drop_path2', torch.nn.modules.linear.Identity), + ('stages.0.blocks.0.layer_scale2', timm.models.metaformer.Scale), + ('stages.0.blocks.0.res_scale2', torch.nn.modules.linear.Identity), + ('stages.0.blocks.1', timm.models.metaformer.MetaFormerBlock), + ('stages.0.blocks.1.norm1', timm.layers.norm.GroupNorm1), + ('stages.0.blocks.1.token_mixer', timm.models.metaformer.Pooling), + ('stages.0.blocks.1.token_mixer.pool', torch.nn.modules.pooling.AvgPool2d), + ... + ('head.global_pool.flatten', torch.nn.modules.linear.Identity), + ('head.norm', timm.layers.norm.LayerNorm2d), + ('head.flatten', torch.nn.modules.flatten.Flatten), + ('head.drop', torch.nn.modules.linear.Identity), + ('head.fc', torch.nn.modules.linear.Linear)] + ] +``` + +Upon closer inspection, we see that the 2D conv layers have names such as `"stages.0.blocks.0.mlp.fc1"` and +`"stages.0.blocks.0.mlp.fc2"`. How can we match those layer names specifically? You can write a [regular +expression](https://docs.python.org/3/library/re.html) to match the layer names. For our case, the regex +`r".*\.mlp\.fc\d"` should do the job. + +Furthermore, as in the first example, we should ensure that the output layer, in this case the classification head, is +also updated. Looking at the end of the list printed above, we can see that it's named `'head.fc'`. With that in mind, +here is our LoRA config: + +```python +config = LoraConfig(target_modules=r".*\.mlp\.fc\d", modules_to_save=["head.fc"]) +``` + +Then we only need to create the PEFT model by passing our base model and the config to `get_peft_model`: + +```python +peft_model = get_peft_model(model, config) +peft_model.print_trainable_parameters() +# prints trainable params: 1,064,454 || all params: 56,467,974 || trainable%: 1.88505789139876 +``` + +This shows us that we only need to train less than 2% of all parameters, which is a huge efficiency gain. 
+ +For a complete example, check out [this notebook](https://github.com/huggingface/peft/blob/main/examples/image_classification/image_classification_timm_peft_lora.ipynb). + +## New transformers architectures + +When new popular transformers architectures are released, we do our best to quickly add them to PEFT. If you come across a transformers model that is not supported out of the box, don't worry, it will most likely still work if the config is set correctly. Specifically, you have to identify the layers that should be adapted and set them correctly when initializing the corresponding config class, e.g. `LoraConfig`. Here are some tips to help with this. + +As a first step, it is a good idea to check the existing models for inspiration. You can find them inside of [constants.py](https://github.com/huggingface/peft/blob/main/src/peft/utils/constants.py) in the PEFT repository. Often, you'll find a similar architecture that uses the same names. For example, if the new model architecture is a variation of the "mistral" model and you want to apply LoRA, you can see that the entry for "mistral" in `TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING` contains `["q_proj", "v_proj"]`. This tells you that for "mistral" models, the `target_modules` for LoRA should be `["q_proj", "v_proj"]`: + +```python +from peft import LoraConfig, get_peft_model + +my_mistral_model = ... +config = LoraConfig( + target_modules=["q_proj", "v_proj"], + ..., # other LoRA arguments +) +peft_model = get_peft_model(my_mistral_model, config) +``` + +If that doesn't help, check the existing modules in your model architecture with the `named_modules` method and try to identify the attention layers, especially the key, query, and value layers. Those will often have names such as `c_attn`, `query`, `q_proj`, etc. The key layer is not always adapted, and ideally, you should check whether including it results in better performance. 
+ +Additionally, linear layers are common targets to be adapted (e.g. in [QLoRA paper](https://huggingface.co/papers/2305.14314), authors suggest to adapt them as well). Their names will often contain the strings `fc` or `dense`. + +If you want to add a new model to PEFT, please create an entry in [constants.py](https://github.com/huggingface/peft/blob/main/src/peft/utils/constants.py) and open a pull request on the [repository](https://github.com/huggingface/peft/pulls). Don't forget to update the [README](https://github.com/huggingface/peft#models-support-matrix) as well. + +## Verify parameters and layers + +You can verify whether you've correctly applied a PEFT method to your model in a few ways. + +* Check the fraction of parameters that are trainable with the [`~PeftModel.print_trainable_parameters`] method. If this number is lower or higher than expected, check the model `repr` by printing the model. This shows the names of all the layer types in the model. Ensure that only the intended target layers are replaced by the adapter layers. For example, if LoRA is applied to `nn.Linear` layers, then you should only see `lora.Linear` layers being used. + +```py +peft_model.print_trainable_parameters() +``` + +* Another way you can view the adapted layers is to use the `targeted_module_names` attribute to list the name of each module that was adapted. + +```python +print(peft_model.targeted_module_names) +``` + +## Unsupported module types + +Methods like LoRA only work if the target modules are supported by PEFT. For example, it's possible to apply LoRA to `nn.Linear` and `nn.Conv2d` layers, but not, for instance, to `nn.LSTM`. 
If you find a layer class you want to apply PEFT to is not supported, you can: + + - define a custom mapping to dynamically dispatch custom modules in LoRA + - open an [issue](https://github.com/huggingface/peft/issues) and request the feature where maintainers will implement it or guide you on how to implement it yourself if demand for this module type is sufficiently high + +### Experimental support for dynamic dispatch of custom modules in LoRA + +> [!WARNING] +> This feature is experimental and subject to change, depending on its reception by the community. We will introduce a public and stable API if there is significant demand for it. + +PEFT supports an experimental API for custom module types for LoRA. Let's assume you have a LoRA implementation for LSTMs. Normally, you would not be able to tell PEFT to use it, even if it would theoretically work with PEFT. However, this is possible with dynamic dispatch of custom layers. + +The experimental API currently looks like this: + +```python +class MyLoraLSTMLayer: + ... + +base_model = ... # load the base model that uses LSTMs + +# add the LSTM layer names to target_modules +config = LoraConfig(..., target_modules=["lstm"]) +# define a mapping from base layer type to LoRA layer type +custom_module_mapping = {nn.LSTM: MyLoraLSTMLayer} +# register the new mapping +config._register_custom_module(custom_module_mapping) +# after registration, create the PEFT model +peft_model = get_peft_model(base_model, config) +# do training +``` + +> [!TIP] +> When you call [`get_peft_model`], you will see a warning because PEFT does not recognize the targeted module type. In this case, you can ignore this warning. + +By supplying a custom mapping, PEFT first checks the base model's layers against the custom mapping and dispatches to the custom LoRA layer type if there is a match. If there is no match, PEFT checks the built-in LoRA layer types for a match. 
+ +Therefore, this feature can also be used to override existing dispatch logic, e.g. if you want to use your own LoRA layer for `nn.Linear` instead of using the one provided by PEFT. + +When creating your custom LoRA module, please follow the same rules as the [existing LoRA modules](https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/layer.py). Some important constraints to consider: + +- The custom module should inherit from `nn.Module` and `peft.tuners.lora.layer.LoraLayer`. +- The `__init__` method of the custom module should have the positional arguments `base_layer` and `adapter_name`. After this, there are additional `**kwargs` that you are free to use or ignore. +- The learnable parameters should be stored in an `nn.ModuleDict` or `nn.ParameterDict`, where the key corresponds to the name of the specific adapter (remember that a model can have more than one adapter at a time). +- The name of these learnable parameter attributes should start with `"lora_"`, e.g. `self.lora_new_param = ...`. +- Some methods are optional, e.g. you only need to implement `merge` and `unmerge` if you want to support weight merging. + +Currently, the information about the custom module does not persist when you save the model. When loading the model, you have to register the custom modules again. + +```python +# saving works as always and includes the parameters of the custom modules +peft_model.save_pretrained() + +# loading the model later: +base_model = ... 
+# load the LoRA config that you saved earlier +config = LoraConfig.from_pretrained() +# register the custom module again, the same way as the first time +custom_module_mapping = {nn.LSTM: MyLoraLSTMLayer} +config._register_custom_module(custom_module_mapping) +# pass the config instance to from_pretrained: +peft_model = PeftModel.from_pretrained(base_model, tmp_path / "lora-custom-module", config=config) +``` + +If you use this feature and find it useful, or if you encounter problems, let us know by creating an issue or a discussion on GitHub. This allows us to estimate the demand for this feature and add a public API if it is sufficiently high. diff --git a/peft/docs/source/developer_guides/lora.md b/peft/docs/source/developer_guides/lora.md new file mode 100644 index 0000000000000000000000000000000000000000..51640a3dffa3a6713a767322909535f8f1a1bb71 --- /dev/null +++ b/peft/docs/source/developer_guides/lora.md @@ -0,0 +1,822 @@ + + +# LoRA + +LoRA is a low-rank decomposition method to reduce the number of trainable parameters which speeds up finetuning large models and uses less memory. In PEFT, using LoRA is as easy as setting up a [`LoraConfig`] and wrapping it with [`get_peft_model`] to create a trainable [`PeftModel`]. + +This guide explores in more detail other options and features for using LoRA. + +## Initialization + +The initialization of LoRA weights is controlled by the parameter `init_lora_weights` in [`LoraConfig`]. By default, PEFT initializes LoRA weights with Kaiming-uniform for weight A and zeros for weight B resulting in an identity transform (same as the reference [implementation](https://github.com/microsoft/LoRA)). + +It is also possible to pass `init_lora_weights="gaussian"`. As the name suggests, this initializes weight A with a Gaussian distribution and zeros for weight B (this is how [Diffusers](https://huggingface.co/docs/diffusers/index) initializes LoRA weights). 
+ +```py +from peft import LoraConfig + +config = LoraConfig(init_lora_weights="gaussian", ...) +``` + +There is also an option to set `init_lora_weights=False` which is useful for debugging and testing. This should be the only time you use this option. When choosing this option, the LoRA weights are initialized such that they do *not* result in an identity transform. + +```py +from peft import LoraConfig + +config = LoraConfig(init_lora_weights=False, ...) +``` + +### PiSSA +[PiSSA](https://huggingface.co/papers/2404.02948) initializes the LoRA adapter using the principal singular values and singular vectors. This straightforward modification allows PiSSA to converge more rapidly than LoRA and ultimately attain superior performance. Moreover, PiSSA reduces the quantization error compared to QLoRA, leading to further enhancements. + +Configure the initialization method to "pissa", which may take several minutes to execute SVD on the pre-trained model: +```python +from peft import LoraConfig +config = LoraConfig(init_lora_weights="pissa", ...) +``` +Alternatively, execute fast SVD, which takes only a few seconds. The number of iterations determines the trade-off between the error and computation time: +```python +lora_config = LoraConfig(init_lora_weights="pissa_niter_[number of iters]", ...) +``` +For detailed instruction on using PiSSA, please follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/pissa_finetuning). + +### CorDA + +[CorDA](https://huggingface.co/papers/2406.05223) builds task-aware LoRA adapters from weight decomposition oriented by the context of downstream task to learn (instruction-previewed mode, IPM) or world knowledge to maintain (knowledge-preserved mode, KPM). +The KPM not only achieves better performance than LoRA on fine-tuning tasks, but also mitigates the catastrophic forgetting of pre-trained world knowledge. 
+When preserving pre-trained knowledge is not a concern, +the IPM is favored because it can further accelerate convergence and enhance the fine-tuning performance. + +You need to configure the initialization method to "corda", and specify the mode of IPM or KPM and the dataset to collect covariance matrices. + +```py +@torch.no_grad() +def run_model(): + # Assume `model` and `dataset` is in context... + model.eval() + for batch in dataset: + model(**batch) + + +corda_config = CordaConfig( + corda_method="kpm", +) +lora_config = LoraConfig( + init_lora_weights="corda", + corda_config=corda_config, +) +preprocess_corda(model, lora_config, run_model=run_model) +peft_model = get_peft_model(model, lora_config) +``` + +For detailed instruction on using CorDA, please follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/corda_finetuning). + +### OLoRA +[OLoRA](https://huggingface.co/papers/2406.01775) utilizes QR decomposition to initialize the LoRA adapters. OLoRA translates the base weights of the model by a factor of their QR decompositions, i.e., it mutates the weights before performing any training on them. This approach significantly improves stability, accelerates convergence speed, and ultimately achieves superior performance. + +You just need to pass a single additional option to use OLoRA: +```python +from peft import LoraConfig +config = LoraConfig(init_lora_weights="olora", ...) +``` +For more advanced usage, please refer to our [documentation](https://github.com/huggingface/peft/tree/main/examples/olora_finetuning). + +### EVA +[EVA](https://huggingface.co/papers/2410.07170) performs SVD on the input activations of each layer and uses the right-singular vectors to initialize LoRA weights. It is therefore a data-driven initialization scheme. Furthermore EVA adaptively allocates ranks across layers based on their "explained variance ratio" - a metric derived from the SVD analysis. 
+ +You can use EVA by setting `init_lora_weights="eva"` and defining [`EvaConfig`] in [`LoraConfig`]: +```python +from peft import LoraConfig, EvaConfig +peft_config = LoraConfig( + init_lora_weights = "eva", + eva_config = EvaConfig(rho = 2.0), + ... +) +``` +The parameter `rho` (≥ 1.0) determines how much redistribution is allowed. When `rho=1.0` and `r=16`, LoRA adapters are limited to exactly 16 ranks, preventing any redistribution from occurring. A recommended value for EVA with redistribution is 2.0, meaning the maximum rank allowed for a layer is 2r. + +It is recommended to perform EVA initialization on an accelerator(e.g. CUDA GPU, Intel XPU) as it is much faster. To optimize the amount of available memory for EVA, you can use the `low_cpu_mem_usage` flag in [`get_peft_model`]: +```python +peft_model = get_peft_model(model, peft_config, low_cpu_mem_usage=True) +``` +Then, call [`initialize_lora_eva_weights`] to initialize the EVA weights (in most cases the dataloader used for eva initialization can be the same as the one used for finetuning): +```python +initialize_lora_eva_weights(peft_model, dataloader) +``` +EVA works out of the box with bitsandbytes. Simply initialize the model with `quantization_config` and call [`initialize_lora_eva_weights`] as usual. + +> [!TIP] +> For further instructions on using EVA, please refer to our [documentation](https://github.com/huggingface/peft/tree/main/examples/eva_finetuning). + +### LoftQ + +#### Standard approach + +When quantizing the base model for QLoRA training, consider using the [LoftQ initialization](https://huggingface.co/papers/2310.08659), which has been shown to improve performance when training quantized models. The idea is that the LoRA weights are initialized such that the quantization error is minimized. To use LoftQ, follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/loftq_finetuning). 
+ +In general, for LoftQ to work best, it is recommended to target as many layers with LoRA as possible, since those not targeted cannot have LoftQ applied. This means that passing `LoraConfig(..., target_modules="all-linear")` will most likely give the best results. Also, you should use `nf4` as quant type in your quantization config when using 4bit quantization, i.e. `BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")`. + +#### A more convenient way + +An easier but more limited way to apply LoftQ initialization is to use the convenience function `replace_lora_weights_loftq`. This takes the quantized PEFT model as input and replaces the LoRA weights in-place with their LoftQ-initialized counterparts. + +```python +from peft import replace_lora_weights_loftq +from transformers import BitsAndBytesConfig + +bnb_config = BitsAndBytesConfig(load_in_4bit=True, ...) +base_model = AutoModelForCausalLM.from_pretrained(..., quantization_config=bnb_config) +# note: don't pass init_lora_weights="loftq" or loftq_config! +lora_config = LoraConfig(task_type="CAUSAL_LM") +peft_model = get_peft_model(base_model, lora_config) +replace_lora_weights_loftq(peft_model) +``` + +`replace_lora_weights_loftq` also allows you to pass a `callback` argument to give you more control over which layers should be modified or not, which empirically can improve the results quite a lot. To see a more elaborate example of this, check out [this notebook](https://github.com/huggingface/peft/blob/main/examples/loftq_finetuning/LoftQ_weight_replacement.ipynb). + +`replace_lora_weights_loftq` implements only one iteration step of LoftQ. This means that only the LoRA weights are updated, instead of iteratively updating LoRA weights and quantized base model weights. This may lead to lower performance but has the advantage that we can use the original quantized weights derived from the base model, instead of having to keep an extra copy of modified quantized weights. 
Whether this tradeoff is worthwhile depends on the use case. + +At the moment, `replace_lora_weights_loftq` has these additional limitations: + +- Model files must be stored as a `safetensors` file. +- Only bitsandbytes 4bit quantization is supported. + +> [!TIP] +> Learn more about how PEFT works with quantization in the [Quantization](quantization) guide. + +### Rank-stabilized LoRA + +Another way to initialize [`LoraConfig`] is with the [rank-stabilized LoRA (rsLoRA)](https://huggingface.co/papers/2312.03732) method. The LoRA architecture scales each adapter during every forward pass by a fixed scalar which is set at initialization and depends on the rank `r`. The scalar is given by `lora_alpha/r` in the original implementation, but rsLoRA uses `lora_alpha/math.sqrt(r)` which stabilizes the adapters and increases the performance potential from using a higher `r`. + +```py +from peft import LoraConfig + +config = LoraConfig(use_rslora=True, ...) +``` +### Activated LoRA (aLoRA) + +Activated LoRA (aLoRA) is a low rank adapter architecture for Causal LMs that allows for reusing existing base model KV cache for more efficient inference. This approach is best suited for inference pipelines which rely on the base model for most tasks/generations, but use aLoRA adapter(s) to perform specialized task(s) within the chain. For example, checking or correcting generated outputs of the base model. In these settings, inference times can be sped up by an order of magnitude or more. For more information on aLoRA and many example use cases, see https://huggingface.co/papers/2504.12397. + +This technique scans for the last occurrence of an invocation sequence (`alora_invocation_tokens`) in each input (this can be as short as 1 token), and activates the adapter weights on tokens starting with the beginning of the invocation sequence (any inputs after the invocation sequence are also adapted, and all generated tokens will use the adapted weights). 
Weights on prior tokens are left un-adapted -- making the cache for those tokens interchangeable with base model cache due to the causal attention mask in Causal LMs. Usage is very similar to standard LoRA, with the key difference that this invocation sequence must be specified when the adapter is created: + +```py +from peft import LoraConfig + +config = LoraConfig(alora_invocation_tokens=alora_invocation_tokens, task_type="CAUSAL_LM", ...) +``` + +where `alora_invocation_tokens` is a list of integer token ids. Given a desired invocation string, this can be obtained as +``` +invocation_string = "placeholder" +alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False) +``` +where the tokenizer is the tokenizer for the base model. Note that we have `add_special_tokens=False` to avoid adding SOS/EOS tokens in our search string (which would most likely cause the search for the invocation sequence to fail). + +**Notes** +* aLoRA is only supported for `task_type=CAUSAL_LM` tasks due to its focus on cache reuse. +* Since the weights are adapted on fewer tokens, often (not always) aLoRA requires higher rank (`r`) than LoRA. `r=32` can be a good starting point. +* aLoRA weights cannot be merged into the base model by definition, since the adapter weights are selectively applied to a subset of tokens. Attempts to merge will throw errors. +* Beam search is not yet supported. +* It is generally not recommended to add new tokens to the tokenizer that are not present in the base model, as this can complicate the target use case of both the base model and adapter model operating on overlapping context. That said, there is a possible workaround by first efficiently adding [trainable tokens](https://huggingface.co/docs/peft/en/package_reference/trainable_tokens) to the base model prior to training the adapter. + +#### Choice of invocation sequence and SFT design + +Each input must have the `alora_invocation_tokens` sequence present; it is not added automatically. 
To maximize model performance without compromising cache reuse, it is recommended to have the adapter weights activated early, i.e. at the start of any adapter-specific prompting, but after any long inputs such as prior generations or documents. As with any model, +formatting should be consistent between train and test. + +Consider the following example, where the base model has a chat template, +and the goal is to train the adapter to generate a desired output. + +* Option 1: If there is no task-specific prompt, i.e. the input is a chat history with the `assistant` prompt, then the chat template's `assistant` prompt (e.g. `<|start_of_role|>assistant<|end_of_role|>`) is a natural choice for the invocation string. See the model's chat template to find the prompt for the model. +* Option 2: If there is a task-specific prompt for the adapter that describes the task the adapter is learning, and that prompt is put as a `user` turn immediately prior to the generation, then the chat template's `user` prompt (e.g. `<|start_of_role|>user<|end_of_role|>`) is a natural choice for the invocation string. + +Once you have decided on an invocation string, get the model tokenizer and obtain `alora_invocation_tokens` as +``` +alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False) +``` + +An example inference setup is at [alora finetuning](https://github.com/huggingface/peft/blob/main/examples/alora_finetuning/alora_finetuning.py). + +**Note** If using custom strings for the invocation string, make sure that the start and end of the string are special tokens to avoid issues with tokenization at the boundaries. + +To see why, imagine that 'a', 'b', 'c', and 'ab' are tokens in your tokenizer (numbers 1, 2, 3, 4 respectively). Suppose that your alora_invocation_tokens = [2, 3]. Now imagine your input string is "abc". Because "ab" is a token, this will get tokenized as [4,3]. 
So the alora_invocation_tokens will fail to be found, despite the string "bc" being in it. If the start and end of the invocation string are special tokens, however, this failure case will never happen since special tokens are never tokenized into the same token with other characters. + +#### Using (and reusing) cache for generation +The main purpose of Activated LoRA is to make KV cache interchangeable between the base model and aLoRA adapter models **prior to the invocation sequence** since base and adapted KV values are not compatible. Specifically, keys and values stored during one model generation can be used in subsequent generations to avoid expensive prefill operations for context tokens. When sharing cache between the base model and aLoRA adapters, there are 2 main patterns: +1. The base model has generated something, and an aLoRA adapter is then called to do a followup generation. Example: the base model answers a question, and an aLoRA trained to detect hallucinations checks the base model response. +2. An aLoRA adapter has generated something, and the base model or a different aLoRA adapter is called to do a followup generation where there is partial context overlap with the original aLoRA. Example: The user provides a query, and an aLoRA rewrites the query to be more self-contained and improve retrieval in a RAG system. Then, documents are retrieved and loaded into context, an aLoRA checks if these documents are indeed relevant to the question, and then the base model generates an answer. + + +To demonstrate the above behaviors when using caching, we're using [DynamicCache](https://huggingface.co/docs/transformers/en/kv_cache) from `transformers`. Care must be taken to ensure that adapted cache values are not mixed with base cache values. In particular, an extra step is required for sharing the cache when there is partial context overlap (pattern 2). 
+ +**Pattern 1: Base model followed by aLoRA** Here, the entire input and generation from the base model is input into the aLoRA adapter, along with the invocation sequence: +``` +from transformers import DynamicCache +... +cache = DynamicCache() +inputs_base = tokenizer(prompt_base, return_tensors="pt") +# Generate from base model and save cache +with model_alora.disable_adapter(): + output = model_alora.generate(inputs_base["input_ids"].to(device),attention_mask=inputs_base["attention_mask"].to(device),past_key_values = cache,return_dict_in_generate=True) +output_text_base = tokenizer.decode(output.sequences[0]) +cache = output.past_key_values + +# Generate with aLoRA adapter from cache +prompt_alora = output_text_base + INVOCATION_STRING +inputs_alora = tokenizer(prompt_alora, return_tensors="pt").to(device) +output = model_alora.generate(**inputs_alora, past_key_values=cache) +output_text_alora = tokenizer.decode(output[0]) + +# Note: cache is now tainted with adapter values and cannot be used in base model from here on! +``` + +**Pattern 2: aLoRA generation followed by base model (or another aLoRA) with partial context overlap** Here, we prefill the shared context using the base model, and then generate. + +``` +from transformers import DynamicCache +import copy +... 
+cache = DynamicCache() +inputs_shared = tokenizer(prompt_shared, return_tensors="pt").to(device) + +# Prefill from base model and save cache +with model_alora.disable_adapter(): + with torch.no_grad(): + model_alora(**inputs_shared, past_key_values=cache) +cache_copy = copy.deepcopy(cache) + +# Generate from aLoRA using prefilled cache +prompt_alora = prompt_shared + INVOCATION_STRING +inputs_alora = tokenizer(prompt_alora, return_tensors="pt").to(device) +output = model_alora.generate(**inputs_alora, past_key_values=cache) +output_text_alora = tokenizer.decode(output[0]) + +# Generate from base model using saved cache not tainted by aLoRA KV values +prompt_base = prompt_shared +inputs_base = tokenizer(prompt_base, return_tensors="pt").to(device) +with model_alora.disable_adapter(): + output = model_alora.generate(**inputs_base, past_key_values=cache_copy) +output_text_base = tokenizer.decode(output[0]) +``` + +### Weight-Decomposed Low-Rank Adaptation (DoRA) + +This technique decomposes the updates of the weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, especially at low ranks. For more information on DoRA, see https://huggingface.co/papers/2402.09353. + +```py +from peft import LoraConfig + +config = LoraConfig(use_dora=True, ...) +``` + +If parts of the model or the DoRA adapter are offloaded to CPU you can get a significant speedup at the cost of some temporary (ephemeral) VRAM overhead by using `ephemeral_gpu_offload=True` in `config.runtime_config`. + +```py +from peft import LoraConfig, LoraRuntimeConfig + +config = LoraConfig(use_dora=True, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=True), ...) +``` + +A `PeftModel` with a DoRA adapter can also be loaded with `ephemeral_gpu_offload=True` flag using the `from_pretrained` method as well as the `load_adapter` method. 
+ +```py +from peft import PeftModel + +model = PeftModel.from_pretrained(base_model, peft_model_id, ephemeral_gpu_offload=True) +``` + +DoRA is optimized (computes faster and takes less memory) for models in the evaluation mode, or when dropout is set to 0. We reuse the +base result at those times to get the speedup. +Running [dora finetuning](https://github.com/huggingface/peft/blob/main/examples/dora_finetuning/dora_finetuning.py) +with `CUDA_VISIBLE_DEVICES=0 ZE_AFFINITY_MASK=0 time python examples/dora_finetuning/dora_finetuning.py --quantize --lora_dropout 0 --batch_size 16 --eval_step 2 --use_dora` +on a 4090 with gradient accumulation set to 2 and max step to 20 resulted with the following observations: + +| | Without Optimization | With Optimization | +| :--: | :--: | :--: | +| train_runtime | 359.7298 | **279.2676** | +| train_samples_per_second | 1.779 | **2.292** | +| train_steps_per_second | 0.056 | **0.072** | + +#### Caveats + +- DoRA only supports embedding, linear, and Conv2d layers at the moment. +- DoRA introduces a bigger overhead than pure LoRA, so it is recommended to merge weights for inference, see [`LoraModel.merge_and_unload`]. +- DoRA should work with weights quantized with bitsandbytes ("QDoRA"). However, issues have been reported when using QDoRA with DeepSpeed Zero2. + +### QLoRA-style training + +The default LoRA settings in PEFT add trainable weights to the query and value layers of each attention block. But [QLoRA](https://hf.co/papers/2305.14314), which adds trainable weights to all the linear layers of a transformer model, can provide performance equal to a fully finetuned model. To apply LoRA to all the linear layers, like in QLoRA, set `target_modules="all-linear"` (easier than specifying individual modules by name which can vary depending on the architecture). + +```py +config = LoraConfig(target_modules="all-linear", ...) 
+``` + +### Memory efficient Layer Replication with LoRA + +An approach used to improve the performance of models is to expand a model by duplicating layers in the model to build a larger model from a pretrained model of a given size. For example increasing a 7B model to a 10B model as described in the [SOLAR](https://huggingface.co/papers/2312.15166) paper. PEFT LoRA supports this kind of expansion in a memory efficient manner that supports further fine-tuning using LoRA adapters attached to the layers post replication of the layers. The replicated layers do not take additional memory as they share the underlying weights so the only additional memory required is the memory for the adapter weights. To use this feature you would create a config with the `layer_replication` argument. + +```py +config = LoraConfig(layer_replication=[[0,4], [2,5]], ...) +``` + +Assuming the original model had 5 layers `[0, 1, 2 ,3, 4]`, this would create a model with 7 layers arranged as `[0, 1, 2, 3, 2, 3, 4]`. This follows the [mergekit](https://github.com/arcee-ai/mergekit) pass through merge convention where sequences of layers specified as start inclusive and end exclusive tuples are stacked to build the final model. Each layer in the final model gets its own distinct set of LoRA adapters. + +[Fewshot-Metamath-OrcaVicuna-Mistral-10B](https://huggingface.co/abacusai/Fewshot-Metamath-OrcaVicuna-Mistral-10B) is an example of a model trained using this method on Mistral-7B expanded to 10B. The +[adapter_config.json](https://huggingface.co/abacusai/Fewshot-Metamath-OrcaVicuna-Mistral-10B/blob/main/adapter_config.json) shows a sample LoRA adapter config applying this method for fine-tuning. + +### Fine grained control over ranks and alpha (scaling) + +By default, all layers targeted with LoRA will have the same rank `r` and the same `lora_alpha` (which determines the LoRA scaling), depending on what was specified in the [`LoraConfig`]. 
In some cases, however, you may want to indicate different values for different layers. This is possible by passing the `rank_pattern` and `alpha_pattern` arguments to [`LoraConfig`]. These arguments should be dictionaries with the key being the layer name and the value being the rank/alpha value. The keys can be [regular expressions](https://docs.python.org/3/library/re.html) (regex). All LoRA layers that are not explicitly mentioned in `rank_pattern` and `alpha_pattern` will take the default `r` and `lora_alpha` values. + +To give an example, let's assume that we have a model with the following structure: + +```python +>>> print(model) +Outer( + (foo): Linear(...) + (module): Middle( + (foo): Linear(...) + (foobar): Linear(...) + (module): Inner( + (foo): Linear(...) + (barfoo): Linear(...) + ) + ) +) +``` + +- `rank_pattern={"foo": 42}` will match all 3 `foo` layers. Neither `foobar` nor `barfoo` are matched. +- `rank_pattern={"^foo": 42}` will only match the `foo` layer of the model, but neither `module.foo` nor `module.module.foo`. This is because the `^` means "start of string" when using regular expressions, and only `foo` starts with `"foo"`, the other layer names have prefixes. +- `rank_pattern={"^module.foo": 42}` matches only `module.foo`, but not `module.module.foo`, for the same reason. +- `rank_pattern={"module.foo": 42}` matches both `module.foo` and `module.module.foo`, but not `foo`. +- `rank_pattern={"^foo": 42, "^module.module.foo": 55}` matches `foo` and `module.module.foo`, respectively, but not `module.foo`. +- There is no need to indicate `$` to mark the end of the match, as this is added automatically by PEFT. + +The same logic applies to `alpha_pattern`. If you're in doubt, don't try to get fancy with regular expressions -- just pass the full name for each module with a different rank/alpha, preceded by the `^` prefix, and you should be good. 
+ +### Targeting `nn.Parameter` directly + +> [!WARNING] +> This feature is experimental and subject to change. + +Generally, you should use `target_modules` to target the module (e.g. `nn.Linear`). However, in some circumstances, this is not possible. E.g., in many mixture-of-experts (MoE) layers in HF Transformers, instead of using `nn.Linear`, an `nn.Parameter` is used. PEFT normally overwrites the `forward` method for LoRA, but for `nn.Parameter`, there is none. Therefore, to apply LoRA to that parameter, it needs to be targeted with `target_parameters`. As an example, for [Llama4](https://huggingface.co/collections/meta-llama/llama-4-67f0c30d9fe03840bc9d0164), you can pass: `target_parameters=['feed_forward.experts.gate_up_proj', 'feed_forward.experts.down_proj']`. + +#### Caveats + +- At the moment, this argument allows targeting 2-dim or 3-dim `nn.Parameter`s. It is assumed that in the case of a 3-dim parameter, the 0th dimension is the expert dimension. +- It is currently not possible to add multiple LoRA adapters (via `model.add_adapter` or `model.load_adapter`) that use `target_parameters` at the same time. + +## Optimizers + +LoRA training can optionally include special purpose optimizers. Currently PEFT supports LoRA-FA and LoRA+. + +### LoRA-FA Optimizer + +LoRA training can be more effective and efficient using LoRA-FA, as described in [LoRA-FA](https://huggingface.co/papers/2308.03303). LoRA-FA reduces activation memory consumption by fixing the matrix A and only tuning the matrix B. During training, the gradient of B is optimized to approximate the full parameter fine-tuning gradient. Moreover, the memory consumption of LoRA-FA is not sensitive to the rank (since it erases the activation of $A$), therefore it can improve performance by enlarging lora rank without increasing memory consumption. 
+ +```py +from peft import LoraConfig, get_peft_model +from peft.optimizers import create_lorafa_optimizer +from transformers import AutoModelForCausalLM, Trainer, get_cosine_schedule_with_warmup + +base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") + +config = LoraConfig(...) +model = get_peft_model(base_model, config) + +optimizer = create_lorafa_optimizer( + model=model, + r=128, + lora_alpha=32, + lr=7e-5, +) + +scheduler = get_cosine_schedule_with_warmup( + optimizer, + num_warmup_steps=100, + num_training_steps=1000, +) + +trainer = Trainer( + ..., + optimizers=(optimizer, scheduler), +) +``` + +### LoRA+ optimized LoRA + +LoRA training can be optimized using [LoRA+](https://huggingface.co/papers/2402.12354), which uses different learning rates for the adapter matrices A and B, shown to increase finetuning speed by up to 2x and performance by 1-2%. + +```py +from peft import LoraConfig, get_peft_model +from peft.optimizers import create_loraplus_optimizer +from transformers import Trainer +import bitsandbytes as bnb + +base_model = ... +config = LoraConfig(...) +model = get_peft_model(base_model, config) + +optimizer = create_loraplus_optimizer( + model=model, + optimizer_cls=bnb.optim.Adam8bit, + lr=5e-5, + loraplus_lr_ratio=16, +) +scheduler = None + +... +trainer = Trainer( + ..., + optimizers=(optimizer, scheduler), +) +``` + +## Efficiently train tokens alongside LoRA + +Sometimes it is necessary to not only change some layer's weights but to add new tokens as well. With larger models this can be a memory-costly endeavour. PEFT LoRA adapters support the `trainable_token_indices` parameter which allows tuning of other tokens alongside fine-tuning of specific layers with LoRA. This method only trains the tokens you specify and leaves all other tokens untouched. This saves memory and doesn't throw away learned context of existing token embeddings in contrast to when training the whole embedding matrix. 
Under the hood this method uses the layer of [`TrainableTokensModel`]. + +```py +# for layer 'embed_tokens' +config = LoraConfig(trainable_token_indices=[idx_1, idx_2, ...], ...) + +# specific embedding layer +config = LoraConfig(trainable_token_indices={'emb_tokens': [idx_1, idx_2, ...]}, ...) +``` + +In the snippet below we show how to add new tokens to the model and how to train it alongside the other layers in the model. + +```py +from transformers import AutoTokenizer, AutoModelForCausalLM +from peft import get_peft_model, LoraConfig + +base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + +# we define our new tokens and add them to the tokenizer as special tokens +special_tokens = ['<|start_think|>', '<|stop_think|>'] +tokenizer.add_special_tokens({'additional_special_tokens': special_tokens}) + +# make room for new tokens in the embedding matrix if it isn't big enough already +base_model.resize_token_embeddings(max(len(tokenizer), base_model.model.embed_tokens.num_embeddings)) + +# typical LoRA config with `trainable_token_indices` targeting embedding layer `embed_tokens` +# and specifically our new tokens we just added +lora_config = LoraConfig( + target_modules='all-linear', + trainable_token_indices={'embed_tokens': tokenizer.convert_tokens_to_ids(special_tokens)}, +) +peft_model = get_peft_model(base_model, lora_config) + +# proceed to train the model like normal +[...] +``` + +The token weights are part of your adapter state dict and saved alongside the LoRA weights. +If we would have used full fine-tuning with `modules_to_save=['embed_tokens']` we would have stored the full embedding matrix in the checkpoint, leading to a much bigger file. 
+ +To give a bit of an indication how much VRAM can be saved, a rudimentary comparison of the above example was made between training the embedding matrix fully (`modules_to_save=["embed_tokens"]`), using a LoRA for the embedding matrix (`target_modules=[..., "embed_tokens"]`, rank 32) and trainable tokens (`trainable_token_indices=[...]`, 6 tokens). Trainable tokens used about as much VRAM (15,562MB vs. 15,581MB) as LoRA while being specific to the tokens and saved ~1GB of VRAM over fully training the embedding matrix. + + +## Merge LoRA weights into the base model + +While LoRA is significantly smaller and faster to train, you may encounter latency issues during inference due to separately loading the base model and the LoRA adapter. To eliminate latency, use the [`~LoraModel.merge_and_unload`] function to merge the adapter weights with the base model. This allows you to use the newly merged model as a standalone model. The [`~LoraModel.merge_and_unload`] function doesn't keep the adapter weights in memory. + +Below is a diagram that explains the intuition of LoRA adapter merging: + +
+ +
+ +We show in the snippets below how to run that using PEFT. + +```py +from transformers import AutoModelForCausalLM +from peft import PeftModel + +base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +peft_model_id = "alignment-handbook/zephyr-7b-sft-lora" +model = PeftModel.from_pretrained(base_model, peft_model_id) +model.merge_and_unload() +``` + +If you need to keep a copy of the weights so you can unmerge the adapter later or delete and load different ones, you should use the [`~LoraModel.merge_adapter`] function instead. Now you have the option to use [`~LoraModel.unmerge_adapter`] to return the base model. + +```py +from transformers import AutoModelForCausalLM +from peft import PeftModel + +base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +peft_model_id = "alignment-handbook/zephyr-7b-sft-lora" +model = PeftModel.from_pretrained(base_model, peft_model_id) +model.merge_adapter() + +# unmerge the LoRA layers from the base model +model.unmerge_adapter() +``` + +The [`~LoraModel.add_weighted_adapter`] function is useful for merging multiple LoRAs into a new adapter based on a user provided weighting scheme in the `weights` parameter. Below is an end-to-end example. 
+ +First load the base model: + +```python +from transformers import AutoModelForCausalLM +from peft import PeftModel +import torch + +base_model = AutoModelForCausalLM.from_pretrained( + "mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16, device_map="auto" +) +``` + +Then we load the first adapter: + +```python +peft_model_id = "alignment-handbook/zephyr-7b-sft-lora" +model = PeftModel.from_pretrained(base_model, peft_model_id, adapter_name="sft") +``` + +Then load a different adapter and merge it with the first one: + +```python +weighted_adapter_name = "sft-dpo" +model.load_adapter("alignment-handbook/zephyr-7b-dpo-lora", adapter_name="dpo") +model.add_weighted_adapter( + adapters=["sft", "dpo"], + weights=[0.7, 0.3], + adapter_name=weighted_adapter_name, + combination_type="linear" +) +model.set_adapter(weighted_adapter_name) +``` + +> [!TIP] +> There are several supported methods for `combination_type`. Refer to the [documentation](../package_reference/lora#peft.LoraModel.add_weighted_adapter) for more details. Note that "svd" as the `combination_type` is not supported when using `torch.float16` or `torch.bfloat16` as the datatype. + +Now, perform inference: + +```python +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") + +prompt = "Hey, are you conscious? Can you talk to me?" +inputs = tokenizer(prompt, return_tensors="pt") +inputs = {k: v.to(device) for k, v in inputs.items()} + +with torch.no_grad(): + generate_ids = model.generate(**inputs, max_length=30) +outputs = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] +print(outputs) +``` + +## Load adapters + +Adapters can be loaded onto a pretrained model with [`~PeftModel.load_adapter`], which is useful for trying out different adapters whose weights aren't merged. 
Set the active adapter weights with the [`~LoraModel.set_adapter`] function. + +```py +from transformers import AutoModelForCausalLM +from peft import PeftModel + +base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") +peft_model_id = "alignment-handbook/zephyr-7b-sft-lora" +model = PeftModel.from_pretrained(base_model, peft_model_id) + +# load different adapter +model.load_adapter("alignment-handbook/zephyr-7b-dpo-lora", adapter_name="dpo") + +# set adapter as active +model.set_adapter("dpo") +``` + +To return the base model, you could use [`~LoraModel.unload`] to unload all of the LoRA modules or [`~LoraModel.delete_adapter`] to delete the adapter entirely. + +```py +# unload adapter +model.unload() + +# delete adapter +model.delete_adapter("dpo") +``` + +## Inference with different LoRA adapters in the same batch + +Normally, each inference batch has to use the same adapter(s) in PEFT. This can sometimes be annoying, because we may have batches that contain samples intended to be used with different LoRA adapters. For example, we could have a base model that works well in English and two more LoRA adapters, one for French and one for German. Usually, we would have to split our batches such that each batch only contains samples of one of the languages, we cannot combine different languages in the same batch. + +Thankfully, it is possible to mix different LoRA adapters in the same batch using the `adapter_names` argument. Below, we show an example of how this works in practice. First, let's load the base model, English, and the two adapters, French and German, like this: + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM +from peft import PeftModel + +model_id = ... 
+tokenizer = AutoTokenizer.from_pretrained(model_id) + +model = AutoModelForCausalLM.from_pretrained(model_id) +# load the LoRA adapter for French +peft_model = PeftModel.from_pretrained(model, <path>, adapter_name="adapter_fr") +# next, load the LoRA adapter for German +peft_model.load_adapter(<path>, adapter_name="adapter_de") +``` + +Now, we want to generate text on a batch that contains all three languages: The first three samples are in English, the next three are in French, and the last three are in German. We can use the `adapter_names` argument to specify which adapter to use for each sample. Since our base model is used for English, we use the special string `"__base__"` for these samples. For the next three samples, we indicate the adapter name of the French LoRA fine-tune, in this case `"adapter_fr"`. For the last three samples, we indicate the adapter name of the German LoRA fine-tune, in this case `"adapter_de"`. This way, we can use the base model and the two adapters in a single batch. + +```python +inputs = tokenizer( + [ + "Hello, my dog is cute", + "Hello, my cat is awesome", + "Hello, my fish is great", + "Salut, mon chien est mignon", + "Salut, mon chat est génial", + "Salut, mon poisson est super", + "Hallo, mein Hund ist süß", + "Hallo, meine Katze ist toll", + "Hallo, mein Fisch ist großartig", + ], + return_tensors="pt", + padding=True, +) + +adapter_names = [ + "__base__", "__base__", "__base__", + "adapter_fr", "adapter_fr", "adapter_fr", + "adapter_de", "adapter_de", "adapter_de", +] +output = peft_model.generate(**inputs, adapter_names=adapter_names, max_new_tokens=20) +``` + +Note that the order does not matter here, i.e. the samples in the batch don't need to be grouped by adapter as in the example above. We just need to ensure that the `adapter_names` argument is aligned correctly with the samples. 
+ +Additionally, the same approach also works with the `modules_to_save` feature, which allows for saving and reusing specific neural network layers, such as custom heads for classification tasks, across different LoRA adapters. + +### Caveats + +Using this feature has some drawbacks, namely: + +- It only works for inference, not for training. +- Disabling adapters using the `with model.disable_adapter()` context takes precedence over `adapter_names`. +- You cannot pass `adapter_names` when some adapter weights were merged with base weight using the `merge_adapter` method. Please unmerge all adapters first by calling `model.unmerge_adapter()`. +- For obvious reasons, this cannot be used after calling `merge_and_unload()`, since all the LoRA adapters will be merged into the base weights in this case. +- This feature does not currently work with DoRA, so set `use_dora=False` in your `LoraConfig` if you want to use it. +- The `modules_to_save` feature is currently only supported for the layers of types `Linear`, `Embedding`, `Conv2d` and `Conv1d`. +- There is an expected overhead for inference with `adapter_names`, especially if the amount of different adapters in the batch is high. This is because the batch size is effectively reduced to the number of samples per adapter. If runtime performance is your top priority, try the following: + - Increase the batch size. + - Try to avoid having a large number of different adapters in the same batch, prefer homogeneous batches. This can be achieved by buffering samples with the same adapter and only perform inference with a small handful of different adapters. + - Take a look at alternative implementations such as [LoRAX](https://github.com/predibase/lorax), [punica](https://github.com/punica-ai/punica), or [S-LoRA](https://github.com/S-LoRA/S-LoRA), which are specialized to work with a large number of different adapters. 
+ +## Composing and Reusing LoRA Adapters +### Arrow +[Arrow](https://huggingface.co/papers/2405.11157) is a modular routing algorithm designed to combine multiple pre-trained task-specific LoRA adapters to solve a given task. Rather than merging all adapters naively, Arrow introduces a **gradient-free, token-wise mixture-of-experts (MoE) routing mechanism**. At inference time, it first computes a _prototype_ for each LoRA by extracting the top right singular vector from its SVD decomposition. Each token representation is then compared to these prototypes via cosine similarity to obtain routing coefficients. Tokens are assigned to the top-k most relevant LoRA adapters, with the coefficients normalized through softmax, and their outputs linearly combined. This allows effective reuse of existing LoRA modules for new tasks and leads to stronger zero-shot generalization. + +In PEFT, Arrow is enabled through ```ArrowConfig``` and ```create_arrow_model```. You can also configure parameters such as ```top_k``` (the number of LoRA adapters combined per token), ```router_temperature``` (the softmax temperature applied to the routing coefficients), and ```rng_seed``` (for reproducibility). + +```py +from peft import create_arrow_model, ArrowConfig +from transformers import AutoModelForCausalLM + +# Loading the model +base_model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct") + +# Creating the Arrow config +arrow_config = ArrowConfig( + top_k=3, + router_temperature=1.0, + rng_seed=42, +) + +# The LoRA adapters below were trained on a clustered FLAN dataset. +# Task clustering was performed using the Model-Based Clustering (MBC) method, +# as described in the Arrow paper. +# While one could train a separate LoRA for each task and let Arrow route tokens among them, +# training LoRAs on clusters of tasks instead provides an indirect optimization for +# transfer across the multi-task dataset. 
+task_specific_adapter_paths = [ + f"TahaBa/phi3-mini-clustered-flan/ts_expert_{i}" for i in range(10) + ] + +# Creating the Arrow model +model = create_arrow_model( + base_model=base_model, + task_specific_adapter_paths=task_specific_adapter_paths, + arrow_config=arrow_config, + ) + +# Now the forward path could be called on this model, like a normal PeftModel. +``` + +Furthermore, you can add or remove adapters after calling ```create_arrow_model```—for example, to fine-tune a new adapter or discard an unnecessary one. Once the adapters are in place, you can activate the ```"arrow_router"``` for inference to use Arrow. Note that if you add a new LoRA adapter after ```create_arrow_model``` and want to fine-tune it, you must explicitly set the new adapter as active, since ```"arrow_router"``` is activated by default in ```create_arrow_model```. + +```py +from trl import SFTTrainer, SFTConfig + +# Adding a new adapter and activating it +model.add_adapter(adapter_name='new_adapter') +model.set_adapter('new_adapter') + +# Now the model could be trained along the `new_adapter`. +trainer = SFTTrainer( + model=model, + args=SFTConfig(...), + ... + ) + +# Once the training is done, you can activate `arrow_router` and use it in inference +model.set_adapter('arrow_router') # Model is ready to be used at inference time now +``` + +### GenKnowSub +[GenKnowSub](https://aclanthology.org/2025.acl-short.54/) augments Arrow by purifying task-specific LoRA adapters before routing. The key idea is to subtract general knowledge encoded in LoRA space—based on the [forgetting-via-negation principle](https://huggingface.co/papers/2212.04089)—so that task adapters become more isolated and focused on task-relevant signals. Concretely, GenKnowSub estimates a low-dimensional “general” subspace from a set of general (non task-specific) LoRA adapters and removes this component from each task adapter’s LoRA update prior to Arrow’s token-wise routing. 
This typically improves compositionality and reduces interference when combining many task adapters. + +In PEFT, enable GenKnowSub by setting ```use_gks=True``` in ArrowConfig, and providing ```general_adapter_paths``` in ```create_arrow_model```: + +```py +from peft import create_arrow_model, ArrowConfig +from transformers import AutoModelForCausalLM + +# Loading the model +base_model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct") + +# Creating the Arrow config +arrow_config = ArrowConfig( + top_k=3, + router_temperature=1.0, + use_gks=True, + rng_seed=42, +) + +# Paths to the task-specific adapters, trained on the clustered FLAN dataset (as explained before). +task_specific_adapter_paths = [ + f"TahaBa/phi3-mini-clustered-flan/ts_expert_{i}" for i in range(10) + ] +# These general adapters are trained on English, German, and French Wikipedia dataset, +# with causal language modelling objective, each pair like: (507 token sentence, 5 token completion), and the loss computed on the completion +general_adapter_paths = [ + "TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langen/checkpoint-17", + "TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langfr/checkpoint-35", + "TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langger/checkpoint-17" + ] + +# Creating the Arrow model +model = create_arrow_model( + base_model=base_model, + task_specific_adapter_paths=task_specific_adapter_paths, + general_adapter_paths=general_adapter_paths, + arrow_config=arrow_config, + ) + +# Now the forward path could be called on this model, like a normal PeftModel. +``` +To encode general knowledge, GenKnowSub subtracts the average of the provided general adapters from each task-specific adapter once, before routing begins. Furthermore, the ability to add or remove adapters after calling ```create_arrow_model``` (as described in the Arrow section) is still supported in this case. 
+ +> [!TIP] +> **Things to keep in mind when using Arrow + GenKnowSub:** +> +> - All LoRA adapters (task-specific and general) must share the same ```rank``` and ```target_modules```. +> +> - Any inconsistency in these settings will raise an error in ```create_arrow_model```. +> +> - Having different scaling factors (```lora_alpha```) across task adapters is supported — Arrow handles them automatically. +> +> - Merging the ```"arrow_router"``` is not supported, due to its dynamic routing behavior. +> +> - In create_arrow_model, task adapters are loaded as ```task_i``` and general adapters as ```gks_j``` (where ```i``` and ```j``` are indices). The function ensures consistency of ```target_modules```, ```rank```, and whether adapters are applied to ```Linear``` or ```Linear4bit``` layers. It then adds the ```"arrow_router"``` module and activates it. Any customization of this process requires overriding ```create_arrow_model```. +> +> - This implementation is compatible with 4-bit quantization (via bitsandbytes): +> +> ```py +> from transformers import AutoModelForCausalLM, BitsAndBytesConfig +> import torch +> +> # Quantisation config +> bnb_config = BitsAndBytesConfig( +> load_in_4bit=True, +> bnb_4bit_quant_type="nf4", +> bnb_4bit_compute_dtype=torch.bfloat16, +> bnb_4bit_use_double_quant=False, +> ) +> +> # Loading the model +> base_model = AutoModelForCausalLM.from_pretrained( +> "microsoft/Phi-3-mini-4k-instruct", +> torch_dtype=torch.bfloat16, +> device_map="auto", +> quantization_config=bnb_config, +> ) +> +> # Now call create_arrow_model() as we explained before. 
+> ``` \ No newline at end of file diff --git a/peft/docs/source/developer_guides/low_level_api.md b/peft/docs/source/developer_guides/low_level_api.md new file mode 100644 index 0000000000000000000000000000000000000000..2e3236cd18207e3f5ba60d291deacf38458a8251 --- /dev/null +++ b/peft/docs/source/developer_guides/low_level_api.md @@ -0,0 +1,148 @@ + + +# Adapter injection + +With PEFT, you can inject trainable adapters into any `torch` module which allows you to use adapter methods without relying on the modeling classes in PEFT. This works for all adapters except for those based on prompt learning (e.g. prefix tuning or p-tuning). + +Check the table below to see when you should inject adapters. + +| Pros | Cons | +|---|---| +| the model is modified inplace, keeping all the original attributes and methods | manually write the `from_pretrained` and `save_pretrained` utility functions from Hugging Face to save and load adapters | +| works for any `torch` module and modality | doesn't work with any of the utility methods provided by `PeftModel` such as disabling and merging adapters | + +## Creating a new PEFT model + +To perform the adapter injection, use the [`inject_adapter_in_model`] method. This method takes 3 arguments, the PEFT config, the model, and an optional adapter name. You can also attach multiple adapters to the model if you call [`inject_adapter_in_model`] multiple times with different adapter names. 
+ +For example, to inject LoRA adapters into the `linear` submodule of the `DummyModel` module: + +```python +import torch +from peft import inject_adapter_in_model, LoraConfig + +class DummyModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.embedding = torch.nn.Embedding(10, 10) + self.linear = torch.nn.Linear(10, 10) + self.lm_head = torch.nn.Linear(10, 10) + + def forward(self, input_ids): + x = self.embedding(input_ids) + x = self.linear(x) + x = self.lm_head(x) + return x + + +lora_config = LoraConfig( + lora_alpha=16, + lora_dropout=0.1, + r=64, + bias="none", + target_modules=["linear"], +) + +model = DummyModel() +model = inject_adapter_in_model(lora_config, model) + +dummy_inputs = torch.LongTensor([[0, 1, 2, 3, 4, 5, 6, 7]]) +dummy_outputs = model(dummy_inputs) +``` + +Print the model to see that the adapters have been correctly injected. + +```bash +DummyModel( + (embedding): Embedding(10, 10) + (linear): Linear( + in_features=10, out_features=10, bias=True + (lora_dropout): ModuleDict( + (default): Dropout(p=0.1, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=10, out_features=64, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=64, out_features=10, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + ) + (lm_head): Linear(in_features=10, out_features=10, bias=True) +) +``` + +### Injection based on a `state_dict` + +Sometimes, it is possible that there is a PEFT adapter checkpoint but the corresponding PEFT config is not known for whatever reason. To inject the PEFT layers for this checkpoint, you would usually have to reverse-engineer the corresponding PEFT config, most notably the `target_modules` argument, based on the `state_dict` from the checkpoint. This can be cumbersome and error prone. 
To avoid this, it is also possible to call [`inject_adapter_in_model`] and pass the loaded `state_dict` as an argument: + +```python +from safetensors.torch import load_file + +model = ... +state_dict = load_file(<path-to-safetensors-file>) +lora_config = LoraConfig(...) +model = inject_adapter_in_model(lora_config, model, state_dict=state_dict) +``` + +In this case, PEFT will use the `state_dict` as reference for which layers to target instead of using the PEFT config. As a user, you don't have to set the exact `target_modules` of the PEFT config for this to work. However, you should still pass a PEFT config of the right type, in this example `LoraConfig`, you can leave the `target_modules` as `None`. + +Be aware that this still only creates the uninitialized PEFT layers, the values from the `state_dict` are not used to populate the model weights. To populate the weights, proceed with calling [`set_peft_model_state_dict`] as described below. + +⚠️ Note that if there is a mismatch between what is configured in the PEFT config and what is found in the `state_dict`, PEFT will warn you about this. You can ignore the warning if you know that the PEFT config is not correctly specified. + +> [!WARNING] +> If the original PEFT adapter was using `target_parameters` instead of `target_modules`, injecting from a `state_dict` will not work correctly. In this case, it is mandatory to use the correct PEFT config for injection. + +## Saving the model + +To only save the adapter, use the [`get_peft_model_state_dict`] function: + +```python +from peft import get_peft_model_state_dict + +peft_state_dict = get_peft_model_state_dict(model) +print(peft_state_dict) +``` + +Otherwise, `model.state_dict()` returns the full state dict of the model. 
+ +## Loading the model + +After loading the saved `state_dict`, it can be applied using the [`set_peft_model_state_dict`] function: + +```python +from peft import set_peft_model_state_dict + +model = DummyModel() +model = inject_adapter_in_model(lora_config, model) +outcome = set_peft_model_state_dict(model, peft_state_dict) +# check that there were no wrong keys +print(outcome.unexpected_keys) +``` + +If injecting the adapter is slow or you need to load a large number of adapters, you may use an optimization that allows to create an "empty" adapter on meta device and only fills the weights with real weights when the [`set_peft_model_state_dict`] is called. To do this, pass `low_cpu_mem_usage=True` to both [`inject_adapter_in_model`] and [`set_peft_model_state_dict`]. + +```python +model = DummyModel() +model = inject_adapter_in_model(lora_config, model, low_cpu_mem_usage=True) + +print(model.linear.lora_A["default"].weight.device.type == "meta") # should be True +set_peft_model_state_dict(model, peft_state_dict, low_cpu_mem_usage=True) +print(model.linear.lora_A["default"].weight.device.type == "cpu") # should be True +``` diff --git a/peft/docs/source/developer_guides/mixed_models.md b/peft/docs/source/developer_guides/mixed_models.md new file mode 100644 index 0000000000000000000000000000000000000000..c42cf1416cb389867da3d04a90a8b00c8c8bfcb5 --- /dev/null +++ b/peft/docs/source/developer_guides/mixed_models.md @@ -0,0 +1,37 @@ + + +# Mixed adapter types + +Normally, it isn't possible to mix different adapter types in 🤗 PEFT. You can create a PEFT model with two different LoRA adapters (which can have different config options), but it is not possible to combine a LoRA and LoHa adapter. With [`PeftMixedModel`] however, this works as long as the adapter types are compatible. The main purpose of allowing mixed adapter types is to combine trained adapters for inference. 
While it is possible to train a mixed adapter model, this has not been tested and is not recommended. + +To load different adapter types into a PEFT model, use [`PeftMixedModel`] instead of [`PeftModel`]: + +```py +from peft import PeftMixedModel + +base_model = ... # load the base model, e.g. from transformers +# load first adapter, which will be called "default" +peft_model = PeftMixedModel.from_pretrained(base_model, <path_to_adapter1>) +peft_model.load_adapter(<path_to_adapter2>, adapter_name="other") +peft_model.set_adapter(["default", "other"]) +``` + +The [`~PeftMixedModel.set_adapter`] method is necessary to activate both adapters, otherwise only the first adapter would be active. You can keep adding more adapters by calling [`~PeftModel.add_adapter`] repeatedly. + +[`PeftMixedModel`] does not support saving and loading mixed adapters. The adapters should already be trained, and loading the model requires a script to be run each time. + +## Tips + +- Not all adapter types can be combined. See [`peft.tuners.mixed.COMPATIBLE_TUNER_TYPES`](https://github.com/huggingface/peft/blob/1c1c7fdaa6e6abaa53939b865dee1eded82ad032/src/peft/tuners/mixed/model.py#L35) for a list of compatible types. An error will be raised if you try to combine incompatible adapter types. +- It is possible to mix multiple adapters of the same type which can be useful for combining adapters with very different configs. +- If you want to combine a lot of different adapters, the most performant way to do it is to consecutively add the same adapter types. For example, add LoRA1, LoRA2, LoHa1, LoHa2 in this order, instead of LoRA1, LoHa1, LoRA2, and LoHa2. While the order can affect the output, there is no inherently *best* order, so it is best to choose the fastest one. 
diff --git a/peft/docs/source/developer_guides/model_merging.md b/peft/docs/source/developer_guides/model_merging.md new file mode 100644 index 0000000000000000000000000000000000000000..31cda64a09a597259544317da41c0837441ab144 --- /dev/null +++ b/peft/docs/source/developer_guides/model_merging.md @@ -0,0 +1,164 @@ + + +# Model merging + +Training a model for each task can be costly, take up storage space, and the models aren't able to learn new information to improve their performance. Multitask learning can overcome some of these limitations by training a model to learn several tasks, but it is expensive to train and designing a dataset for it is challenging. *Model merging* offers a solution to these challenges by combining multiple pretrained models into one model, giving it the combined abilities of each individual model without any additional training. + +PEFT provides several methods for merging models like a linear or SVD combination. This guide focuses on two methods that are more efficient for merging LoRA adapters by eliminating redundant parameters: + +* [TIES](https://hf.co/papers/2306.01708) - TrIm, Elect, and Merge (TIES) is a three-step method for merging models. First, redundant parameters are trimmed, then conflicting signs are resolved into an aggregated vector, and finally the parameters whose signs are the same as the aggregate sign are averaged. This method takes into account that some values (redundant and sign disagreement) can degrade performance in the merged model. +* [DARE](https://hf.co/papers/2311.03099) - Drop And REscale is a method that can be used to prepare for other model merging methods like TIES. It works by randomly dropping parameters according to a drop rate and rescaling the remaining parameters. This helps to reduce the number of redundant and potentially interfering parameters among multiple models. 
+ +Models are merged with the [`~LoraModel.add_weighted_adapter`] method, and the specific model merging method is specified in the `combination_type` parameter. + +## Merge method + +With TIES and DARE, merging is enabled by setting `combination_type` to the merge method and `density` to the fraction of the weights to keep from the individual models. For example, let's merge three finetuned [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T) models: [tinyllama_lora_norobots](https://huggingface.co/smangrul/tinyllama_lora_norobots), [tinyllama_lora_sql](https://huggingface.co/smangrul/tinyllama_lora_sql), and [tinyllama_lora_adcopy](https://huggingface.co/smangrul/tinyllama_lora_adcopy). + + + +When you're attempting to merge fully trained models with TIES, you should be aware of any special tokens each model may have added to the embedding layer which are not a part of the original checkpoint's vocabulary. This may cause an issue because each model may have added a special token to the same embedding position. If this is the case, you should use the [`~transformers.PreTrainedModel.resize_token_embeddings`] method to avoid merging the special tokens at the same embedding index. + +
+ +This shouldn't be an issue if you're only merging LoRA adapters trained from the same base model. + +
+ +Load a base model and use the [`~PeftModel.load_adapter`] method to load each adapter and assign it a name: + +```py +from peft import PeftConfig, PeftModel +from transformers import AutoModelForCausalLM, AutoTokenizer +import torch + +config = PeftConfig.from_pretrained("smangrul/tinyllama_lora_norobots") +model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, load_in_4bit=True, device_map="auto").eval() +tokenizer = AutoTokenizer.from_pretrained("smangrul/tinyllama_lora_norobots") + +model.config.vocab_size = 32005 +model.resize_token_embeddings(32005) + +model = PeftModel.from_pretrained(model, "smangrul/tinyllama_lora_norobots", adapter_name="norobots") +_ = model.load_adapter("smangrul/tinyllama_lora_sql", adapter_name="sql") +_ = model.load_adapter("smangrul/tinyllama_lora_adcopy", adapter_name="adcopy") +``` + +Set the adapters, weights, `adapter_name`, `combination_type`, and `density` with the [`~LoraModel.add_weighted_adapter`] method. + + + + +Weight values greater than `1.0` typically produce better results because they preserve the correct scale. A good default starting value for the weights is to set all values to `1.0`. + +```py +adapters = ["norobots", "adcopy", "sql"] +weights = [2.0, 1.0, 1.0] +adapter_name = "merge" +density = 0.2 +model.add_weighted_adapter(adapters, weights, adapter_name, combination_type="ties", density=density) +``` + + + + +```py +adapters = ["norobots", "adcopy", "sql"] +weights = [2.0, 0.3, 0.7] +adapter_name = "merge" +density = 0.2 +model.add_weighted_adapter(adapters, weights, adapter_name, combination_type="dare_ties", density=density) +``` + + + + +Set the newly merged model as the active model with the [`~LoraModel.set_adapter`] method. + +```py +model.set_adapter("merge") +``` + +Now you can use the merged model as an instruction-tuned model to write ad copy or SQL queries!
+ + + + +```py +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" +messages = [ + {"role": "user", "content": "Write an essay about Generative AI."}, +] +text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) +inputs = tokenizer(text, return_tensors="pt") +inputs = {k: v.to(device) for k, v in inputs.items()} +outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, top_p=0.95, temperature=0.2, repetition_penalty=1.2, eos_token_id=tokenizer.eos_token_id) +print(tokenizer.decode(outputs[0])) +``` + + + + +```py +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" +messages = [ + {"role": "system", "content": "Create a text ad given the following product and description."}, + {"role": "user", "content": "Product: Sony PS5 PlayStation Console\nDescription: The PS5 console unleashes new gaming possibilities that you never anticipated."}, +] +text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) +inputs = tokenizer(text, return_tensors="pt") +inputs = {k: v.to(device) for k, v in inputs.items()} +outputs = model.generate(**inputs, max_new_tokens=128, do_sample=True, top_p=0.95, temperature=0.2, repetition_penalty=1.2, eos_token_id=tokenizer.eos_token_id) +print(tokenizer.decode(outputs[0])) +``` + + + + +```py +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + +text = """Table: 2-11365528-2 +Columns: ['Team', 'Head Coach', 'President', 'Home Ground', 'Location'] +Natural Query: Who is the Head Coach of the team whose President is Mario Volarevic? 
+SQL Query:""" + +inputs = tokenizer(text, return_tensors="pt") +inputs = {k: v.to(device) for k, v in inputs.items()} +outputs = model.generate(**inputs, max_new_tokens=64, repetition_penalty=1.1, eos_token_id=tokenizer("").input_ids[-1]) +print(tokenizer.decode(outputs[0])) +``` + + + + + +## Merging (IA)³ Models +The (IA)³ models facilitate linear merging of adapters. To merge adapters in an (IA)³ model, utilize the `add_weighted_adapter` method from the `IA3Model` class. This method is analogous to the `add_weighted_adapter` method used in `LoraModel`, with the key difference being the absence of the `combination_type` parameter. For example, to merge three (IA)³ adapters into a PEFT model, you would proceed as follows: + +```py +adapters = ["adapter1", "adapter2", "adapter3"] +weights = [0.4, 0.3, 0.3] +adapter_name = "merge" +model.add_weighted_adapter(adapters, weights, adapter_name) +``` + +It is recommended that the weights sum to 1.0 to preserve the scale of the model. The merged model can then be set as the active model using the `set_adapter` method: + +```py +model.set_adapter("merge") +``` diff --git a/peft/docs/source/developer_guides/quantization.md b/peft/docs/source/developer_guides/quantization.md new file mode 100644 index 0000000000000000000000000000000000000000..b14abecc3a6fc222310a8a07331f72242e72bf6b --- /dev/null +++ b/peft/docs/source/developer_guides/quantization.md @@ -0,0 +1,294 @@ + + +# Quantization + +Quantization represents data with fewer bits, making it a useful technique for reducing memory-usage and accelerating inference especially when it comes to large language models (LLMs). 
There are several ways to quantize a model including: + +* optimizing which model weights are quantized with the [AWQ](https://hf.co/papers/2306.00978) algorithm +* independently quantizing each row of a weight matrix with the [GPTQ](https://hf.co/papers/2210.17323) algorithm +* quantizing to 8-bit and 4-bit precision with the [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) library +* quantizing to as low as 2-bit precision with the [AQLM](https://huggingface.co/papers/2401.06118) algorithm + +However, after a model is quantized it isn't typically further trained for downstream tasks because training can be unstable due to the lower precision of the weights and activations. But since PEFT methods only add *extra* trainable parameters, this allows you to train a quantized model with a PEFT adapter on top! Combining quantization with PEFT can be a good strategy for training even the largest models on a single GPU. For example, [QLoRA](https://hf.co/papers/2305.14314) is a method that quantizes a model to 4-bits and then trains it with LoRA. This method allows you to finetune a 65B parameter model on a single 48GB GPU! + +In this guide, you'll see how to quantize a model to 4-bits and train it with LoRA. + +## Quantize a model + +[bitsandbytes](https://github.com/TimDettmers/bitsandbytes) is a quantization library with a Transformers integration. With this integration, you can quantize a model to 8 or 4-bits and enable many other options by configuring the [`~transformers.BitsAndBytesConfig`] class. 
For example, you can: + +* set `load_in_4bit=True` to quantize the model to 4-bits when you load it +* set `bnb_4bit_quant_type="nf4"` to use a special 4-bit data type for weights initialized from a normal distribution +* set `bnb_4bit_use_double_quant=True` to use a nested quantization scheme to quantize the already quantized weights +* set `bnb_4bit_compute_dtype=torch.bfloat16` to use bfloat16 for faster computation + +```py +import torch +from transformers import BitsAndBytesConfig + +config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=torch.bfloat16, +) +``` + +Pass the `config` to the [`~transformers.AutoModelForCausalLM.from_pretrained`] method. + +```py +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", quantization_config=config) +``` + +Next, you should call the [`~peft.utils.prepare_model_for_kbit_training`] function to preprocess the quantized model for training. + +```py +from peft import prepare_model_for_kbit_training + +model = prepare_model_for_kbit_training(model) +``` + +Now that the quantized model is ready, let's set up a configuration. + +## LoraConfig + +Create a [`LoraConfig`] with the following parameters (or choose your own): + +```py +from peft import LoraConfig + +config = LoraConfig( + r=16, + lora_alpha=8, + target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM" +) +``` + +Then use the [`get_peft_model`] function to create a [`PeftModel`] from the quantized model and configuration. + +```py +from peft import get_peft_model + +model = get_peft_model(model, config) +``` + +You're all set for training with whichever training method you prefer! 
+ +### LoftQ initialization + +[LoftQ](https://hf.co/papers/2310.08659) initializes LoRA weights such that the quantization error is minimized, and it can improve performance when training quantized models. To get started, follow [these instructions](https://github.com/huggingface/peft/tree/main/examples/loftq_finetuning). + +In general, for LoftQ to work best, it is recommended to target as many layers with LoRA as possible, since those not targeted cannot have LoftQ applied. This means that passing `LoraConfig(..., target_modules="all-linear")` will most likely give the best results. Also, you should use `nf4` as the quant type in your quantization config when using 4-bit quantization, i.e. `BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")`. + +### QLoRA-style training + +QLoRA adds trainable weights to all the linear layers in the transformer architecture. Since the attribute names for these linear layers can vary across architectures, set `target_modules` to `"all-linear"` to add LoRA to all the linear layers: + +```py +config = LoraConfig(..., target_modules="all-linear") +``` + +## GPTQ quantization + +You can learn more about GPTQ-based `[2, 3, 4, 8]`-bit quantization at [GPTQModel](https://github.com/ModelCloud/GPTQModel) and the Transformers [GPTQ](https://huggingface.co/docs/transformers/quantization/gptq) doc. For post-quantization training, PEFT can use either the [GPTQModel](https://github.com/ModelCloud/GPTQModel) or the [AutoGPTQ](https://github.com/autogptq/autogptq) library, but we recommend GPTQModel because AutoGPTQ will be deprecated in a future release.
+ +```bash +# gptqmodel install +pip install gptqmodel --no-build-isolation +``` + +```py +from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig + +model_id = "facebook/opt-125m" +tokenizer = AutoTokenizer.from_pretrained(model_id) + +gptq_config = GPTQConfig(bits=4, group_size=128, dataset="wikitext2", tokenizer=tokenizer) + +quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=gptq_config) + +# save quantized model +quantized_model.save_pretrained("./opt-125m-gptq") +tokenizer.save_pretrained("./opt-125m-gptq") +``` + +Once quantized, you can post-train GPTQ models with PEFT APIs. + +## AQLM quantization + +Additive Quantization of Language Models ([AQLM](https://huggingface.co/papers/2401.06118)) is a compression method for large language models. It quantizes multiple weights together and takes advantage of interdependencies between them. AQLM represents groups of 8-16 weights as a sum of multiple vector codes. This allows it to compress models down to as low as 2-bit with relatively low accuracy loss. + +Since the AQLM quantization process is computationally expensive, the use of prequantized models is recommended. A partial list of available models can be found in the official aqlm [repository](https://github.com/Vahe1994/AQLM). + +The models support LoRA adapter tuning. To tune the quantized model you'll need to install the `aqlm` inference library: `pip install "aqlm>=1.0.2"`. Finetuned LoRA adapters must be saved separately, as merging them with AQLM quantized weights is not possible. + +```py +quantized_model = AutoModelForCausalLM.from_pretrained( + "BlackSamorez/Mixtral-8x7b-AQLM-2Bit-1x16-hf-test-dispatch", + torch_dtype="auto", device_map="auto", low_cpu_mem_usage=True, +) + +peft_config = LoraConfig(...)
+ +quantized_model = get_peft_model(quantized_model, peft_config) +``` + +You can refer to the [Google Colab](https://colab.research.google.com/drive/12GTp1FCj5_0SnnNQH18h_2XFh9vS_guX?usp=sharing) example for an overview of AQLM+LoRA finetuning. + +## EETQ quantization + +You can also perform LoRA fine-tuning on EETQ quantized models. The [EETQ](https://github.com/NetEase-FuXi/EETQ) package offers a simple and efficient way to perform 8-bit quantization, which is claimed to be faster than the `LLM.int8()` algorithm. First, make sure that you have a Transformers version that is compatible with EETQ (e.g. by installing the latest release from PyPI or from source). + +```py +import torch +from transformers import EetqConfig + +config = EetqConfig("int8") +``` + +Pass the `config` to the [`~transformers.AutoModelForCausalLM.from_pretrained`] method. + +```py +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", quantization_config=config) +``` + +Then create a `LoraConfig` and pass it to `get_peft_model`: + +```py +from peft import LoraConfig, get_peft_model + +config = LoraConfig( + r=16, + lora_alpha=8, + target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM" +) + +model = get_peft_model(model, config) +``` + +## HQQ quantization + +The models that are quantized using Half-Quadratic Quantization of Large Machine Learning Models ([HQQ](https://mobiusml.github.io/hqq_blog/)) support LoRA adapter tuning. To tune the quantized model, you'll need to install the `hqq` library with: `pip install hqq`. + +```python +from hqq.engine.hf import HQQModelForCausalLM + +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + +quantized_model = HQQModelForCausalLM.from_quantized(save_dir_or_hfhub, device=device) +peft_config = LoraConfig(...)
+quantized_model = get_peft_model(quantized_model, peft_config) +``` + +Alternatively, use a Transformers version that is compatible with HQQ (e.g. by installing the latest release from PyPI or from source): + +```python +from transformers import HqqConfig, AutoModelForCausalLM + +quant_config = HqqConfig(nbits=4, group_size=64) +quantized_model = AutoModelForCausalLM.from_pretrained(save_dir_or_hfhub, device_map=device_map, quantization_config=quant_config) +peft_config = LoraConfig(...) +quantized_model = get_peft_model(quantized_model, peft_config) +``` + +## torchao (PyTorch Architecture Optimization) + +PEFT supports models quantized with [torchao](https://github.com/pytorch/ao) ("ao") for int8 quantization. + +```python +from peft import LoraConfig, get_peft_model +from transformers import AutoModelForCausalLM, TorchAoConfig + +model_id = ... +quantization_config = TorchAoConfig(quant_type="int8_weight_only") +base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config) +peft_config = LoraConfig(...) +model = get_peft_model(base_model, peft_config) +``` + +### Caveats: + +- Use the most recent versions of torchao (>= v0.4.0) and transformers (> 4.42). +- Only linear layers are currently supported. +- `quant_type = "int4_weight_only"` is currently not supported. +- `NF4` is not implemented in transformers as of yet and is thus also not supported. +- DoRA only works with `quant_type = "int8_weight_only"` at the moment. +- There is explicit support for torchao when used with LoRA. However, when torchao quantizes a layer, its class does not change, only the type of the underlying tensor. For this reason, PEFT methods other than LoRA will generally also work with torchao, even if not explicitly supported. Be aware, however, that **merging only works correctly with LoRA and with `quant_type = "int8_weight_only"`**.
If you use a different PEFT method or dtype, merging will likely result in an error, and even if it doesn't, the results will still be incorrect. + +## INC quantization + +Intel Neural Compressor ([INC](https://github.com/intel/neural-compressor)) enables model quantization for various devices, +including Intel Gaudi accelerators (also known as HPU devices). You can perform LoRA fine-tuning on models that have been +quantized using INC. To use INC with PyTorch models, install the library with: `pip install neural-compressor[pt]`. +Quantizing a model to FP8 precision for HPU devices can be done with the following single-step quantization workflow: + +```python +import torch +from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare +quant_configs = { + ... +} +config = FP8Config(**quant_configs) +``` + +Pass the config to the `prepare` method, run inference to gather calibration stats, and call the `finalize_calibration` +and `convert` methods to quantize the model to FP8 precision: + +```python +model = prepare(model, config) +# Run inference to collect calibration statistics +... +# Finalize calibration and convert the model to FP8 precision +finalize_calibration(model) +model = convert(model) +# Load PEFT LoRA adapter as usual +... +``` + +An example demonstrating how to load a PEFT LoRA adapter into an INC-quantized FLUX text-to-image model for HPU +devices is provided [here](https://github.com/huggingface/peft/blob/main/examples/stable_diffusion/inc_flux_lora_hpu.py). + + +### Caveats: + +- `merge()` and `unmerge()` methods are currently not supported for INC-quantized models. +- Currently, only **Linear** INC-quantized layers are supported when loading PEFT adapters.
+ +## Other Supported PEFT Methods + +Besides LoRA, the following PEFT methods also support quantization: + +- **VeRA** (supports bitsandbytes quantization) +- **AdaLoRA** (supports both bitsandbytes and GPTQ quantization) +- **(IA)³** (supports bitsandbytes quantization) + +## Next steps + +If you're interested in learning more about quantization, the following may be helpful: + +* Learn more details about QLoRA and check out some benchmarks on its impact in the [Making LLMs even more accessible with bitsandbytes, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes) blog post. +* Read more about different quantization schemes in the Transformers [Quantization](https://hf.co/docs/transformers/main/quantization) guide. diff --git a/peft/docs/source/developer_guides/torch_compile.md b/peft/docs/source/developer_guides/torch_compile.md new file mode 100644 index 0000000000000000000000000000000000000000..35d7f7cfe8d8c05685a7da2a75e8720308449ff5 --- /dev/null +++ b/peft/docs/source/developer_guides/torch_compile.md @@ -0,0 +1,71 @@ + + +# torch.compile + +In PEFT, [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) works for some but not all features. The reason why it won't always work is because PEFT is highly dynamic in certain places (loading and switching between multiple adapters, for instance), which can cause trouble for `torch.compile`. In other places, `torch.compile` may work, but won't be as fast as expected because of graph breaks. + +If you don't see an error, it doesn't necessarily mean that `torch.compile` worked correctly. It might give you an output, but the output is incorrect. This guide describes what works with `torch.compile` and what doesn't. For your own testing, we recommend using the latest PyTorch version, as `torch.compile` is constantly being improved. + +> [!TIP] +> Unless indicated otherwise, the default `torch.compile` settings were used. 
+ +## Training and inference with `torch.compile` + +These features **work** with `torch.compile`. Everything listed below was tested with a causal LM: + +- Training with `Trainer` from 🤗 transformers +- Training with a custom PyTorch loop +- Inference +- Generation + +The following adapters were tested successfully: + +- AdaLoRA +- BOFT +- Bone +- IA³ +- Layer Norm Tuning +- LoHa +- LoKr +- LoRA +- LoRA + DoRA +- LoRA applied to embedding layers +- OFT +- VeRA +- HRA + +## Advanced PEFT features with `torch.compile` + +Below are some of the more advanced PEFT features that **work**. They were all tested with LoRA. + +- `modules_to_save` (i.e. `config = LoraConfig(..., modules_to_save=...)`) +- Merging adapters (one or multiple) +- Merging multiple adapters into one adapter (i.e. calling `model.add_weighted_adapter(...)`) +- Using PEFT adapters with quantization (bitsandbytes) +- Disabling adapters (i.e. using `with model.disable_adapter()`) +- Unloading (i.e. calling `model.merge_and_unload()`) +- Mixed adapter batches (i.e. calling `model(batch, adapter_names=["__base__", "default", "other", ...])`) +- Inference with multiple adapters (i.e. using `model.add_adapter` or `model.load_adapter` to load more than 1 adapter); for this, only call `torch.compile` _after_ loading all adapters + +Generally, we can expect that if a feature works correctly with LoRA and is also supported by other adapter types, it should also work for that adapter type. + +## Test cases + +All the use cases listed above are tested inside of [`peft/tests/test_torch_compile.py`](https://github.com/huggingface/peft/blob/main/tests/test_torch_compile.py). If you want to check in more detail how we tested a certain feature, please go to that file and check the test that corresponds to your use case. 
+ +> [!TIP] +> If you have another use case where you know that `torch.compile` does or does not work with PEFT, please contribute by letting us know or by opening a PR to add this use case to the covered test cases. diff --git a/peft/docs/source/developer_guides/troubleshooting.md b/peft/docs/source/developer_guides/troubleshooting.md new file mode 100644 index 0000000000000000000000000000000000000000..b69870a11be3581d7d4a0cd4cc11a1595a9694bc --- /dev/null +++ b/peft/docs/source/developer_guides/troubleshooting.md @@ -0,0 +1,458 @@ + + +# Troubleshooting + +If you encounter any issue when using PEFT, please check the following list of common issues and their solutions. + +## Examples don't work + +Examples often rely on the most recent package versions, so please ensure they're up-to-date. In particular, check the following package versions: + +- `peft` +- `transformers` +- `accelerate` +- `torch` + +In general, you can update the package version by running this command inside your Python environment: + +```bash +python -m pip install -U +``` + +Installing PEFT from source is useful for keeping up with the latest developments: + +```bash +python -m pip install git+https://github.com/huggingface/peft +``` + +## Dtype-related issues + +### ValueError: Attempting to unscale FP16 gradients + +This error probably occurred because the model was loaded with `torch_dtype=torch.float16` and then used in an automatic mixed precision (AMP) context, e.g. by setting `fp16=True` in the [`~transformers.Trainer`] class from 🤗 Transformers. The reason is that when using AMP, trainable weights should never use fp16. To make this work without loading the whole model in fp32, add the following to your code: + +```python +peft_model = get_peft_model(...) + +# add this: +for param in model.parameters(): + if param.requires_grad: + param.data = param.data.float() + +# proceed as usual +trainer = Trainer(model=peft_model, fp16=True, ...) 
+trainer.train() +``` + +Alternatively, you can use the [`~utils.cast_mixed_precision_params`] function to correctly cast the weights: + +```python +from peft import cast_mixed_precision_params + +peft_model = get_peft_model(...) +cast_mixed_precision_params(peft_model, dtype=torch.float16) + +# proceed as usual +trainer = Trainer(model=peft_model, fp16=True, ...) +trainer.train() +``` + +> [!TIP] +> Starting from PEFT version v0.12.0, PEFT automatically promotes the dtype of adapter weights from `torch.float16` and `torch.bfloat16` to `torch.float32` where appropriate. To _prevent_ this behavior, you can pass `autocast_adapter_dtype=False` to [`~get_peft_model`], to [`~PeftModel.from_pretrained`], and to [`~PeftModel.load_adapter`]. + +### Selecting the dtype of the adapter + +Most PEFT methods, like LoRA, work by adding trainable adapter weights. By default, those weights are stored in float32 dtype (fp32), i.e. at a relatively high precision. Therefore, even if the base model is loaded in float16 (fp16) or bfloat16 (bf16), the adapter weights are float32. When the adapter results are calculated during the forward pass, the input will typically be in the dtype of the base model, thus it will be upcast to float32 if necessary, then cast back to the original dtype. + +If you prefer to have the adapter weights in the lower precision of the base model, i.e. in float16 or bfloat16, you can pass `autocast_adapter_dtype=False` when creating the model ([`~get_peft_model`]) or loading the model ([`~PeftModel.from_pretrained`]). 
There are some advantages and disadvantages to this: + +Advantages of half precision adapter: +- computation slightly faster +- slightly less memory +- smaller file size of checkpoint (half the size) + +Disadvantages of half precision adapter: +- slightly worse loss +- higher risk of overflow or underflow + +Note that for most use cases, overall runtime and memory cost will be determined by the size of the base model and by the dataset, while the dtype of the PEFT adapter will only have a small impact. + +## Bad results from a loaded PEFT model + +There can be several reasons for getting a poor result from a loaded PEFT model which are listed below. If you're still unable to troubleshoot the problem, see if anyone else had a similar [issue](https://github.com/huggingface/peft/issues) on GitHub, and if you can't find any, open a new issue. + +When opening an issue, it helps a lot if you provide a minimal code example that reproduces the issue. Also, please report if the loaded model performs at the same level as the model did before fine-tuning, if it performs at a random level, or if it is only slightly worse than expected. This information helps us identify the problem more quickly. + +### Random deviations + +If your model outputs are not exactly the same as previous runs, there could be an issue with random elements. For example: + +1. please ensure it is in `.eval()` mode, which is important, for instance, if the model uses dropout +2. if you use [`~transformers.GenerationMixin.generate`] on a language model, there could be random sampling, so obtaining the same result requires setting a random seed +3. if you used quantization and merged the weights, small deviations are expected due to rounding errors + +### Incorrectly loaded model + +Please ensure that you load the model correctly. A common error is trying to load a _trained_ model with [`get_peft_model`] which is incorrect. 
Instead, the loading code should look like this: + +```python +from peft import PeftModel, PeftConfig + +base_model = ... # to load the base model, use the same code as when you trained it +config = PeftConfig.from_pretrained(peft_model_id) +peft_model = PeftModel.from_pretrained(base_model, peft_model_id) +``` + +### Randomly initialized layers + +For some tasks, it is important to correctly configure `modules_to_save` in the config to account for randomly initialized layers. + +As an example, this is necessary if you use LoRA to fine-tune a language model for sequence classification because 🤗 Transformers adds a randomly initialized classification head on top of the model. If you do not add this layer to `modules_to_save`, the classification head won't be saved. The next time you load the model, you'll get a _different_ randomly initialized classification head, resulting in completely different results. + +PEFT tries to correctly guess the `modules_to_save` if you provide the `task_type` argument in the config. This should work for transformers models that follow the standard naming scheme. It is always a good idea to double check though because we can't guarantee all models follow the naming scheme. + +When you load a transformers model that has randomly initialized layers, you should see a warning along the lines of: + +``` +Some weights of <MODEL> were not initialized from the model checkpoint at <ID> and are newly initialized: [<LAYER_NAMES>]. +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +``` + +The mentioned layers should be added to `modules_to_save` in the config to avoid the described problem. + +> [!TIP] +> As an example, when loading a model that is using the DeBERTa architecture for sequence classification, you'll see a warning that the following weights are newly initialized: `['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']`.
From this, it follows that the `classifier` and `pooler` layers should be added to: `modules_to_save=["classifier", "pooler"]`. + +### Extending the vocabulary + +For many language fine-tuning tasks, extending the model's vocabulary is necessary since new tokens are being introduced. This requires extending the embedding layer to account for the new tokens and, depending on the fine-tuning method, also storing the embedding layer in addition to the adapter weights when saving the adapter. There are a few ways of achieving this ordered by parameter effectiveness: + +- [trainable tokens](../package_reference/trainable_tokens), train only the specified tokens, optionally store only the updated values +- training an adapter on the embedding matrix, optionally store only the updated values +- full-finetuning of the embedding layer + +#### Using trainable tokens + +Let's start with trainable tokens, in this case its [LoRA integration](../developer_guides/lora#efficiently-train-tokens-alongside-lora). If you're interested in only training the new embeddings and nothing else, refer to the [standalone documentation](../package_reference/trainable_tokens). + +To enable selective token training of the embedding layer, you'll need to supply the token ids of your newly added tokens via the `trainable_token_indices` parameter. Optionally you can specify which layer to target if there is more than one embedding layer. For a Mistral model this could look like this: + +```python +new_tokens = ['', ''] +tokenizer.add_tokens(new_tokens) +base_model.resize_token_embeddings(len(tokenizer)) + +lora_config = LoraConfig( + ..., + trainable_token_indices={'embed_tokens': tokenizer.convert_tokens_to_ids(new_tokens)}, +) +``` + +If your model uses tied weights (such as the `lm_head`), trainable tokens will try to resolve those and keep them updated as well, so in that case there should be no need for adding `modules_to_save=["lm_head"]`. 
This only works if the model uses the Transformers convention for tying weights. + +Saving the model with `model.save_pretrained` may save the full embedding matrix instead of +only the difference as a precaution because the embedding matrix was resized. To save space you can disable this behavior by setting `save_embedding_layers=False` when calling `save_pretrained`. This is safe to do as long as you don't modify the embedding matrix through other means as well, as such changes will not be tracked by trainable tokens. + +#### Using an adapter, e.g. LoRA + +Prepare the embedding layer by adding it to the `target_modules` of your adapter config. For example, the Mistral config could look like this: + +```python +config = LoraConfig(..., target_modules=["embed_tokens", "lm_head", "q_proj", "v_proj"]) +``` + +Once added to `target_modules`, PEFT automatically stores the embedding layer when saving the adapter if the model has the [`~transformers.PreTrainedModel.get_input_embeddings`] and [`~transformers.PreTrainedModel.get_output_embeddings`] methods. This is generally the case for Transformers models. + +If the model's embedding layer doesn't follow the Transformers naming scheme but nevertheless implements `get_input_embeddings`, you can still save it by manually passing `save_embedding_layers=True` when saving the adapter: + +```python +model = get_peft_model(...) +# train the model +model.save_pretrained("my_adapter", save_embedding_layers=True) +``` + +For inference, load the base model first and resize it the same way you did before you trained the model. After you've resized the base model, you can load the PEFT checkpoint. + +For a complete example, please check out [this notebook](https://github.com/huggingface/peft/blob/main/examples/causal_language_modeling/peft_lora_clm_with_additional_tokens.ipynb).
+ +#### Full fine-tuning + +Full fine-tuning is more costly in terms of VRAM or storage space but if all else fails, you can fall back to this and see if it works for you. Achieve it by adding the name of the embedding layer to `modules_to_save`. Note that you need to add tied layers as well, e.g. `lm_head`. Example for a Mistral model with LoRA: + +```python +config = LoraConfig(..., modules_to_save=["embed_tokens", "lm_head"], target_modules=["q_proj", "v_proj"]) +``` + +### Getting a warning about "weights not being initialized from the model checkpoint" + +When you load your PEFT model which has been trained on a task (for example, classification), you may get a warning like: + +> Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']. You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. + +Although this looks scary, it is most likely nothing to worry about. This warning comes from Transformers, and it isn't a PEFT specific warning. It lets you know that a randomly initialized classification head (`score`) is attached to the base model, and the head must be trained to produce sensible predictions. + +When you get this warning _before_ training the model, PEFT automatically takes care of making the classification head trainable if you correctly passed the `task_type` argument to the PEFT config. + +```python +from peft import LoraConfig, TaskType + +lora_config = LoraConfig(..., task_type=TaskType.SEQ_CLS) +``` + +If your classification head does not follow the usual naming conventions from Transformers (which is rare), you have to explicitly tell PEFT the name of the head in `modules_to_save`. + +```python +lora_config = LoraConfig(..., modules_to_save=["name-of-classification-head"]) +``` + +To check the name of the classification head, print the model and it should be the last module. 
+ +If you get this warning from your inference code, i.e. _after_ training the model, when you load the PEFT model, you always have to load the Transformers model first. Since Transformers does not know that you will load PEFT weights afterwards, it still gives the warning. + +As always, it is best practice to ensure the model works correctly for inference by running some validation on it. + +### Check layer and model status + +Sometimes a PEFT model can end up in a bad state, especially when handling multiple adapters. There can be some confusion around what adapters exist, which one is active, which one is merged, etc. To help investigate this issue, call the [`~peft.PeftModel.get_layer_status`] and the [`~peft.PeftModel.get_model_status`] methods. + +The [`~peft.PeftModel.get_layer_status`] method gives you a detailed overview of each targeted layer's active, merged, and available adapters. + +```python +>>> from transformers import AutoModel +>>> from peft import get_peft_model, LoraConfig + +>>> model_id = "google/flan-t5-small" +>>> model = AutoModel.from_pretrained(model_id) +>>> model = get_peft_model(model, LoraConfig()) + +>>> model.get_layer_status() +[TunerLayerStatus(name='model.encoder.block.0.layer.0.SelfAttention.q', + module_type='lora.Linear', + enabled=True, + active_adapters=['default'], + merged_adapters=[], + requires_grad={'default': True}, + available_adapters=['default']), + TunerLayerStatus(name='model.encoder.block.0.layer.0.SelfAttention.v', + module_type='lora.Linear', + enabled=True, + active_adapters=['default'], + merged_adapters=[], + requires_grad={'default': True}, + available_adapters=['default']), +...] 
+ +>>> model.get_model_status() +TunerModelStatus( + base_model_type='T5Model', + adapter_model_type='LoraModel', + peft_types={'default': 'LORA'}, + trainable_params=344064, + total_params=60855680, + num_adapter_layers=48, + enabled=True, + active_adapters=['default'], + merged_adapters=[], + requires_grad={'default': True}, + available_adapters=['default'], +) +``` + +In the model state output, you should look out for entries that say `"irregular"`. This means PEFT detected an inconsistent state in the model. For instance, if `merged_adapters="irregular"`, it means that for at least one adapter, it was merged on some target modules but not on others. The inference results will most likely be incorrect as a result. + +The best way to resolve this issue is to reload the whole model and adapter checkpoint(s). Ensure that you don't perform any incorrect operations on the model, e.g. manually merging adapters on some modules but not others. + +Convert the layer status into a pandas `DataFrame` for an easier visual inspection. + +```python +from dataclasses import asdict +import pandas as pd + +df = pd.DataFrame(asdict(layer) for layer in model.get_layer_status()) +``` + +It is possible to get this information for non-PEFT models if they are using PEFT layers under the hood, but some information like the `base_model_type` or the `peft_types` cannot be determined in that case. 
As an example, you can call this on a [diffusers](https://huggingface.co/docs/diffusers/index) model like so: + +```python +>>> import torch +>>> from diffusers import StableDiffusionPipeline +>>> from peft import get_model_status, get_layer_status + +>>> path = "runwayml/stable-diffusion-v1-5" +>>> lora_id = "takuma104/lora-test-text-encoder-lora-target" +>>> pipe = StableDiffusionPipeline.from_pretrained(path, torch_dtype=torch.float16) +>>> pipe.load_lora_weights(lora_id, adapter_name="adapter-1") +>>> pipe.load_lora_weights(lora_id, adapter_name="adapter-2") +>>> pipe.set_lora_device(["adapter-2"], "cuda") +>>> get_layer_status(pipe.text_encoder) +[TunerLayerStatus(name='text_model.encoder.layers.0.self_attn.k_proj', + module_type='lora.Linear', + enabled=True, + active_adapters=['adapter-2'], + merged_adapters=[], + requires_grad={'adapter-1': False, 'adapter-2': True}, + available_adapters=['adapter-1', 'adapter-2'], + devices={'adapter-1': ['cpu'], 'adapter-2': ['cuda']}), + TunerLayerStatus(name='text_model.encoder.layers.0.self_attn.v_proj', + module_type='lora.Linear', + enabled=True, + active_adapters=['adapter-2'], + merged_adapters=[], + requires_grad={'adapter-1': False, 'adapter-2': True}, + devices={'adapter-1': ['cpu'], 'adapter-2': ['cuda']}), +...] + +>>> get_model_status(pipe.unet) +TunerModelStatus( + base_model_type='other', + adapter_model_type='None', + peft_types={}, + trainable_params=797184, + total_params=861115332, + num_adapter_layers=128, + enabled=True, + active_adapters=['adapter-2'], + merged_adapters=[], + requires_grad={'adapter-1': False, 'adapter-2': True}, + available_adapters=['adapter-1', 'adapter-2'], + devices={'adapter-1': ['cpu'], 'adapter-2': ['cuda']}, +) +``` + +## Speed + +### Loading adapter weights is slow + +Loading adapters like LoRA weights should generally be fast compared to loading the base model. 
However, there can be use cases where the adapter weights are quite large or where users need to load a large number of adapters -- the loading time can add up in this case. The reason for this is that the adapter weights are first initialized and then overridden by the loaded weights, which is wasteful. To speed up the loading time, you can pass the `low_cpu_mem_usage=True` argument to [`~PeftModel.from_pretrained`] and [`~PeftModel.load_adapter`]. + +> [!TIP] +> If this option works well across different use cases, it may become the default for adapter loading in the future. + + +## Reproducibility + +### Models using batch norm + +When loading a trained PEFT model where the base model uses batch norm (e.g. `torch.nn.BatchNorm1d` or `torch.nn.BatchNorm2d`), you may find that you cannot reproduce the exact same outputs. This is because the batch norm layers keep track of running stats during training, but these stats are not part of the PEFT checkpoint. Therefore, when you load the PEFT model, the running stats of the base model will be used (i.e. from before training with PEFT). + +Depending on your use case, this may not be a big deal. If, however, you need your outputs to be 100% reproducible, you can achieve this by adding the batch norm layers to `modules_to_save`. Below is an example of this using resnet and LoRA. Notice that we set `modules_to_save=["classifier", "normalization"]`. We need the `"classifier"` argument because our task is image classification, and we add the `"normalization"` argument to ensure that the batch norm layers are saved in the PEFT checkpoint. 
+ +```python +from transformers import AutoModelForImageClassification +from peft import LoraConfig, get_peft_model + +model_id = "microsoft/resnet-18" +base_model = AutoModelForImageClassification.from_pretrained(model_id) +config = LoraConfig( + target_modules=["convolution"], + modules_to_save=["classifier", "normalization"], +) +``` + +Depending on the type of model you use, the batch norm layers could have different names than `"normalization"`, so please ensure that the name matches your model architecture. + +## Version mismatch + +### Error while loading the config because of an unexpected keyword argument + +When you encounter an error like the one shown below, it means the adapter you're trying to load was trained with a more recent version of PEFT than the version you have installed on your system. + +``` +TypeError: LoraConfig.__init__() got an unexpected keyword argument +``` + +The best way to resolve this issue is to install the latest PEFT version: + +```sh +python -m pip install -U PEFT +``` + +If the adapter was trained from a source install of PEFT (an unreleased version of PEFT), then you also need to install PEFT from source. + +```sh +python -m pip install -U git+https://github.com/huggingface/peft.git +``` + +If it is not possible for you to upgrade PEFT, there is a workaround you can try. + +Assume the error message says that the unknown keyword argument is named `foobar`. Search inside the `adapter_config.json` of this PEFT adapter for the `foobar` entry and delete it from the file. Then save the file and try loading the model again. + +This solution works most of the time. As long as it is the default value for `foobar`, it can be ignored. However, when it is set to some other value, you will get incorrect results. Upgrading PEFT is the recommended solution. + +## Adapter handling + +### Using multiple adapters at the same time + +PEFT allows you to create more than one adapter on the same model. 
This can be useful in many situations. For example, for inference, you may want to serve two fine-tuned models from the same base model instead of loading the base model once for each fine-tuned model, which would cost more memory. Multiple adapters can also be activated at the same time. This way, the model may leverage the learnings from all those adapters at the same time. As an example, if you have a diffusion model, you may want to use one LoRA adapter to change the style and a different one to change the subject. + +Activating multiple adapters at the same time is generally possible on all PEFT methods (LoRA, LoHa, IA³, etc.) except for prompt learning methods (p-tuning, prefix tuning, etc.). The following example illustrates how to achieve this: + +```python +from transformers import AutoModelForCausalLM +from peft import PeftModel + +model_id = ... +base_model = AutoModelForCausalLM.from_pretrained(model_id) +model = PeftModel.from_pretrained(base_model, lora_path_0) # default adapter_name is 'default' +model.load_adapter(lora_path_1, adapter_name="other") +# the 'other' adapter was loaded but it's not active yet, so to activate both adapters: +model.base_model.set_adapter(["default", "other"]) +``` + +> [!TIP] +> In the example above, you can see that we need to call `model.base_model.set_adapter(["default", "other"])`. Why can we not call `model.set_adapter(["default", "other"])`? This is unfortunately not possible because, as explained earlier, some PEFT methods don't support activating more than one adapter at a time. + +It is also possible to train two adapters at the same time, but you should be careful to ensure that the weights of both adapters are known to the optimizer. Otherwise, only one adapter will receive updates. + +```python +from transformers import AutoModelForCausalLM +from peft import LoraConfig, get_peft_model + +model_id = ... +base_model = AutoModelForCausalLM.from_pretrained(model_id) +lora_config_0 = LoraConfig(...) 
+lora_config_1 = LoraConfig(...) +model = get_peft_model(base_model, lora_config_0) +model.add_adapter(adapter_name="other", peft_config=lora_config_1) +``` + +If we would now call: + +```python +from transformers import Trainer + +trainer = Trainer(model=model, ...) +trainer.train() +``` + +or + +```python +optimizer = torch.optim.AdamW([param for param in model.parameters() if param.requires_grad], ...) +``` + +then the second LoRA adapter (`"other"`) would not be trained. This is because it is inactive at this moment, which means the `requires_grad` attribute on its parameters is set to `False` and the optimizer will ignore it. Therefore, make sure to activate all adapters that should be trained _before_ initializing the optimizer: + +```python +# activate all adapters +model.base_model.set_adapter(["default", "other"]) +trainer = Trainer(model=model, ...) +trainer.train() +``` + +> [!TIP] +> This section deals with using multiple adapters _of the same type_ on the same model, for example, using multiple LoRA adapters at the same time. It does not apply to using _different types_ of adapters on the same model, for example one LoRA adapter and one LoHa adapter. For this, please check [`PeftMixedModel`](https://huggingface.co/docs/peft/developer_guides/mixed_models). diff --git a/peft/docs/source/index.md b/peft/docs/source/index.md new file mode 100644 index 0000000000000000000000000000000000000000..d38544311f22cb8168e61d40216bd6e882837781 --- /dev/null +++ b/peft/docs/source/index.md @@ -0,0 +1,49 @@ + + +# PEFT + +🤗 PEFT (Parameter-Efficient Fine-Tuning) is a library for efficiently adapting large pretrained models to various downstream applications without fine-tuning all of a model's parameters because it is prohibitively costly. PEFT methods only fine-tune a small number of (extra) model parameters - significantly decreasing computational and storage costs - while yielding performance comparable to a fully fine-tuned model. 
This makes it more accessible to train and store large language models (LLMs) on consumer hardware. + +PEFT is integrated with the Transformers, Diffusers, and Accelerate libraries to provide a faster and easier way to load, train, and use large models for inference. + + + + diff --git a/peft/docs/source/install.md b/peft/docs/source/install.md new file mode 100644 index 0000000000000000000000000000000000000000..49279bfa873e41cc812dc7eaf636cc8751d14ceb --- /dev/null +++ b/peft/docs/source/install.md @@ -0,0 +1,47 @@ + + +# Installation + +Before you start, you will need to set up your environment, install the appropriate packages, and configure 🤗 PEFT. 🤗 PEFT is tested on **Python 3.9+**. + +🤗 PEFT is available on PyPI, as well as GitHub: + +## PyPI + +To install 🤗 PEFT from PyPI: + +```bash +pip install peft +``` + +## Source + +New features that haven't been released yet are added every day, which also means there may be some bugs. To try them out, install from the GitHub repository: + +```bash +pip install git+https://github.com/huggingface/peft +``` + +If you're working on contributing to the library or wish to play with the source code and see live +results as you run the code, an editable version can be installed from a locally-cloned version of the +repository: + +```bash +git clone https://github.com/huggingface/peft +cd peft +pip install -e .[test] +``` diff --git a/peft/docs/source/package_reference/adalora.md b/peft/docs/source/package_reference/adalora.md new file mode 100644 index 0000000000000000000000000000000000000000..9cc51d0e0911a0d7afdd5e486ce1fd96316dde04 --- /dev/null +++ b/peft/docs/source/package_reference/adalora.md @@ -0,0 +1,31 @@ + + +# AdaLoRA + +[AdaLoRA](https://hf.co/papers/2303.10512) is a method for optimizing the number of trainable parameters to assign to weight matrices and layers, unlike LoRA, which distributes parameters evenly across all modules. 
More parameters are budgeted for important weight matrices and layers while less important ones receive fewer parameters. + +The abstract from the paper is: + +*Fine-tuning large pre-trained language models on downstream tasks has become an important paradigm in NLP. However, common practice fine-tunes all of the parameters in a pre-trained model, which becomes prohibitive when a large number of downstream tasks are present. Therefore, many fine-tuning methods are proposed to learn incremental updates of pre-trained weights in a parameter efficient way, e.g., low-rank increments. These methods often evenly distribute the budget of incremental updates across all pre-trained weight matrices, and overlook the varying importance of different weight parameters. As a consequence, the fine-tuning performance is suboptimal. To bridge this gap, we propose AdaLoRA, which adaptively allocates the parameter budget among weight matrices according to their importance score. In particular, AdaLoRA parameterizes the incremental updates in the form of singular value decomposition. Such a novel approach allows us to effectively prune the singular values of unimportant updates, which is essentially to reduce their parameter budget but circumvent intensive exact SVD computations. We conduct extensive experiments with several pre-trained models on natural language processing, question answering, and natural language generation to validate the effectiveness of AdaLoRA. Results demonstrate that AdaLoRA manifests notable improvement over baselines, especially in the low budget settings. Our code is publicly available at https://github.com/QingruZhang/AdaLoRA*. 
+ +## AdaLoraConfig + +[[autodoc]] tuners.adalora.config.AdaLoraConfig + +## AdaLoraModel + +[[autodoc]] tuners.adalora.model.AdaLoraModel \ No newline at end of file diff --git a/peft/docs/source/package_reference/adapter_utils.md b/peft/docs/source/package_reference/adapter_utils.md new file mode 100644 index 0000000000000000000000000000000000000000..8f8b4e6c7f713dc9152864def4b9d94f6b5dd40f --- /dev/null +++ b/peft/docs/source/package_reference/adapter_utils.md @@ -0,0 +1,31 @@ + + +# LyCORIS + +[LyCORIS](https://hf.co/papers/2309.14859) (Lora beYond Conventional methods, Other Rank adaptation Implementations for Stable diffusion) are LoRA-like matrix decomposition adapters that modify the cross-attention layer of the UNet. The [LoHa](loha) and [LoKr](lokr) methods inherit from the `Lycoris` classes here. + +## LycorisConfig + +[[autodoc]] tuners.lycoris_utils.LycorisConfig + +## LycorisLayer + +[[autodoc]] tuners.lycoris_utils.LycorisLayer + +## LycorisTuner + +[[autodoc]] tuners.lycoris_utils.LycorisTuner \ No newline at end of file diff --git a/peft/docs/source/package_reference/auto_class.md b/peft/docs/source/package_reference/auto_class.md new file mode 100644 index 0000000000000000000000000000000000000000..c1b78a2c34235844c73a6c5590bd53951c1b09d2 --- /dev/null +++ b/peft/docs/source/package_reference/auto_class.md @@ -0,0 +1,48 @@ + + +# AutoPeftModels + +The `AutoPeftModel` classes load the appropriate PEFT model for the task type by automatically inferring it from the configuration file. They are designed to quickly and easily load a PEFT model in a single line of code without having to worry about which exact model class you need or manually loading a [`PeftConfig`]. 
+ +## AutoPeftModel + +[[autodoc]] auto.AutoPeftModel + - from_pretrained + +## AutoPeftModelForCausalLM + +[[autodoc]] auto.AutoPeftModelForCausalLM + +## AutoPeftModelForSeq2SeqLM + +[[autodoc]] auto.AutoPeftModelForSeq2SeqLM + +## AutoPeftModelForSequenceClassification + +[[autodoc]] auto.AutoPeftModelForSequenceClassification + +## AutoPeftModelForTokenClassification + +[[autodoc]] auto.AutoPeftModelForTokenClassification + +## AutoPeftModelForQuestionAnswering + +[[autodoc]] auto.AutoPeftModelForQuestionAnswering + +## AutoPeftModelForFeatureExtraction + +[[autodoc]] auto.AutoPeftModelForFeatureExtraction diff --git a/peft/docs/source/package_reference/boft.md b/peft/docs/source/package_reference/boft.md new file mode 100644 index 0000000000000000000000000000000000000000..48231fa9fdc16699295b48357cb6d837e26e4c84 --- /dev/null +++ b/peft/docs/source/package_reference/boft.md @@ -0,0 +1,31 @@ + + +# BOFT + +[Orthogonal Butterfly (BOFT)](https://hf.co/papers/2311.06243) is a generic method designed for finetuning foundation models. It improves the parameter efficiency of the finetuning paradigm -- Orthogonal Finetuning (OFT), by taking inspiration from Cooley-Tukey fast Fourier transform, showing favorable results across finetuning different foundation models, including large vision transformers, large language models and text-to-image diffusion models. + +The abstract from the paper is: + +*Large foundation models are becoming ubiquitous, but training them from scratch is prohibitively expensive. Thus, efficiently adapting these powerful models to downstream tasks is increasingly important. In this paper, we study a principled finetuning paradigm -- Orthogonal Finetuning (OFT) -- for downstream task adaptation. Despite demonstrating good generalizability, OFT still uses a fairly large number of trainable parameters due to the high dimensionality of orthogonal matrices. 
To address this, we start by examining OFT from an information transmission perspective, and then identify a few key desiderata that enable better parameter-efficiency. Inspired by how the Cooley-Tukey fast Fourier transform algorithm enables efficient information transmission, we propose an efficient orthogonal parameterization using butterfly structures. We apply this parameterization to OFT, creating a novel parameter-efficient finetuning method, called Orthogonal Butterfly (BOFT). By subsuming OFT as a special case, BOFT introduces a generalized orthogonal finetuning framework. Finally, we conduct an extensive empirical study of adapting large vision transformers, large language models, and text-to-image diffusion models to various downstream tasks in vision and language*. + +## BOFTConfig + +[[autodoc]] tuners.boft.config.BOFTConfig + +## BOFTModel + +[[autodoc]] tuners.boft.model.BOFTModel diff --git a/peft/docs/source/package_reference/bone.md b/peft/docs/source/package_reference/bone.md new file mode 100644 index 0000000000000000000000000000000000000000..c8bad5d1aae2e9f7c8bc479a958957ce70e6b236 --- /dev/null +++ b/peft/docs/source/package_reference/bone.md @@ -0,0 +1,33 @@ + + +# Bone + +DiSHA: Dimension-Sharding Adaptation ([DiSHA](https://huggingface.co/papers/2409.15371)) We introduce Dimension-Sharding Adaptation (DiSHA), which expands the PEFT design space to unlock lower intrinsic ranks and faster convergence by default. Building on DiSHA, we propose an efficient algorithm called Block-Affine Adaptation (Bone) structure and a non-linear update method called Block Affine Transformation Adaptation (BAT). + + +The abstract from the paper is: + +Low-Rank Adaptation (LoRA) leverages the low intrinsic rank of weight updates in Large Language Models (LLMs), establishing a Parameter-Efficient Fine-Tuning (PEFT) paradigm. However, LoRA suffers from slow convergence. 
We introduce Dimension-Sharding Adaptation (DiSHA), which expands the PEFT design space to unlock lower intrinsic ranks and faster convergence by default. Within DiSHA's design space, we propose Block Affine Adaptation (Bone), a computationally efficient structure that delivers both high performance and efficiency. While certain DiSHA configurations may result in colinear updates to weight shards, we address this with Block Affine Transformation Adaptation (BAT), a nonlinear variant of DiSHA. BAT introduces nonlinearity by combining trainable matrices with original weight shards in a nonlinear manner, inducing nonlinearity in matrix updates without introducing additional parameters. Empirical results show that Bone, under the DiSHA framework, consistently outperforms LoRA variants in both NLG and NLU tasks, with significantly improved computational efficiency. Further analysis demonstrates that BAT enhances model capabilities by leveraging its nonlinear design. + + +## BoneConfig + +[[autodoc]] tuners.bone.config.BoneConfig + +## BoneModel + +[[autodoc]] tuners.bone.model.BoneModel \ No newline at end of file diff --git a/peft/docs/source/package_reference/c3a.md b/peft/docs/source/package_reference/c3a.md new file mode 100644 index 0000000000000000000000000000000000000000..ff8d511af6942aabfb7abc3c424af4aeb176c2d4 --- /dev/null +++ b/peft/docs/source/package_reference/c3a.md @@ -0,0 +1,43 @@ + + +# C3A: Parameter-Efficient Fine-Tuning via Circular Convolution + +[C3A](https://huggingface.co/papers/2407.19342) is a parameter-efficient fine-tuning technique that leverages Circular Convolution to achieve high rank adaptation within reasonable resource limits. + +Note that you should use a much larger learning rate (LR) for C3A than for other methods. For example, a LR of 1e-1 for C3A is a good starting point. Besides, a much smaller weight decay should be used. You can refer to the `method_comparison` folder for more details. 
+ +For the `block_size`, it affects tunable parameters and performance. To start with, you can choose a $\mathrm{gcd}(d_1,d_2)$ near $\frac{\sqrt{d_1\times d_2}}{r}$, where $r$ is the rank for LoRA you would use for this task. + +C3A currently has the following constraints: + +- Only `nn.Linear` layers are supported. +- Quantized layers are not supported. +- The block size should be a common divisor of both the input and output sizes of target layers. + +If these constraints don't work for your use case, consider other methods instead. + +The abstract from the paper is: + +> Low-Rank Adaptation (LoRA) has gained popularity for fine-tuning large foundation models, leveraging low-rank matrices $\mathbf{A}$ and $\mathbf{B}$ to represent weight changes (i.e., $\Delta \mathbf{W} = \mathbf{B} \mathbf{A}$). This method reduces trainable parameters and mitigates heavy memory consumption associated with full delta matrices by sequentially multiplying $\mathbf{A}$ and $\mathbf{B}$ with the activation. Despite its success, the intrinsic low-rank characteristic may limit its performance. Although several variants have been proposed to address this issue, they often overlook the crucial computational and memory efficiency brought by LoRA. In this paper, we propose Circular Convolution Adaptation (C3A), which not only achieves high-rank adaptation with enhanced performance but also excels in both computational power and memory utilization. Extensive experiments demonstrate that C3A consistently outperforms LoRA and its variants across various fine-tuning tasks. 
+ +## C3AConfig + +[[autodoc]] tuners.c3a.config.C3AConfig + +## C3AModel + +[[autodoc]] tuners.c3a.model.C3AModel diff --git a/peft/docs/source/package_reference/config.md b/peft/docs/source/package_reference/config.md new file mode 100644 index 0000000000000000000000000000000000000000..9a4f755e1cd1af7c7871df11cd89100944e355cb --- /dev/null +++ b/peft/docs/source/package_reference/config.md @@ -0,0 +1,22 @@ + + +# Configuration + +[`PeftConfigMixin`] is the base configuration class for storing the adapter configuration of a [`PeftModel`], and [`PromptLearningConfig`] is the base configuration class for soft prompt methods (p-tuning, prefix tuning, and prompt tuning). These base classes contain methods for saving and loading model configurations from the Hub, specifying the PEFT method to use, type of task to perform, and model configurations like number of layers and number of attention heads. + +## PeftConfigMixin + +[[autodoc]] config.PeftConfigMixin + - all + +## PeftConfig + +[[autodoc]] PeftConfig + - all + +## PromptLearningConfig + +[[autodoc]] PromptLearningConfig + - all diff --git a/peft/docs/source/package_reference/cpt.md b/peft/docs/source/package_reference/cpt.md new file mode 100644 index 0000000000000000000000000000000000000000..9e67fd7c37b7c344cb209fde834b268148720de7 --- /dev/null +++ b/peft/docs/source/package_reference/cpt.md @@ -0,0 +1,34 @@ + + +# Context-aware Prompt Tuning: Advancing In-Context Learning with Adversarial Methods + +[CPT](https://huggingface.co/papers/2410.17222) combines In-Context Learning (ICL), Prompt Tuning (PT), and adversarial optimization to improve few-shot learning by refining context embeddings. CPT updates the context tokens by optimizing both the context and the training examples, encapsulating them into a novel loss design that minimizes overfitting, enables more effective optimization, and drives significant improvements in classification tasks. 
+ +[//]: # ([CPT](https://huggingface.co/papers/2410.17222) for the paper) + +The abstract from the paper is: + +> Large Language Models (LLMs) can perform few-shot learning using either optimization-based approaches or In-Context Learning (ICL). Optimization-based methods often suffer from overfitting, as they require updating a large number of parameters with limited data. In contrast, ICL avoids overfitting but typically underperforms compared to optimization-based methods and is highly sensitive to the selection, order, and format of demonstration examples. To overcome these challenges, we introduce Context-aware Prompt Tuning (CPT), a method inspired by ICL, Prompt Tuning (PT), and adversarial attacks. CPT builds on the ICL strategy of concatenating examples before the input, extending it by incorporating PT-like learning to refine the context embedding through iterative optimization, extracting deeper insights from the training examples. Our approach carefully modifies specific context tokens, considering the unique structure of the examples within the context. In addition to updating the context with PT-like optimization, CPT draws inspiration from adversarial attacks, adjusting the input based on the labels present in the context while preserving the inherent value of the user-provided data. To ensure robustness and stability during optimization, we employ a projected gradient descent algorithm, constraining token embeddings to remain close to their original values and safeguarding the quality of the context. Our method has demonstrated superior accuracy across multiple classification tasks using various LLM models, outperforming existing baselines and effectively addressing the overfitting challenge in few-shot learning. + + +Take a look at [Example](https://github.com/huggingface/peft/blob/main/examples/cpt_finetuning/README.md) for a step-by-step guide on how to train a model with CPT. 
+ + +## CPTConfig + +[[autodoc]] tuners.cpt.config.CPTConfig + +## CPTEmbedding + +[[autodoc]] tuners.cpt.model.CPTEmbedding + diff --git a/peft/docs/source/package_reference/fourierft.md b/peft/docs/source/package_reference/fourierft.md new file mode 100644 index 0000000000000000000000000000000000000000..1d298a9042be45e188ae05fa6b51f97c44be0567 --- /dev/null +++ b/peft/docs/source/package_reference/fourierft.md @@ -0,0 +1,38 @@ + + +# FourierFT: Discrete Fourier Transformation Fine-Tuning + +[FourierFT](https://huggingface.co/papers/2405.03003) is a parameter-efficient fine-tuning technique that leverages the Discrete Fourier Transform to compress the model's tunable weights. This method outperforms LoRA in the GLUE benchmark and common ViT classification tasks using far fewer parameters. + +FourierFT currently has the following constraints: + +- Only `nn.Linear` layers are supported. +- Quantized layers are not supported. + +If these constraints don't work for your use case, consider other methods instead. + +The abstract from the paper is: + +> Low-rank adaptation (LoRA) has recently gained much interest in fine-tuning foundation models. It effectively reduces the number of trainable parameters by incorporating low-rank matrices $A$ and $B$ to represent the weight change, i.e., $\Delta W = BA$. Despite LoRA's progress, it faces storage challenges when handling extensive customization adaptations or larger base models. In this work, we aim to further compress trainable parameters by enjoying the powerful expressiveness of the Fourier transform. Specifically, we introduce FourierFT, which treats $\Delta W$ as a matrix in the spatial domain and learns only a small fraction of its spectral coefficients. With the trained spectral coefficients, we implement the inverse discrete Fourier transform to recover $\Delta W$. 
Empirically, our FourierFT method shows comparable or better performance with fewer parameters than LoRA on various tasks, including natural language understanding, natural language generation, instruction tuning, and image classification. For example, when performing instruction tuning on the LLaMA2-7B model, FourierFT surpasses LoRA with only 0.064M trainable parameters, compared to LoRA's 33.5M. + +## FourierFTConfig + +[[autodoc]] tuners.fourierft.config.FourierFTConfig + +## FourierFTModel + +[[autodoc]] tuners.fourierft.model.FourierFTModel diff --git a/peft/docs/source/package_reference/functional.md b/peft/docs/source/package_reference/functional.md new file mode 100644 index 0000000000000000000000000000000000000000..52251bd4905a3412ef12ad499c106ef9e75ec117 --- /dev/null +++ b/peft/docs/source/package_reference/functional.md @@ -0,0 +1,37 @@ + + +# Functions for PEFT integration + +A collection of functions that could be useful for non-PeftModel models, e.g. transformers or diffusers integration + +The functions provided here can be considered "public API" of PEFT and hence are safe to be used by packages that provide PEFT integrations. 
+ +## Cast the adapter weight dtypes +[[autodoc]] functional.cast_adapter_dtype + - all + +## Delete the PEFT adapter from model +[[autodoc]] functional.delete_adapter + - all + +## Get the state dict of the PEFT adapter +[[autodoc]] functional.get_peft_model_state_dict + - all + +## Inject a PEFT adapter into the model based on a PEFT config +[[autodoc]] functional.inject_adapter_in_model + - all + +## Set the active PEFT adapter(s) of the model +[[autodoc]] functional.set_adapter + - all + +## Set the `requires_grad` attribute of the specified adapters +[[autodoc]] functional.set_requires_grad + - all + +## Load the weights of the PEFT state dict into the model +[[autodoc]] functional.set_peft_model_state_dict + - all diff --git a/peft/docs/source/package_reference/helpers.md b/peft/docs/source/package_reference/helpers.md new file mode 100644 index 0000000000000000000000000000000000000000..83e129d6ea0477c0ff511de8331f07686cf67f45 --- /dev/null +++ b/peft/docs/source/package_reference/helpers.md @@ -0,0 +1,22 @@ + + +# Helper methods + +A collection of helper functions for PEFT. + +## Checking if a model is a PEFT model + +[[autodoc]] helpers.check_if_peft_model + - all + +## Temporarily Rescaling Adapter Scale in LoraLayer Modules + +[[autodoc]] helpers.rescale_adapter_scale + - all + +## Context manager to disable input dtype casting in the `forward` method of LoRA layers + +[[autodoc]] helpers.disable_input_dtype_casting + - all diff --git a/peft/docs/source/package_reference/hotswap.md b/peft/docs/source/package_reference/hotswap.md new file mode 100644 index 0000000000000000000000000000000000000000..9bae07bbfb31633b9f2e3627896346ad99fd42ed --- /dev/null +++ b/peft/docs/source/package_reference/hotswap.md @@ -0,0 +1,76 @@ + + +# Hotswapping adapters + +The idea of hotswapping an adapter is the following: We can already load multiple adapters, e.g. two LoRAs, at the same time. 
But sometimes, we want to load one LoRA and then replace its weights in-place with the LoRA weights of another adapter. This is now possible with the `hotswap_adapter` function. + +In general, this should be faster than deleting one adapter and loading the adapter in its place, which would be how to achieve the same final outcome without hotswapping. Another advantage of hotswapping is that it prevents re-compilation in case the PEFT model is already compiled using `torch.compile`. This can save quite a lot of time. + +## Example without `torch.compile` + +```python +import torch +from transformers import AutoModelForCausalLM +from peft import PeftModel +from peft.utils.hotswap import hotswap_adapter + +model_id = ... +inputs = ... +device = ... +model = AutoModelForCausalLM.from_pretrained(model_id).to(device) + +# load lora 0 +model = PeftModel.from_pretrained(model, <path-adapter-0>) +with torch.inference_mode(): + output_adapter_0 = model(inputs) + +# replace the "default" lora adapter with the new one +hotswap_adapter(model, <path-adapter-1>, adapter_name="default", torch_device=device) +with torch.inference_mode(): + output_adapter_1 = model(inputs).logits +``` + +## Example with `torch.compile` + +```python +import torch +from transformers import AutoModelForCausalLM +from peft import PeftModel +from peft.utils.hotswap import hotswap_adapter, prepare_model_for_compiled_hotswap + +model_id = ... +inputs = ... +device = ... +max_rank = ... # maximum rank among all LoRA adapters that will be used +model = AutoModelForCausalLM.from_pretrained(model_id).to(device) + +# load lora 0 +model = PeftModel.from_pretrained(model, <path-adapter-0>) +# Prepare the model to allow hotswapping even if ranks/scalings of 2nd adapter differ. +# You can skip this step if all ranks and scalings are identical. 
+prepare_model_for_compiled_hotswap(model, target_rank=max_rank) +model = torch.compile(model) +with torch.inference_mode(): + output_adapter_0 = model(inputs) + +# replace the "default" lora adapter with the new one +hotswap_adapter(model, <path-adapter-1>, adapter_name="default", torch_device=device) +with torch.inference_mode(): + output_adapter_1 = model(inputs).logits +``` + +## Caveats + +Hotswapping works with transformers models and diffusers models. However, there are some caveats: + +- Right now, only LoRA is properly supported. +- It only works for the same PEFT method, so no swapping LoRA and LoHa, for example. +- The adapter that is being swapped in must target the same layers as the previous adapter or a subset of those layers. It cannot target new layers. Therefore, if possible, start with the adapter that targets the most layers. + +[[autodoc]] utils.hotswap.hotswap_adapter + - all + +[[autodoc]] utils.hotswap.hotswap_adapter_from_state_dict + - all diff --git a/peft/docs/source/package_reference/hra.md b/peft/docs/source/package_reference/hra.md new file mode 100644 index 0000000000000000000000000000000000000000..fa499069b47d84904f61c00db555cee906578764 --- /dev/null +++ b/peft/docs/source/package_reference/hra.md @@ -0,0 +1,32 @@ + + +# Bridging The Gap between Low-rank and Orthogonal Adaptation via Householder Reflection Adaptation (HRA) + +[HRA](https://huggingface.co/papers/2405.17484) is a simple but effective adapter-based fine-tuning method that leverages Householder reflections. This method harnesses the advantages of both low-rank and orthogonal adaptation, reducing parameters and computation costs while penalizing the loss of pre-training knowledge. It consistently achieves better performance with fewer trainable parameters and outperforms state-of-the-art adapters across different models, including large language models (LLMs) and conditional image generators. 
+ + +The abstract from the paper is: + +> While following different technical routes, both low-rank and orthogonal adaptation techniques can efficiently adapt large-scale pre-training models in specific tasks or domains based on a small piece of trainable parameters. In this study, we bridge the gap between these two techniques, proposing a simple but effective adaptation method based on Householder reflections. Given a pre-trained model, our method fine-tunes its layers by multiplying each frozen weight matrix with an orthogonal matrix constructed by a chain of learnable Householder reflections (HRs). This HR-based orthogonal fine-tuning is equivalent to an adaptive low-rank adaptation. Moreover, we show that the orthogonality of the reflection planes corresponding to the HRs impacts the model capacity and regularity. The analysis motivates us to regularize the orthogonality of the HRs, leading to different implementations of the proposed Householder reflection adaptation (HRA) method. Compared with state-of-the-art methods, HRA achieves superior performance with fewer learnable parameters when adapting large language models and conditional image generators. The code is available at [peft](https://github.com/huggingface/peft/tree/main/src/peft/tuners/hra) and [HRA](https://github.com/DaShenZi721/HRA). 
+ +## HRAConfig + +[[autodoc]] tuners.hra.config.HRAConfig + +## HRAModel + +[[autodoc]] tuners.hra.model.HRAModel diff --git a/peft/docs/source/package_reference/ia3.md b/peft/docs/source/package_reference/ia3.md new file mode 100644 index 0000000000000000000000000000000000000000..3885fd9c6029923477d7ca07d5650f2dcb1abbb8 --- /dev/null +++ b/peft/docs/source/package_reference/ia3.md @@ -0,0 +1,31 @@ + + +# IA3 + +Infused Adapter by Inhibiting and Amplifying Inner Activations, or [IA3](https://hf.co/papers/2205.05638), is a method that adds three learned vectors to rescale the keys and values of the self-attention and encoder-decoder attention layers, and the intermediate activation of the position-wise feed-forward network. + +The abstract from the paper is: + +*Few-shot in-context learning (ICL) enables pre-trained language models to perform a previously-unseen task without any gradient-based training by feeding a small number of training examples as part of the input. ICL incurs substantial computational, memory, and storage costs because it involves processing all of the training examples every time a prediction is made. Parameter-efficient fine-tuning (PEFT) (e.g. adapter modules, prompt tuning, sparse update methods, etc.) offers an alternative paradigm where a small set of parameters are trained to enable a model to perform the new task. In this paper, we rigorously compare few-shot ICL and PEFT and demonstrate that the latter offers better accuracy as well as dramatically lower computational costs. Along the way, we introduce a new PEFT method called (IA)^3 that scales activations by learned vectors, attaining stronger performance while only introducing a relatively tiny amount of new parameters. We also propose a simple recipe based on the T0 model called T-Few that can be applied to new tasks without task-specific tuning or modifications. 
We validate the effectiveness of T-Few on completely unseen tasks by applying it to the RAFT benchmark, attaining super-human performance for the first time and outperforming the state-of-the-art by 6% absolute. All of the code used in our experiments is publicly available*. + +## IA3Config + +[[autodoc]] tuners.ia3.config.IA3Config + +## IA3Model + +[[autodoc]] tuners.ia3.model.IA3Model \ No newline at end of file diff --git a/peft/docs/source/package_reference/layernorm_tuning.md b/peft/docs/source/package_reference/layernorm_tuning.md new file mode 100644 index 0000000000000000000000000000000000000000..fc78870d3495b051a782ee3ba2f0e6f06a74e261 --- /dev/null +++ b/peft/docs/source/package_reference/layernorm_tuning.md @@ -0,0 +1,34 @@ + + +# LayerNorm Tuning + +LayerNorm Tuning ([LN Tuning](https://huggingface.co/papers/2312.11420)) is a PEFT method that only fine-tunes the parameters of the LayerNorm layers in a model. +The paper has tested the performance of this method on large language models and has shown that it can achieve strong performance with a significant reduction in the number of trainable parameters and GPU memory usage. +However, the method is not limited to language models and can be applied to any model that uses LayerNorm layers. +In this implementation, all LayerNorm layers inside a model are fine-tuned by default, but the method can also target other layer types such as `MLP` or `Attention` layers by specifying `target_modules` in the `LNTuningConfig`. + +The abstract from the paper is: + +*This paper introduces an efficient strategy to transform Large Language Models (LLMs) into Multi-Modal Large Language Models (MLLMs). By conceptualizing this transformation as a domain adaptation process, i.e., transitioning from text understanding to embracing multiple modalities, we intriguingly note that, within each attention block, tuning LayerNorm suffices to yield strong performance. 
Moreover, when benchmarked against other tuning approaches like full parameter finetuning or LoRA, its benefits on efficiency are substantial. For example, when compared to LoRA on a 13B model scale, performance can be enhanced by an average of over 20% across five multi-modal tasks, and meanwhile, results in a significant reduction of trainable parameters by 41.9% and a decrease in GPU memory usage by 17.6%. On top of this LayerNorm strategy, we showcase that selectively tuning only with conversational data can improve efficiency further. Beyond these empirical outcomes, we provide a comprehensive analysis to explore the role of LayerNorm in adapting LLMs to the multi-modal domain and improving the expressive power of the model.* + +## LNTuningConfig + +[[autodoc]] tuners.ln_tuning.config.LNTuningConfig + +## LNTuningModel + +[[autodoc]] tuners.ln_tuning.model.LNTuningModel \ No newline at end of file diff --git a/peft/docs/source/package_reference/llama_adapter.md b/peft/docs/source/package_reference/llama_adapter.md new file mode 100644 index 0000000000000000000000000000000000000000..52e6c537b200ab39c9300688eb90384645ead64a --- /dev/null +++ b/peft/docs/source/package_reference/llama_adapter.md @@ -0,0 +1,31 @@ + + +# Llama-Adapter + +[Llama-Adapter](https://hf.co/papers/2303.16199) is a PEFT method specifically designed for turning Llama into an instruction-following model. The Llama model is frozen and only a set of adaptation prompts prefixed to the input instruction tokens are learned. Since randomly initialized modules inserted into the model can cause the model to lose some of its existing knowledge, Llama-Adapter uses zero-initialized attention with zero gating to progressively add the instructional prompts to the model. + +The abstract from the paper is: + +*We present LLaMA-Adapter, a lightweight adaption method to efficiently fine-tune LLaMA into an instruction-following model. 
Using 52K self-instruct demonstrations, LLaMA-Adapter only introduces 1.2M learnable parameters upon the frozen LLaMA 7B model, and costs less than one hour for fine-tuning on 8 A100 GPUs. Specifically, we adopt a set of learnable adaption prompts, and prepend them to the input text tokens at higher transformer layers. Then, a zero-init attention mechanism with zero gating is proposed, which adaptively injects the new instructional cues into LLaMA, while effectively preserves its pre-trained knowledge. With efficient training, LLaMA-Adapter generates high-quality responses, comparable to Alpaca with fully fine-tuned 7B parameters. Furthermore, our approach can be simply extended to multi-modal input, e.g., images, for image-conditioned LLaMA, which achieves superior reasoning capacity on ScienceQA. We release our code at https://github.com/ZrrSkywalker/LLaMA-Adapter*. + +## AdaptionPromptConfig + +[[autodoc]] tuners.adaption_prompt.config.AdaptionPromptConfig + +## AdaptionPromptModel + +[[autodoc]] tuners.adaption_prompt.model.AdaptionPromptModel \ No newline at end of file diff --git a/peft/docs/source/package_reference/loha.md b/peft/docs/source/package_reference/loha.md new file mode 100644 index 0000000000000000000000000000000000000000..b4ca21ee14e3da2dcc76e700a5347bd70bd55e07 --- /dev/null +++ b/peft/docs/source/package_reference/loha.md @@ -0,0 +1,31 @@ + + +# LoHa + +Low-Rank Hadamard Product ([LoHa](https://huggingface.co/papers/2108.06098)), is similar to LoRA except it approximates the large weight matrix with more low-rank matrices and combines them with the Hadamard product. This method is even more parameter-efficient than LoRA and achieves comparable performance. + +The abstract from the paper is: + +*In this work, we propose a communication-efficient parameterization, FedPara, for federated learning (FL) to overcome the burdens on frequent model uploads and downloads. 
Our method re-parameterizes weight parameters of layers using low-rank weights followed by the Hadamard product. Compared to the conventional low-rank parameterization, our FedPara method is not restricted to low-rank constraints, and thereby it has a far larger capacity. This property enables to achieve comparable performance while requiring 3 to 10 times lower communication costs than the model with the original layers, which is not achievable by the traditional low-rank methods. The efficiency of our method can be further improved by combining with other efficient FL optimizers. In addition, we extend our method to a personalized FL application, pFedPara, which separates parameters into global and local ones. We show that pFedPara outperforms competing personalized FL methods with more than three times fewer parameters*. + +## LoHaConfig + +[[autodoc]] tuners.loha.config.LoHaConfig + +## LoHaModel + +[[autodoc]] tuners.loha.model.LoHaModel \ No newline at end of file diff --git a/peft/docs/source/package_reference/lokr.md b/peft/docs/source/package_reference/lokr.md new file mode 100644 index 0000000000000000000000000000000000000000..5be43f8546766e80f8cca0493cd346c0233f99c4 --- /dev/null +++ b/peft/docs/source/package_reference/lokr.md @@ -0,0 +1,27 @@ + + +# LoKr + +Low-Rank Kronecker Product ([LoKr](https://hf.co/papers/2309.14859)), is a LoRA-variant method that approximates the large weight matrix with two low-rank matrices and combines them with the Kronecker product. LoKr also provides an optional third low-rank matrix to provide better control during fine-tuning. 
+ +## LoKrConfig + +[[autodoc]] tuners.lokr.config.LoKrConfig + +## LoKrModel + +[[autodoc]] tuners.lokr.model.LoKrModel \ No newline at end of file diff --git a/peft/docs/source/package_reference/lora.md b/peft/docs/source/package_reference/lora.md new file mode 100644 index 0000000000000000000000000000000000000000..3c2a77f9762c2c819965bd70011a985e75dcd993 --- /dev/null +++ b/peft/docs/source/package_reference/lora.md @@ -0,0 +1,55 @@ + + +# LoRA + +Low-Rank Adaptation ([LoRA](https://huggingface.co/papers/2309.15223)) is a PEFT method that decomposes a large matrix into two smaller low-rank matrices in the attention layers. This drastically reduces the number of parameters that need to be fine-tuned. + +The abstract from the paper is: + +*We propose a neural language modeling system based on low-rank adaptation (LoRA) for speech recognition output rescoring. Although pretrained language models (LMs) like BERT have shown superior performance in second-pass rescoring, the high computational cost of scaling up the pretraining stage and adapting the pretrained models to specific domains limit their practical use in rescoring. Here we present a method based on low-rank decomposition to train a rescoring BERT model and adapt it to new domains using only a fraction (0.08%) of the pretrained parameters. These inserted matrices are optimized through a discriminative training objective along with a correlation-based regularization loss. The proposed low-rank adaptation Rescore-BERT (LoRB) architecture is evaluated on LibriSpeech and internal datasets with decreased training times by factors between 5.4 and 3.6.*. 
+ +## LoraConfig + +[[autodoc]] tuners.lora.config.LoraConfig + +## LoraModel + +[[autodoc]] tuners.lora.model.LoraModel + +## Utility + +### ArrowConfig + +[[autodoc]] tuners.lora.config.ArrowConfig + +### LoftQ + +[[autodoc]] utils.loftq_utils.replace_lora_weights_loftq + +### Eva + +#### EvaConfig + +[[autodoc]] tuners.lora.config.EvaConfig + +#### initialize_lora_eva_weights + +[[autodoc]] tuners.lora.eva.initialize_lora_eva_weights + +#### get_eva_state_dict + +[[autodoc]] tuners.lora.eva.get_eva_state_dict diff --git a/peft/docs/source/package_reference/merge_utils.md b/peft/docs/source/package_reference/merge_utils.md new file mode 100644 index 0000000000000000000000000000000000000000..e5746127dc5cb61fe9e0c804b0956ff7e3792106 --- /dev/null +++ b/peft/docs/source/package_reference/merge_utils.md @@ -0,0 +1,33 @@ + + +# Model merge + +PEFT provides several internal utilities for [merging LoRA adapters](../developer_guides/model_merging) with the TIES and DARE methods. + +[[autodoc]] utils.merge_utils.prune + +[[autodoc]] utils.merge_utils.calculate_majority_sign_mask + +[[autodoc]] utils.merge_utils.disjoint_merge + +[[autodoc]] utils.merge_utils.task_arithmetic + +[[autodoc]] utils.merge_utils.ties + +[[autodoc]] utils.merge_utils.dare_linear + +[[autodoc]] utils.merge_utils.dare_ties diff --git a/peft/docs/source/package_reference/miss.md b/peft/docs/source/package_reference/miss.md new file mode 100644 index 0000000000000000000000000000000000000000..8226a4acd251fd2f4092aaaa6b254fbd746bb0b3 --- /dev/null +++ b/peft/docs/source/package_reference/miss.md @@ -0,0 +1,32 @@ + + +# MiSS + +MiSS: Balancing LoRA Performance and Efficiency with Simple Shard Sharing([MiSS](https://huggingface.co/papers/2409.15371)) is a novel PEFT method that adopts a low-rank structure, requires only a single trainable matrix, and introduces a new update mechanism distinct from LoRA, achieving an excellent balance between performance and efficiency. 
+ +The abstract from the paper is: + +*Parameter-Efficient Fine-Tuning (PEFT) methods, particularly Low-Rank Adaptation (LoRA), effectively reduce the number of trainable parameters in Large Language Models (LLMs). However, as model scales continue to grow, the demand for computational resources remains a significant challenge. Existing LoRA variants often struggle to strike an optimal balance between adaptability (model performance and convergence speed) and efficiency (computational overhead, memory usage, and initialization time). This paper introduces MiSS(Matrix Shard Sharing ), a novel PEFT approach that addresses this trade-off through a simple shard-sharing mechanism. MiSS leverages the insight that a low-rank adaptation can be achieved by decomposing the weight matrix into multiple fragment matrices and utilizing a shared, trainable common fragment. This method constructs the low-rank update matrix through the replication of these shared, partitioned shards. We also propose a hardware-efficient and broadly applicable implementation for MiSS. Extensive experiments conducted on a range of tasks, alongside a systematic analysis of computational performance, demonstrate MiSS's superiority. The results show that MiSS significantly outperforms standard LoRA and its prominent variants in both model performance metrics and computational efficiency, including initialization speed and training throughput. By effectively balancing expressive power and resource utilization, MiSS offers a compelling solution for efficiently adapting large-scale models*. 
+ + +## MissConfig + +[[autodoc]] tuners.miss.config.MissConfig + +## MissModel + +[[autodoc]] tuners.miss.model.MissModel \ No newline at end of file diff --git a/peft/docs/source/package_reference/multitask_prompt_tuning.md b/peft/docs/source/package_reference/multitask_prompt_tuning.md new file mode 100644 index 0000000000000000000000000000000000000000..119739a3dc8fb89461cd826f3696109eb7e40734 --- /dev/null +++ b/peft/docs/source/package_reference/multitask_prompt_tuning.md @@ -0,0 +1,31 @@ + + +# Multitask prompt tuning + +[Multitask prompt tuning](https://huggingface.co/papers/2303.02861) decomposes the soft prompts of each task into a single learned transferable prompt instead of a separate prompt for each task. The single learned prompt can be adapted for each task by multiplicative low rank updates. + +The abstract from the paper is: + +*Prompt tuning, in which a base pretrained model is adapted to each task via conditioning on learned prompt vectors, has emerged as a promising approach for efficiently adapting large language models to multiple downstream tasks. However, existing methods typically learn soft prompt vectors from scratch, and it has not been clear how to exploit the rich cross-task knowledge with prompt vectors in a multitask learning setting. We propose multitask prompt tuning (MPT), which first learns a single transferable prompt by distilling knowledge from multiple task-specific source prompts. We then learn multiplicative low rank updates to this shared prompt to efficiently adapt it to each downstream target task. Extensive experiments on 23 NLP datasets demonstrate that our proposed approach outperforms the state-of-the-art methods, including the full finetuning baseline in some cases, despite only tuning 0.035% as many task-specific parameters*. 
+ +## MultitaskPromptTuningConfig + +[[autodoc]] tuners.multitask_prompt_tuning.config.MultitaskPromptTuningConfig + +## MultitaskPromptEmbedding + +[[autodoc]] tuners.multitask_prompt_tuning.model.MultitaskPromptEmbedding \ No newline at end of file diff --git a/peft/docs/source/package_reference/oft.md b/peft/docs/source/package_reference/oft.md new file mode 100644 index 0000000000000000000000000000000000000000..63909b202bfc47bcfd9b1038ea13e978e41202f5 --- /dev/null +++ b/peft/docs/source/package_reference/oft.md @@ -0,0 +1,31 @@ + + +# OFT + +[Orthogonal Finetuning (OFT)](https://hf.co/papers/2306.07280) is a method developed for adapting text-to-image diffusion models. It works by reparameterizing the pretrained weight matrices with its orthogonal matrix to preserve information in the pretrained model. To reduce the number of parameters, OFT introduces a block-diagonal structure in the orthogonal matrix. + +The abstract from the paper is: + +*Large text-to-image diffusion models have impressive capabilities in generating photorealistic images from text prompts. How to effectively guide or control these powerful models to perform different downstream tasks becomes an important open problem. To tackle this challenge, we introduce a principled finetuning method -- Orthogonal Finetuning (OFT), for adapting text-to-image diffusion models to downstream tasks. Unlike existing methods, OFT can provably preserve hyperspherical energy which characterizes the pairwise neuron relationship on the unit hypersphere. We find that this property is crucial for preserving the semantic generation ability of text-to-image diffusion models. To improve finetuning stability, we further propose Constrained Orthogonal Finetuning (COFT) which imposes an additional radius constraint to the hypersphere. 
Specifically, we consider two important finetuning text-to-image tasks: subject-driven generation where the goal is to generate subject-specific images given a few images of a subject and a text prompt, and controllable generation where the goal is to enable the model to take in additional control signals. We empirically show that our OFT framework outperforms existing methods in generation quality and convergence speed*. + +## OFTConfig + +[[autodoc]] tuners.oft.config.OFTConfig + +## OFTModel + +[[autodoc]] tuners.oft.model.OFTModel diff --git a/peft/docs/source/package_reference/p_tuning.md b/peft/docs/source/package_reference/p_tuning.md new file mode 100644 index 0000000000000000000000000000000000000000..a35f7244c34b7b9ccbec33acb1dce59b361c1ab9 --- /dev/null +++ b/peft/docs/source/package_reference/p_tuning.md @@ -0,0 +1,31 @@ + + +# P-tuning + +[P-tuning](https://hf.co/papers/2103.10385) adds trainable prompt embeddings to the input that is optimized by a prompt encoder to find a better prompt, eliminating the need to manually design prompts. The prompt tokens can be added anywhere in the input sequence, and p-tuning also introduces anchor tokens for improving performance. + +The abstract from the paper is: + +*While GPTs with traditional fine-tuning fail to achieve strong results on natural language understanding (NLU), we show that GPTs can be better than or comparable to similar-sized BERTs on NLU tasks with a novel method P-tuning -- which employs trainable continuous prompt embeddings. On the knowledge probing (LAMA) benchmark, the best GPT recovers 64\% (P@1) of world knowledge without any additional text provided during test time, which substantially improves the previous best by 20+ percentage points. On the SuperGlue benchmark, GPTs achieve comparable and sometimes better performance to similar-sized BERTs in supervised learning. 
Importantly, we find that P-tuning also improves BERTs' performance in both few-shot and supervised settings while largely reducing the need for prompt engineering. Consequently, P-tuning outperforms the state-of-the-art approaches on the few-shot SuperGlue benchmark.* + +## PromptEncoderConfig + +[[autodoc]] tuners.p_tuning.config.PromptEncoderConfig + +## PromptEncoder + +[[autodoc]] tuners.p_tuning.model.PromptEncoder \ No newline at end of file diff --git a/peft/docs/source/package_reference/peft_model.md b/peft/docs/source/package_reference/peft_model.md new file mode 100644 index 0000000000000000000000000000000000000000..366ef91fd8bf09e7103d6595d2f09ca16b0b7fd9 --- /dev/null +++ b/peft/docs/source/package_reference/peft_model.md @@ -0,0 +1,77 @@ + + +# Models + +[`PeftModel`] is the base model class for specifying the base Transformer model and configuration to apply a PEFT method to. The base `PeftModel` contains methods for loading and saving models from the Hub. + +## PeftModel + +[[autodoc]] PeftModel + - all + +## PeftModelForSequenceClassification + +A `PeftModel` for sequence classification tasks. + +[[autodoc]] PeftModelForSequenceClassification + - all + +## PeftModelForTokenClassification + +A `PeftModel` for token classification tasks. + +[[autodoc]] PeftModelForTokenClassification + - all + +## PeftModelForCausalLM + +A `PeftModel` for causal language modeling. + +[[autodoc]] PeftModelForCausalLM + - all + +## PeftModelForSeq2SeqLM + +A `PeftModel` for sequence-to-sequence language modeling. + +[[autodoc]] PeftModelForSeq2SeqLM + - all + +## PeftModelForQuestionAnswering + +A `PeftModel` for question answering. + +[[autodoc]] PeftModelForQuestionAnswering + - all + +## PeftModelForFeatureExtraction + +A `PeftModel` for extracting features/embeddings from transformer models. + +[[autodoc]] PeftModelForFeatureExtraction + - all + +## PeftMixedModel + +A `PeftModel` for mixing different adapter types (e.g. LoRA and LoHa). 
+ +[[autodoc]] PeftMixedModel + - all + +## Utilities + +[[autodoc]] utils.cast_mixed_precision_params + +[[autodoc]] get_peft_model + +[[autodoc]] inject_adapter_in_model + +[[autodoc]] utils.get_peft_model_state_dict + +[[autodoc]] utils.prepare_model_for_kbit_training + +[[autodoc]] get_layer_status + +[[autodoc]] get_model_status diff --git a/peft/docs/source/package_reference/peft_types.md b/peft/docs/source/package_reference/peft_types.md new file mode 100644 index 0000000000000000000000000000000000000000..55edbbd21a47ff035dc39dc90e55e6e524df6493 --- /dev/null +++ b/peft/docs/source/package_reference/peft_types.md @@ -0,0 +1,27 @@ + + +# PEFT types + +[`PeftType`] includes the supported adapters in PEFT, and [`TaskType`] includes PEFT-supported tasks. + +## PeftType + +[[autodoc]] utils.peft_types.PeftType + +## TaskType + +[[autodoc]] utils.peft_types.TaskType \ No newline at end of file diff --git a/peft/docs/source/package_reference/poly.md b/peft/docs/source/package_reference/poly.md new file mode 100644 index 0000000000000000000000000000000000000000..a4cf28ce560c207685259ac13f2be1a5f2dbce06 --- /dev/null +++ b/peft/docs/source/package_reference/poly.md @@ -0,0 +1,44 @@ + + +# Polytropon + +[Polytropon](https://hf.co/papers/2202.13914) is a multitask model with a number of different LoRA adapters in its "inventory". The model learns the correct combination of adapters from the inventory with a routing function to choose the best subset of modules for a specific task. PEFT also supports [Multi-Head Adapter Routing (MHR)](https://hf.co/papers/2211.03831) for Polytropon which builds on and improves the routing function by combining the adapter heads more granularly. The adapter heads are separated into disjoint blocks and a different routing function is learned for each one, allowing for more expressivity. 
+ + + + +The abstract from the paper is: + +*A modular design encourages neural models to disentangle and recombine different facets of knowledge to generalise more systematically to new tasks. In this work, we assume that each task is associated with a subset of latent discrete skills from a (potentially small) inventory. In turn, skills correspond to parameter-efficient (sparse / low-rank) model parameterisations. By jointly learning these and a task-skill allocation matrix, the network for each task is instantiated as the average of the parameters of active skills. To favour non-trivial soft partitions of skills across tasks, we experiment with a series of inductive biases, such as an Indian Buffet Process prior and a two-speed learning rate. We evaluate our latent-skill model on two main settings: 1) multitask reinforcement learning for grounded instruction following on 8 levels of the BabyAI platform; and 2) few-shot adaptation of pre-trained text-to-text generative models on CrossFit, a benchmark comprising 160 NLP tasks. We find that the modular design of a network significantly increases sample efficiency in reinforcement learning and few-shot generalisation in supervised learning, compared to baselines with fully shared, task-specific, or conditionally generated parameters where knowledge is entangled across tasks. In addition, we show how discrete skills help interpretability, as they yield an explicit hierarchy of tasks.* + + + + +The abstract from the paper is: + +*Parameter-efficient fine-tuning (PEFT) for cross-task generalization consists in pre-training adapters on a multi-task training set before few-shot adaptation to test tasks. Polytropon [Ponti et al., 2023] (Poly) jointly learns an inventory of adapters and a routing function that selects a (variable-size) subset of adapters for each task during both pre-training and few-shot adaptation. 
In this paper, we investigate the role that adapter routing plays in its success and design new variants based on our findings. First, we build on the intuition that finer-grained routing provides more expressivity. Hence, we propose MHR (Multi-Head Routing), which combines subsets of adapter parameters and outperforms Poly under a comparable parameter budget; by only fine-tuning the routing function and not the adapters (MHR-z), we achieve competitive performance with extreme parameter efficiency. Second, we find that Poly/MHR performance is a result of better multi-task optimization, rather than modular inductive biases that facilitate adapter recombination and local adaptation, as previously hypothesized. In fact, we find that MHR exhibits higher gradient alignment between tasks than any other method. Since this implies that routing is only crucial during multi-task pre-training, we propose MHR-mu, which discards routing and fine-tunes the average of the pre-trained adapters during few-shot adaptation. This establishes MHR-mu as an effective method for single-adapter fine-tuning.*. + + + + +## PolyConfig + +[[autodoc]] tuners.poly.config.PolyConfig + +## PolyModel + +[[autodoc]] tuners.poly.model.PolyModel diff --git a/peft/docs/source/package_reference/prefix_tuning.md b/peft/docs/source/package_reference/prefix_tuning.md new file mode 100644 index 0000000000000000000000000000000000000000..62df037bb0a2f2053a65f39f36179e5a42732207 --- /dev/null +++ b/peft/docs/source/package_reference/prefix_tuning.md @@ -0,0 +1,31 @@ + + +# Prefix tuning + +[Prefix tuning](https://hf.co/papers/2101.00190) prefixes a series of task-specific vectors to the input sequence that can be learned while keeping the pretrained model frozen. The prefix parameters are inserted in all of the model layers. + +The abstract from the paper is: + +*Fine-tuning is the de facto way to leverage large pretrained language models to perform downstream tasks. 
However, it modifies all the language model parameters and therefore necessitates storing a full copy for each task. In this paper, we propose prefix-tuning, a lightweight alternative to fine-tuning for natural language generation tasks, which keeps language model parameters frozen, but optimizes a small continuous task-specific vector (called the prefix). Prefix-tuning draws inspiration from prompting, allowing subsequent tokens to attend to this prefix as if it were "virtual tokens". We apply prefix-tuning to GPT-2 for table-to-text generation and to BART for summarization. We find that by learning only 0.1\% of the parameters, prefix-tuning obtains comparable performance in the full data setting, outperforms fine-tuning in low-data settings, and extrapolates better to examples with topics unseen during training*. + +## PrefixTuningConfig + +[[autodoc]] tuners.prefix_tuning.config.PrefixTuningConfig + +## PrefixEncoder + +[[autodoc]] tuners.prefix_tuning.model.PrefixEncoder \ No newline at end of file diff --git a/peft/docs/source/package_reference/prompt_tuning.md b/peft/docs/source/package_reference/prompt_tuning.md new file mode 100644 index 0000000000000000000000000000000000000000..61dbb6a2e934d2ec9931de7a44b298f20df208dd --- /dev/null +++ b/peft/docs/source/package_reference/prompt_tuning.md @@ -0,0 +1,31 @@ + + +# Prompt tuning + +[Prompt tuning](https://hf.co/papers/2104.08691) adds task-specific prompts to the input, and these prompt parameters are updated independently of the pretrained model parameters which are frozen. + +The abstract from the paper is: + +*In this work, we explore "prompt tuning", a simple yet effective mechanism for learning "soft prompts" to condition frozen language models to perform specific downstream tasks. Unlike the discrete text prompts used by GPT-3, soft prompts are learned through backpropagation and can be tuned to incorporate signal from any number of labeled examples. 
Our end-to-end learned approach outperforms GPT-3's "few-shot" learning by a large margin. More remarkably, through ablations on model size using T5, we show that prompt tuning becomes more competitive with scale: as models exceed billions of parameters, our method "closes the gap" and matches the strong performance of model tuning (where all model weights are tuned). This finding is especially relevant in that large models are costly to share and serve, and the ability to reuse one frozen model for multiple downstream tasks can ease this burden. Our method can be seen as a simplification of the recently proposed "prefix tuning" of Li and Liang (2021), and we provide a comparison to this and other similar approaches. Finally, we show that conditioning a frozen model with soft prompts confers benefits in robustness to domain transfer, as compared to full model tuning*. + +## PromptTuningConfig + +[[autodoc]] tuners.prompt_tuning.config.PromptTuningConfig + +## PromptEmbedding + +[[autodoc]] tuners.prompt_tuning.model.PromptEmbedding \ No newline at end of file diff --git a/peft/docs/source/package_reference/randlora.md b/peft/docs/source/package_reference/randlora.md new file mode 100644 index 0000000000000000000000000000000000000000..07837defc91aceea2b446a9f7d332fd01a1d1aff --- /dev/null +++ b/peft/docs/source/package_reference/randlora.md @@ -0,0 +1,45 @@ + + +# RandLora: Full-rank parameter-efficient fine-tuning of large models +[RandLora](https://huggingface.co/papers/2502.00987) is a parameter-efficient fine-tuning technique that is similar to [LoRA](https://huggingface.co/papers/2106.09685) and [VeRA](https://huggingface.co/papers/2310.11454) but performs full rank updates to improve performance. RandLora can be particularly useful when adapting large models to hard tasks that require complex updates while preserving the parameter efficiency of LoRA. The full rank update of RandLora is achieved by linearly scaling random bases.
The random bases are a collection of multiple low rank matrices such that the summation of their ranks is greater than or equal to the full rank of the parameter matrices. The trainable parameters of RandLora are two diagonal matrices (vectors) that get multiplied with the right hand low rank random bases, in a similar way to VeRA's update. To maintain low memory usage, RandLora uses a custom function that prevents storing unnecessary bases in memory for backpropagation. + +RandLora presents the noteworthy difference that, contrary to other LoRA-like PEFT algorithms, reducing the rank of RandLora's random bases increases the number of trainable parameters. Because the number of bases multiplied by the rank of each basis is constant in RandLora, reducing the rank will increase the number of random bases, hence the number of base-specific trainable diagonal matrices. + +Because reducing the rank of RandLora's random bases will increase their number, RandLora can become slower to train than LoRA for very small ranks; typically, ranks below 4 will result in a large training time increase. This does not affect inference though as the RandLora adapters can be merged into the pretrained weight matrices. + +RandLora additionally supports training with sparse, ternary random bases (only containing -1, 0 and 1). These bases are as described in [Bingham et al.](https://cs-people.bu.edu/evimaria/cs565/kdd-rp.pdf) and [Ping et al.](https://hastie.su.domains/Papers/Ping/KDD06_rp.pdf) and could theoretically be used to reduce compute needs by performing aggregations instead of matrix multiplications to create the weight update. This is not currently supported. Although it does not currently reduce compute, using sparse random bases in RandLora can reduce overfitting in some cases. For users interested in using sparse ternary bases, the `sparse` option is recommended over the `very_sparse` one that can reduce performance.
+ +Similarly to VeRA, when saving RandLora's parameters, it's possible to eschew storing the low rank matrices by setting `save_projection=False` on the `RandLoraConfig`. In that case, these matrices will be restored based on the fixed random seed from the `projection_prng_key` argument. This cuts down on the size of the checkpoint, but we cannot guarantee reproducibility on all devices and for all future versions of PyTorch. If you want to ensure reproducibility, set `save_projection=True` (which is the default). + +As in VeRA, to handle different shapes of adapted layers, RandLora initializes shared A and B matrices with the largest required size for each dimension. During the forward pass, submatrices A and B for a given layer are sliced out from these shared matrices and used as described in the paper. For example, adapting two linear layers of shapes (100, 20) and (80, 50) will create A and B matrices of shapes (rank, 50) and (100, rank) respectively. Then, to adapt a layer of shape (100, 20), submatrices A and B of shapes (rank, 20) and (100, rank) will be extracted. + +RandLora currently has the following constraint: + +- Only `nn.Linear` layers are supported. + +The abstract from the paper is: + +> Low-Rank Adaptation (LoRA) and its variants have shown impressive results in reducing the number of trainable parameters and memory requirements of large transformer networks while maintaining fine-tuning performance. The low-rank nature of the weight update inherently limits the representation power of fine-tuned models, however, thus potentially compromising performance on complex tasks. This raises a critical question: when a performance gap between LoRA and standard fine-tuning is observed, is it due to the reduced number of trainable parameters or the rank deficiency?
+This paper aims to answer this question by introducing RandLora, a parameter-efficient method that performs full-rank updates using a learned linear combinations of low-rank, non-trainable random matrices. Our method limits the number of trainable parameters by restricting optimization to diagonal scaling matrices applied to the fixed random matrices. This allows us to effectively overcome the low-rank limitations while maintaining parameter and memory efficiency during training. Through extensive experimentation across vision, language, and vision-language benchmarks, we systematically evaluate the limitations of LoRA and existing random basis methods. Our findings reveal that full-rank updates are beneficial across vision and language tasks individually, and even more so for vision-language tasks, where RandLora significantly reduces---and sometimes eliminates---the performance gap between standard fine-tuning and LoRA, demonstrating its efficacy. + +## RandLoraConfig + +[[autodoc]] tuners.randlora.config.RandLoraConfig + +## RandLoraModel + +[[autodoc]] tuners.randlora.model.RandLoraModel diff --git a/peft/docs/source/package_reference/road.md b/peft/docs/source/package_reference/road.md new file mode 100644 index 0000000000000000000000000000000000000000..52b9514f9733bd97be43a1d8a161705cfc91685c --- /dev/null +++ b/peft/docs/source/package_reference/road.md @@ -0,0 +1,31 @@ + + +# RoAd + +[RoAd](https://arxiv.org/pdf/2409.00119) is a parameter‑efficient fine‑tuning technique that adapts large language models by learning a small set of 2×2 rotation matrices (and optional scaling factors) applied to pairs of hidden dimensions. RoAd achieves competitive or superior performance compared to other PEFT methods with under 0.1% trainable parameters. Unlike LoRA’s batched low‑rank updates, RoAd’s sparse rotations reformulate to simple element‑wise operations, yielding significantly higher serving throughput when handling heterogeneous requests in the same batch, i.e. 
serving multiple adapters simultaneously. Moreover, RoAd integrates seamlessly into a distributed interchange intervention framework, interpreting its sparse 2D rotations as task-specific interventions within learned subspaces of hidden representations. These orthogonal subspaces can be composed to merge multiple task-specific behaviors—like multilingual capabilities or instruction following—without additional fine-tuning, enabling modular, interpretable adaptations in LLMs. + +Finetuning with RoAd typically requires a higher learning rate compared to LoRA or similar methods, around 1e-3. Currently RoAd only supports linear layers and it can be used on models quantized with bitsandbytes (4-bit or 8-bit). + +For running inference with different RoAd adapters in the same batch see [Inference with different LoRA adapters in the same batch](../developer_guides/lora#inference-with-different-lora-adapters-in-the-same-batch). + +## RoadConfig + +[[autodoc]] tuners.road.config.RoadConfig + +## RoadModel + +[[autodoc]] tuners.road.model.RoadModel diff --git a/peft/docs/source/package_reference/shira.md b/peft/docs/source/package_reference/shira.md new file mode 100644 index 0000000000000000000000000000000000000000..1bb21e184050ac5397b3c205533db39708e40b1c --- /dev/null +++ b/peft/docs/source/package_reference/shira.md @@ -0,0 +1,35 @@ + + +# Sparse High Rank Adapters + +Sparse High Rank Adapters or [SHiRA](https://arxiv.org/abs/2406.13175) is an alternate type of adapter and has been found to have significant advantages over the low rank adapters. Specifically, SHiRA achieves better accuracy than LoRA for a variety of vision and language tasks. It also offers simpler and higher quality multi-adapter fusion by significantly reducing concept loss, a common problem faced by low rank adapters. SHiRA directly finetunes a small number of the base model's parameters to finetune the model on any adaptation task.
+ +SHiRA currently has the following constraint: + +- Only `nn.Linear` layers are supported. + +The abstract from the paper is: + +> Low Rank Adaptation (LoRA) has gained massive attention in the recent generative AI research. One of the main advantages of LoRA is its ability to be fused with pretrained models, adding no overhead during inference. However, from a mobile deployment standpoint, we can either avoid inference overhead in the fused mode but lose the ability to switch adapters rapidly, or suffer significant (up to 30% higher) inference latency while enabling rapid switching in the unfused mode. LoRA also exhibits concept-loss when multiple adapters are used concurrently. In this paper, we propose Sparse High Rank Adapters (SHiRA), a new paradigm which incurs no inference overhead, enables rapid switching, and significantly reduces concept-loss. Specifically, SHiRA can be trained by directly tuning only 1-2% of the base model weights while leaving others unchanged. This results in a highly sparse adapter which can be switched directly in the fused mode. We further provide theoretical and empirical insights on how high sparsity in SHiRA can aid multi-adapter fusion by reducing concept loss. Our extensive experiments on LVMs and LLMs demonstrate that finetuning only a small fraction of the parameters in the base model significantly outperforms LoRA while enabling both rapid switching and multi-adapter fusion. Finally, we provide a latency- and memory-efficient SHiRA implementation based on Parameter-Efficient Finetuning (PEFT) Library which trains at nearly the same speed as LoRA while consuming up to 16% lower peak GPU memory, thus making SHiRA easy to adopt for practical use cases. To demonstrate rapid switching benefits during inference, we show that loading SHiRA on a base model can be 5x-16x faster than LoRA fusion on a CPU. 
+ +## ShiraConfig + +[[autodoc]] tuners.shira.config.ShiraConfig + +## ShiraModel + +[[autodoc]] tuners.shira.model.ShiraModel diff --git a/peft/docs/source/package_reference/trainable_tokens.md b/peft/docs/source/package_reference/trainable_tokens.md new file mode 100644 index 0000000000000000000000000000000000000000..adebde7357215b4d53a21fc99110550bbfbb9301 --- /dev/null +++ b/peft/docs/source/package_reference/trainable_tokens.md @@ -0,0 +1,50 @@ + + +# Trainable Tokens + +The Trainable Tokens method provides a way to target specific token embeddings for fine-tuning without resorting to +training the full embedding matrix or using an adapter on the embedding matrix. It is based on the initial implementation from +[here](https://github.com/huggingface/peft/pull/1541). + +The method only targets specific tokens and selectively trains the token indices you specify. Consequently the +required RAM will be lower and disk memory is also significantly lower than storing the full fine-tuned embedding matrix. + +Some preliminary benchmarks acquired with [this script](https://github.com/huggingface/peft/blob/main/scripts/train_memory.py) +suggest that for `gemma-2-2b` (which has a rather large embedding matrix) you can save ~4 GiB VRAM with Trainable Tokens +over fully fine-tuning the embedding matrix. While LoRA will use comparable amounts of VRAM it might also target +tokens you don't want to be changed. Note that these are just indications and varying embedding matrix sizes might skew +these numbers a bit. + +Note that this method does not add tokens for you, you have to add tokens to the tokenizer yourself and resize the +embedding matrix of the model accordingly. This method will only re-train the embeddings for the tokens you specify. +This method can also be used in conjunction with LoRA layers! See [the LoRA developer guide](../developer_guides/lora#efficiently-train-tokens-alongside-lora). 
+ +> [!TIP] +> Saving the model with [`~PeftModel.save_pretrained`] or retrieving the state dict using +> [`get_peft_model_state_dict`] when adding new tokens may save the full embedding matrix instead of only the difference +> as a precaution because the embedding matrix was resized. To save space you can disable this behavior by setting +> `save_embedding_layers=False` when calling `save_pretrained`. This is safe to do as long as you don't modify the +> embedding matrix through other means as well, as such changes will not be tracked by trainable tokens. + +## TrainableTokensConfig + +[[autodoc]] tuners.trainable_tokens.config.TrainableTokensConfig + +## TrainableTokensModel + +[[autodoc]] tuners.trainable_tokens.model.TrainableTokensModel + diff --git a/peft/docs/source/package_reference/tuners.md b/peft/docs/source/package_reference/tuners.md new file mode 100644 index 0000000000000000000000000000000000000000..ae0594624713def2c67bee0aad7b27a969051ed0 --- /dev/null +++ b/peft/docs/source/package_reference/tuners.md @@ -0,0 +1,27 @@ + + +# Tuners + +A tuner (or adapter) is a module that can be plugged into a `torch.nn.Module`. [`BaseTuner`] is the base class for other tuners and provides shared methods and attributes for preparing an adapter configuration and replacing a target module with the adapter module. [`BaseTunerLayer`] is a base class for adapter layers. It offers methods and attributes for managing adapters such as activating and disabling adapters.
+ +## BaseTuner + +[[autodoc]] tuners.tuners_utils.BaseTuner + +## BaseTunerLayer + +[[autodoc]] tuners.tuners_utils.BaseTunerLayer \ No newline at end of file diff --git a/peft/docs/source/package_reference/vblora.md b/peft/docs/source/package_reference/vblora.md new file mode 100644 index 0000000000000000000000000000000000000000..02aaf10b8705d8ee565c705f67d23fd4d46d9eb7 --- /dev/null +++ b/peft/docs/source/package_reference/vblora.md @@ -0,0 +1,40 @@ + + +# VB-LoRA: Extreme Parameter Efficient Fine-Tuning with Vector Banks + +## Overview + +[VB-LoRA](https://huggingface.co/papers/2405.15179) is a parameter-efficient fine-tuning technique that extends LoRA by learning a fine-grained parameter-sharing scheme at the sub-vector level, achieving significantly higher parameter efficiency. This makes VB-LoRA especially useful in scenarios where storage and transmission costs are critical. It works by decomposing low-rank matrices—from different layers and modules such as K, Q, V, and FFN—into sub-vectors, which are then globally shared through a vector bank. + +The abstract from the paper is: + +*As the adoption of large language models increases and the need for per-user or per-task model customization grows, the parameter-efficient fine-tuning (PEFT) methods, such as low-rank adaptation (LoRA) and its variants, incur substantial storage and transmission costs. To further reduce stored parameters, we introduce a "divide-and-share" paradigm that breaks the barriers of low-rank decomposition across matrix dimensions, modules and layers by sharing parameters globally via a vector bank. As an instantiation of the paradigm to LoRA, our proposed VB-LoRA composites all the low-rank matrices of LoRA from a shared vector bank with a differentiable top-k admixture module. VB-LoRA achieves extreme parameter efficiency while maintaining comparable or better performance compared to state-of-the-art PEFT methods. 
Extensive experiments demonstrate the effectiveness of VB-LoRA on natural language understanding, natural language generation, and instruction tuning tasks. When fine-tuning the Llama2-13B model, VB-LoRA only uses 0.4% of LoRA's stored parameters, yet achieves superior results.* + +## Usage Tips + +- VB-LoRA utilizes a sparse top-k module to learn the sharing mechanism. When saving adapter parameters, you can either save only the top-k weights and their indices by setting `save_only_topk_weights = True` in `VBLoRAConfig`, or save all the trainable logits by setting it to `False`. Enabling `save_only_topk_weights = True` significantly reduces storage space; for instance, in Llama2-7B, the storage file size decreases from 308MB to 2.5MB. Note that models saved with `save_only_topk_weights = True` are intended for merging or inference only and cannot be used to resume training. + +- VB-LoRA has two sets of training parameters: vector bank parameters and logit parameters. In practice, we found that logit parameters require a higher learning rate, while vector bank parameters require a lower learning rate. When using the AdamW optimizer, typical learning rates are 0.01 for logits and 0.001 for vector bank parameters. + +## VBLoRAConfig + +[[autodoc]] tuners.vblora.config.VBLoRAConfig + +## VBLoRAModel + +[[autodoc]] tuners.vblora.model.VBLoRAModel + diff --git a/peft/docs/source/package_reference/vera.md b/peft/docs/source/package_reference/vera.md new file mode 100644 index 0000000000000000000000000000000000000000..f9ed281275d31f706abd536a7de83937efa537b8 --- /dev/null +++ b/peft/docs/source/package_reference/vera.md @@ -0,0 +1,39 @@ + + +# VeRA: Vector-based Random Matrix Adaptation + +[VeRA](https://huggingface.co/papers/2310.11454) is a parameter-efficient fine-tuning technique that is similar to LoRA but requires even fewer extra parameters while promising similar or even better performance.
As such, it is particularly useful when the parameter budget is very limited, e.g. when scaling to very large models. The reduction of the count of trainable parameters is achieved by sharing the same low-rank matrices across all layers, and only training two additional vectors per layer. + +When saving the adapter parameters, it's possible to eschew storing the low rank matrices by setting `save_projection=False` on the `VeraConfig`. In that case, these matrices will be restored based on the fixed random seed from the `projection_prng_key` argument. This cuts down on the size of the checkpoint, but we cannot guarantee reproducibility on all devices and for all future versions of PyTorch. If you want to ensure reproducibility, set `save_projection=True` (which is the default). + +To handle different shapes of adapted layers, VeRA initializes shared A and B matrices with the largest required size for each dimension. During the forward pass, submatrices A and B for a given layer are sliced out from these shared matrices and used as described in the paper. For example, adapting two linear layers of shapes (100, 20) and (80, 50) will create A and B matrices of shapes (rank, 50) and (100, rank) respectively. Then, to adapt a layer of shape (100, 20), submatrices A and B of shapes (rank, 20) and (100, rank) will be extracted. + +VeRA currently has the following constraint: + +- Only `nn.Linear` layers are supported. + +The abstract from the paper is: + +> Low-rank adapation (LoRA) is a popular method that reduces the number of trainable parameters when finetuning large language models, but still faces acute storage challenges when scaling to even larger models or deploying numerous per-user or per-task adapted models. In this work, we present Vector-based Random Matrix Adaptation (VeRA), which significantly reduces the number of trainable parameters compared to LoRA, yet maintains the same performance. 
It achieves this by using a single pair of low-rank matrices shared across all layers and learning small scaling vectors instead. We demonstrate its effectiveness on the GLUE and E2E benchmarks, image classification tasks, and show its application in instruction-tuning of 7B and 13B language models. + +## VeRAConfig + +[[autodoc]] tuners.vera.config.VeraConfig + +## VeRAModel + +[[autodoc]] tuners.vera.model.VeraModel diff --git a/peft/docs/source/package_reference/waveft.md b/peft/docs/source/package_reference/waveft.md new file mode 100644 index 0000000000000000000000000000000000000000..ffb2987cd9d19f5d55d05d709709a46fb37119ae --- /dev/null +++ b/peft/docs/source/package_reference/waveft.md @@ -0,0 +1,35 @@ + + +# WaveFT: Wavelet Fine-Tuning + +[WaveFT](https://arxiv.org/abs/2505.12532) is a novel parameter-efficient fine-tuning (PEFT) method that introduces sparse updates in the **wavelet domain** of residual matrices. Unlike LoRA, which is constrained by discrete low-rank choices, WaveFT enables fine-grained control over the number of trainable parameters by directly learning a sparse set of coefficients in the transformed space. These coefficients are then mapped back to the weight domain via the Inverse Discrete Wavelet Transform (IDWT), producing high-rank updates without incurring inference overhead. + +WaveFT currently has the following constraint: + +- Only `nn.Linear` layers are supported. + +The abstract from the paper is: + +>Efficiently adapting large foundation models is critical, especially with tight compute and memory budgets. Parameter-Efficient Fine-Tuning (PEFT) methods such as LoRA offer limited granularity and effectiveness in few-parameter regimes. We propose Wavelet Fine-Tuning (WaveFT), a novel PEFT method that learns highly sparse updates in the wavelet domain of residual matrices. 
WaveFT allows precise control of trainable parameters, offering fine-grained capacity adjustment and excelling with remarkably low parameter count, potentially far fewer than LoRA’s minimum—ideal for extreme parameter-efficient scenarios. Evaluated on personalized text-to-image generation using Stable Diffusion XL as baseline, WaveFT significantly outperforms LoRA and other PEFT methods, especially at low parameter counts; achieving superior subject fidelity, prompt alignment, and image diversity. + +## WaveFTConfig + +[[autodoc]] tuners.waveft.config.WaveFTConfig + +## WaveFTModel + +[[autodoc]] tuners.waveft.model.WaveFTModel diff --git a/peft/docs/source/package_reference/xlora.md b/peft/docs/source/package_reference/xlora.md new file mode 100644 index 0000000000000000000000000000000000000000..f4710ab6fab41b7e70b15d7cac5efe6dce89fd3a --- /dev/null +++ b/peft/docs/source/package_reference/xlora.md @@ -0,0 +1,56 @@ + + +# X-LoRA + +Mixture of LoRA Experts ([X-LoRA](https://huggingface.co/papers/2402.07148)) is a PEFT method enabling sparse or dense mixture of LoRA experts based on a high granularity (token, layer, sequence) scalings matrix. This leverages frozen LoRA adapters and a frozen base model to drastically reduce the number of parameters that need to be fine-tuned. + +A unique aspect of X-LoRA is its versatility: it can be applied to any `transformers` base model with LoRA adapters. This means that, despite the mixture of experts strategy, no changes to the model code must be made. + +The below graphic demonstrates how the scalings change for different prompts for each token. This highlights the activation of different adapters as the generation progresses and the sequence creates new context.
+ +![Token-by-token scalings](https://github.com/EricLBuehler/xlora/raw/master/res/token_by_token_scalings.gif) + +The abstract from the paper is: + +*We report a mixture of expert strategy to create fine-tuned large language models using a deep layer-wise token-level approach based on low-rank adaptation (LoRA). Starting with a set of pre-trained LoRA adapters, our gating strategy uses the hidden states to dynamically mix adapted layers, allowing the resulting X-LoRA model to draw upon different capabilities and create never-before-used deep layer-wise combinations to solve tasks. The design is inspired by the biological principles of universality and diversity, where neural network building blocks are reused in different hierarchical manifestations. Hence, the X-LoRA model can be easily implemented for any existing large language model (LLM) without a need for modifications of the underlying structure. We develop a tailored X-LoRA model that offers scientific capabilities including forward/inverse analysis tasks and enhanced reasoning capability, focused on biomaterial analysis, protein mechanics and design. The impact of this work include access to readily expandable and adaptable models with strong domain knowledge and the capability to integrate across areas of knowledge. Featuring experts in biology, mathematics, reasoning, bio-inspired materials, mechanics and materials, chemistry, protein biophysics, mechanics and quantum-mechanics based molecular properties, we conduct a series of physics-focused case studies. We examine knowledge recall, protein mechanics forward/inverse tasks, protein design, adversarial agentic modeling including ontological knowledge graph construction, as well as molecular design. 
The model is capable not only of making quantitative predictions of nanomechanical properties of proteins or quantum mechanical molecular properties, but also reasons over the results and correctly predicts likely mechanisms that explain distinct molecular behaviors.*. + +Please cite X-LoRA as: +```bibtex +@article{10.1063/5.0203126, + author = {Buehler, Eric L. and Buehler, Markus J.}, + title = "{X-LoRA: Mixture of low-rank adapter experts, a flexible framework for large language models with applications in protein mechanics and molecular design}", + journal = {APL Machine Learning}, + volume = {2}, + number = {2}, + pages = {026119}, + year = {2024}, + month = {05}, + abstract = "{We report a mixture of expert strategy to create fine-tuned large language models using a deep layer-wise token-level approach based on low-rank adaptation (LoRA). Starting with a set of pre-trained LoRA adapters, our gating strategy uses the hidden states to dynamically mix adapted layers, allowing the resulting X-LoRA model to draw upon different capabilities and create never-before-used deep layer-wise combinations to solve tasks. The design is inspired by the biological principles of universality and diversity, where neural network building blocks are reused in different hierarchical manifestations. Hence, the X-LoRA model can be easily implemented for any existing large language model without a need for modifications of the underlying structure. We develop a tailored X-LoRA model that offers scientific capabilities, including forward/inverse analysis tasks and enhanced reasoning capability, focused on biomaterial analysis, protein mechanics, and design. The impact of this work includes access to readily expandable and adaptable models with strong domain knowledge and the capability to integrate across areas of knowledge. 
Featuring experts in biology, mathematics, reasoning, bio-inspired materials, mechanics and materials, chemistry, protein biophysics, mechanics, and quantum-mechanics based molecular properties, we conduct a series of physics-focused case studies. We examine knowledge recall, protein mechanics forward/inverse tasks, protein design, adversarial agentic modeling including ontological knowledge graph construction, and molecular design. The model is capable not only of making quantitative predictions of nanomechanical properties of proteins or quantum mechanical molecular properties but also reasoning over the results and correctly predicting likely mechanisms that explain distinct molecular behaviors.}", + issn = {2770-9019}, + doi = {10.1063/5.0203126}, + url = {https://doi.org/10.1063/5.0203126}, + eprint = {https://pubs.aip.org/aip/aml/article-pdf/doi/10.1063/5.0203126/19964043/026119\_1\_5.0203126.pdf}, +} +``` + +## XLoraConfig + +[[autodoc]] tuners.xlora.config.XLoraConfig + +## XLoraModel + +[[autodoc]] tuners.xlora.model.XLoraModel diff --git a/peft/docs/source/quicktour.md b/peft/docs/source/quicktour.md new file mode 100644 index 0000000000000000000000000000000000000000..1f0a0a27beeafa8795f23a5230f516c2eef20a7c --- /dev/null +++ b/peft/docs/source/quicktour.md @@ -0,0 +1,164 @@ + + +# Quicktour + +PEFT offers parameter-efficient methods for finetuning large pretrained models. The traditional paradigm is to finetune all of a model's parameters for each downstream task, but this is becoming exceedingly costly and impractical because of the enormous number of parameters in models today. Instead, it is more efficient to train a smaller number of prompt parameters or use a reparametrization method like low-rank adaptation (LoRA) to reduce the number of trainable parameters. + +This quicktour will show you PEFT's main features and how you can train or run inference on large models that would typically be inaccessible on consumer devices. 
+ +## Train + +Each PEFT method is defined by a [`PeftConfig`] class that stores all the important parameters for building a [`PeftModel`]. For example, to train with LoRA, load and create a [`LoraConfig`] class and specify the following parameters: + +- `task_type`: the task to train for (sequence-to-sequence language modeling in this case) +- `inference_mode`: whether you're using the model for inference or not +- `r`: the dimension of the low-rank matrices +- `lora_alpha`: the scaling factor for the low-rank matrices +- `lora_dropout`: the dropout probability of the LoRA layers + +```python +from peft import LoraConfig, TaskType + +peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1) +``` + +> [!TIP] +> See the [`LoraConfig`] reference for more details about other parameters you can adjust, such as the modules to target or the bias type. + +Once the [`LoraConfig`] is setup, create a [`PeftModel`] with the [`get_peft_model`] function. It takes a base model - which you can load from the Transformers library - and the [`LoraConfig`] containing the parameters for how to configure a model for training with LoRA. + +Load the base model you want to finetune. + +```python +from transformers import AutoModelForSeq2SeqLM + +model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/mt0-large") +``` + +Wrap the base model and `peft_config` with the [`get_peft_model`] function to create a [`PeftModel`]. To get a sense of the number of trainable parameters in your model, use the [`print_trainable_parameters`] method. + +```python +from peft import get_peft_model + +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +"output: trainable params: 2359296 || all params: 1231940608 || trainable%: 0.19151053100118282" +``` + +Out of [bigscience/mt0-large's](https://huggingface.co/bigscience/mt0-large) 1.2B parameters, you're only training 0.19% of them! + +That is it 🎉! 
Now you can train the model with the Transformers [`~transformers.Trainer`], Accelerate, or any custom PyTorch training loop. + +For example, to train with the [`~transformers.Trainer`] class, set up a [`~transformers.TrainingArguments`] class with some training hyperparameters. + +```py +training_args = TrainingArguments( + output_dir="your-name/bigscience/mt0-large-lora", + learning_rate=1e-3, + per_device_train_batch_size=32, + per_device_eval_batch_size=32, + num_train_epochs=2, + weight_decay=0.01, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, +) +``` + +Pass the model, training arguments, dataset, tokenizer, and any other necessary component to the [`~transformers.Trainer`], and call [`~transformers.Trainer.train`] to start training. + +```py +trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["test"], + processing_class=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, +) + +trainer.train() +``` + +### Save model + +After your model is finished training, you can save your model to a directory using the [`~transformers.PreTrainedModel.save_pretrained`] function. + +```py +model.save_pretrained("output_dir") +``` + +You can also save your model to the Hub (make sure you're logged in to your Hugging Face account first) with the [`~transformers.PreTrainedModel.push_to_hub`] function. + +```python +from huggingface_hub import notebook_login + +notebook_login() +model.push_to_hub("your-name/mt0-large-lora") +``` + +Both methods only save the extra PEFT weights that were trained, meaning it is super efficient to store, transfer, and load. For example, this [facebook/opt-350m](https://huggingface.co/ybelkada/opt-350m-lora) model trained with LoRA only contains two files: `adapter_config.json` and `adapter_model.safetensors`. The `adapter_model.safetensors` file is just 6.3MB! + +
+ +
The adapter weights for an opt-350m model stored on the Hub are only ~6MB compared to the full size of the model weights, which can be ~700MB.
+
+ +## Inference + +> [!TIP] +> Take a look at the [AutoPeftModel](package_reference/auto_class) API reference for a complete list of available `AutoPeftModel` classes. + +Easily load any PEFT-trained model for inference with the [`AutoPeftModel`] class and the [`~transformers.PreTrainedModel.from_pretrained`] method: + +```py +from peft import AutoPeftModelForCausalLM +from transformers import AutoTokenizer +import torch + +model = AutoPeftModelForCausalLM.from_pretrained("ybelkada/opt-350m-lora") +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") + +model = model.to("cuda") +model.eval() +inputs = tokenizer("Preheat the oven to 350 degrees and place the cookie dough", return_tensors="pt") + +outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=50) +print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]) + +"Preheat the oven to 350 degrees and place the cookie dough in the center of the oven. In a large bowl, combine the flour, baking powder, baking soda, salt, and cinnamon. In a separate bowl, combine the egg yolks, sugar, and vanilla." +``` + +For other tasks that aren't explicitly supported with an `AutoPeftModelFor` class - such as automatic speech recognition - you can still use the base [`AutoPeftModel`] class to load a model for the task. + +```py +from peft import AutoPeftModel + +model = AutoPeftModel.from_pretrained("smangrul/openai-whisper-large-v2-LORA-colab") +``` + +## Next steps + +Now that you've seen how to train a model with one of the PEFT methods, we encourage you to try out some of the other methods like prompt tuning. The steps are very similar to the ones shown in the quicktour: + +1. prepare a [`PeftConfig`] for a PEFT method +2. use the [`get_peft_model`] method to create a [`PeftModel`] from the configuration and base model + +Then you can train it however you like! To load a PEFT model for inference, you can use the [`AutoPeftModel`] class. 
+ +Feel free to also take a look at the task guides if you're interested in training a model with another PEFT method for a specific task such as semantic segmentation, multilingual automatic speech recognition, DreamBooth, token classification, and more. diff --git a/peft/docs/source/task_guides/ia3.md b/peft/docs/source/task_guides/ia3.md new file mode 100644 index 0000000000000000000000000000000000000000..612a4bb1e72d757211f28fe70cfdafd8be9e448d --- /dev/null +++ b/peft/docs/source/task_guides/ia3.md @@ -0,0 +1,235 @@ + + +# IA3 + +[IA3](../conceptual_guides/ia3) multiplies the model's activations (the keys and values in the self-attention and encoder-decoder attention blocks, and the intermediate activation of the position-wise feedforward network) by three learned vectors. This PEFT method introduces an even smaller number of trainable parameters than LoRA, which introduces weight matrices instead of vectors. The original model's parameters are kept frozen and only these vectors are updated. As a result, it is faster, cheaper and more efficient to finetune for a new downstream task. + +This guide will show you how to train a sequence-to-sequence model with IA3 to *generate a sentiment* given some financial news. + +> [!TIP] +> Some familiarity with the general process of training a sequence-to-sequence model would be really helpful and allow you to focus on how to apply IA3. If you’re new, we recommend taking a look at the [Translation](https://huggingface.co/docs/transformers/tasks/translation) and [Summarization](https://huggingface.co/docs/transformers/tasks/summarization) guides first from the Transformers documentation. When you’re ready, come back and see how easy it is to drop PEFT into your training! + +## Dataset + +You'll use the sentences_allagree subset of the [financial_phrasebank](https://huggingface.co/datasets/financial_phrasebank) dataset. This subset contains financial news with 100% annotator agreement on the sentiment label.
Take a look at the [dataset viewer](https://huggingface.co/datasets/financial_phrasebank/viewer/sentences_allagree) for a better idea of the data and sentences you'll be working with. + +Load the dataset with the [`~datasets.load_dataset`] function. This subset of the dataset only contains a train split, so use the [`~datasets.train_test_split`] function to create a train and validation split. Create a new `text_label` column so it is easier to understand what the `label` values `0`, `1`, and `2` mean. + +```py +from datasets import load_dataset + +ds = load_dataset("financial_phrasebank", "sentences_allagree") +ds = ds["train"].train_test_split(test_size=0.1) +ds["validation"] = ds["test"] +del ds["test"] + +classes = ds["train"].features["label"].names +ds = ds.map( + lambda x: {"text_label": [classes[label] for label in x["label"]]}, + batched=True, + num_proc=1, +) + +ds["train"][0] +{'sentence': 'It will be operated by Nokia , and supported by its Nokia NetAct network and service management system .', + 'label': 1, + 'text_label': 'neutral'} +``` + +Load a tokenizer and create a preprocessing function that: + +1. tokenizes the inputs, pads and truncates the sequence to the `max_length` +2. apply the same tokenizer to the labels but with a shorter `max_length` that corresponds to the label +3. 
mask the padding tokens + +```py +from transformers import AutoTokenizer + +text_column = "sentence" +label_column = "text_label" +max_length = 128 + +tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large") + +def preprocess_function(examples): + inputs = examples[text_column] + targets = examples[label_column] + model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt") + labels = tokenizer(targets, max_length=3, padding="max_length", truncation=True, return_tensors="pt") + labels = labels["input_ids"] + labels[labels == tokenizer.pad_token_id] = -100 + model_inputs["labels"] = labels + return model_inputs +``` + +Use the [`~datasets.Dataset.map`] function to apply the preprocessing function to the entire dataset. + +```py +processed_ds = ds.map( + preprocess_function, + batched=True, + num_proc=1, + remove_columns=ds["train"].column_names, + load_from_cache_file=False, + desc="Running tokenizer on dataset", +) +``` + +Create a training and evaluation [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader), and set `pin_memory=True` to speed up data transfer to the accelerator during training if your dataset samples are on a CPU. + +```py +from torch.utils.data import DataLoader +from transformers import default_data_collator + +train_ds = processed_ds["train"] +eval_ds = processed_ds["validation"] + +batch_size = 8 + +train_dataloader = DataLoader( + train_ds, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True +) +eval_dataloader = DataLoader(eval_ds, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True) +``` + +## Model + +Now you can load a pretrained model to use as the base model for IA3. This guide uses the [bigscience/mt0-large](https://huggingface.co/bigscience/mt0-large) model, but you can use any sequence-to-sequence model you like. 
+ +```py +from transformers import AutoModelForSeq2SeqLM + +model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/mt0-large") +``` + +### PEFT configuration and model + +All PEFT methods need a configuration that contains and specifies all the parameters for how the PEFT method should be applied. Create an [`IA3Config`] with the task type and set the inference mode to `False`. You can find additional parameters for this configuration in the [API reference](../package_reference/ia3#ia3config). + +> [!TIP] +> Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of trainable parameters of [`PeftModel`] versus the number of parameters in the base model! + +Once the configuration is setup, pass it to the [`get_peft_model`] function along with the base model to create a trainable [`PeftModel`]. + +```py +from peft import IA3Config, get_peft_model + +peft_config = IA3Config(task_type="SEQ_2_SEQ_LM") +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +"trainable params: 282,624 || all params: 1,229,863,936 || trainable%: 0.022980103060766553" +``` + +### Training + +Set up an optimizer and learning rate scheduler. + +```py +import torch +from transformers import get_linear_schedule_with_warmup + +lr = 8e-3 +num_epochs = 3 + +optimizer = torch.optim.AdamW(model.parameters(), lr=lr) +lr_scheduler = get_linear_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=(len(train_dataloader) * num_epochs), +) +``` + +Move the model to the accelerator and create a training loop that reports the loss and perplexity for each epoch. 
+ +```py +from tqdm import tqdm + +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" +model = model.to(device) + +for epoch in range(num_epochs): + model.train() + total_loss = 0 + for step, batch in enumerate(tqdm(train_dataloader)): + batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss + total_loss += loss.detach().float() + loss.backward() + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + model.eval() + eval_loss = 0 + eval_preds = [] + for step, batch in enumerate(tqdm(eval_dataloader)): + batch = {k: v.to(device) for k, v in batch.items()} + with torch.no_grad(): + outputs = model(**batch) + loss = outputs.loss + eval_loss += loss.detach().float() + eval_preds.extend( + tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True) + ) + + eval_epoch_loss = eval_loss / len(eval_dataloader) + eval_ppl = torch.exp(eval_epoch_loss) + train_epoch_loss = total_loss / len(train_dataloader) + train_ppl = torch.exp(train_epoch_loss) + print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}") +``` + +## Share your model + +After training is complete, you can upload your model to the Hub with the [`~transformers.PreTrainedModel.push_to_hub`] method. You'll need to log in to your Hugging Face account first and enter your token when prompted. + +```py +from huggingface_hub import notebook_login + +account = "your-name" +peft_model_id = f"{account}/mt0-large-ia3" +model.push_to_hub(peft_model_id) +``` + +## Inference + +To load the model for inference, use the [`~AutoPeftModelForSeq2SeqLM.from_pretrained`] method. Let's also load a sentence of financial news from the dataset to generate a sentiment for.
+ +```py +from peft import AutoPeftModelForSeq2SeqLM + +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + +model = AutoPeftModelForSeq2SeqLM.from_pretrained("your-name/mt0-large-ia3").to(device) +tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-large") + +i = 15 +inputs = tokenizer(ds["validation"][text_column][i], return_tensors="pt") +print(ds["validation"][text_column][i]) +"The robust growth was the result of the inclusion of clothing chain Lindex in the Group in December 2007 ." +``` + +Call the [`~transformers.GenerationMixin.generate`] method to generate the predicted sentiment label. + +```py +with torch.no_grad(): + inputs = {k: v.to(device) for k, v in inputs.items()} + outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10) + print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)) +['positive'] +``` diff --git a/peft/docs/source/task_guides/lora_based_methods.md b/peft/docs/source/task_guides/lora_based_methods.md new file mode 100644 index 0000000000000000000000000000000000000000..50be18884841f87dbfa375ba0099c07f8d967efd --- /dev/null +++ b/peft/docs/source/task_guides/lora_based_methods.md @@ -0,0 +1,344 @@ + + +# LoRA methods + +A popular way to efficiently train large models is to insert (typically in the attention blocks) smaller trainable matrices that are a low-rank decomposition of the delta weight matrix to be learnt during finetuning. The pretrained model's original weight matrix is frozen and only the smaller matrices are updated during training. This reduces the number of trainable parameters, reducing memory usage and training time which can be very expensive for large models. + +There are several different ways to express the weight matrix as a low-rank decomposition, but [Low-Rank Adaptation (LoRA)](../conceptual_guides/adapter#low-rank-adaptation-lora) is the most common method.
The PEFT library supports several other LoRA variants, such as [Low-Rank Hadamard Product (LoHa)](../conceptual_guides/adapter#low-rank-hadamard-product-loha), [Low-Rank Kronecker Product (LoKr)](../conceptual_guides/adapter#low-rank-kronecker-product-lokr), and [Adaptive Low-Rank Adaptation (AdaLoRA)](../conceptual_guides/adapter#adaptive-low-rank-adaptation-adalora). You can learn more about how these methods work conceptually in the [Adapters](../conceptual_guides/adapter) guide. If you're interested in applying these methods to other tasks and use cases like semantic segmentation, token classification, take a look at our [notebook collection](https://huggingface.co/collections/PEFT/notebooks-6573b28b33e5a4bf5b157fc1)! + +Additionally, PEFT supports the [X-LoRA](../conceptual_guides/adapter#mixture-of-lora-experts-x-lora) Mixture of LoRA Experts method. + +This guide will show you how to quickly train an image classification model - with a low-rank decomposition method - to identify the class of food shown in an image. + +> [!TIP] +> Some familiarity with the general process of training an image classification model would be really helpful and allow you to focus on the low-rank decomposition methods. If you're new, we recommend taking a look at the [Image classification](https://huggingface.co/docs/transformers/tasks/image_classification) guide first from the Transformers documentation. When you're ready, come back and see how easy it is to drop PEFT in to your training! + +Before you begin, make sure you have all the necessary libraries installed. + +```bash +pip install -q peft transformers datasets +``` + +## Dataset + +In this guide, you'll use the [Food-101](https://huggingface.co/datasets/food101) dataset which contains images of 101 food classes (take a look at the [dataset viewer](https://huggingface.co/datasets/food101/viewer/default/train) to get a better idea of what the dataset looks like). 
+ +Load the dataset with the [`~datasets.load_dataset`] function. + +```py +from datasets import load_dataset + +ds = load_dataset("food101") +``` + +Each food class is labeled with an integer, so to make it easier to understand what these integers represent, you'll create a `label2id` and `id2label` dictionary to map the integer to its class label. + +```py +labels = ds["train"].features["label"].names +label2id, id2label = dict(), dict() +for i, label in enumerate(labels): + label2id[label] = i + id2label[i] = label + +id2label[2] +"baklava" +``` + +Load an image processor to properly resize and normalize the pixel values of the training and evaluation images. + +```py +from transformers import AutoImageProcessor + +image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") +``` + +You can also use the image processor to prepare some transformation functions for data augmentation and pixel scaling. + +```py +from torchvision.transforms import ( + CenterCrop, + Compose, + Normalize, + RandomHorizontalFlip, + RandomResizedCrop, + Resize, + ToTensor, +) + +normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std) +train_transforms = Compose( + [ + RandomResizedCrop(image_processor.size["height"]), + RandomHorizontalFlip(), + ToTensor(), + normalize, + ] +) + +val_transforms = Compose( + [ + Resize(image_processor.size["height"]), + CenterCrop(image_processor.size["height"]), + ToTensor(), + normalize, + ] +) + +def preprocess_train(example_batch): + example_batch["pixel_values"] = [train_transforms(image.convert("RGB")) for image in example_batch["image"]] + return example_batch + +def preprocess_val(example_batch): + example_batch["pixel_values"] = [val_transforms(image.convert("RGB")) for image in example_batch["image"]] + return example_batch +``` + +Define the training and validation datasets, and use the [`~datasets.Dataset.set_transform`] function to apply the transformations on-the-fly. 
+ +```py +train_ds = ds["train"] +val_ds = ds["validation"] + +train_ds.set_transform(preprocess_train) +val_ds.set_transform(preprocess_val) +``` + +Finally, you'll need a data collator to create a batch of training and evaluation data and convert the labels to `torch.tensor` objects. + +```py +import torch + +def collate_fn(examples): + pixel_values = torch.stack([example["pixel_values"] for example in examples]) + labels = torch.tensor([example["label"] for example in examples]) + return {"pixel_values": pixel_values, "labels": labels} +``` + +## Model + +Now let's load a pretrained model to use as the base model. This guide uses the [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) model, but you can use any image classification model you want. Pass the `label2id` and `id2label` dictionaries to the model so it knows how to map the integer labels to their class labels, and you can optionally pass the `ignore_mismatched_sizes=True` parameter if you're finetuning a checkpoint that has already been finetuned. + +```py +from transformers import AutoModelForImageClassification, TrainingArguments, Trainer + +model = AutoModelForImageClassification.from_pretrained( + "google/vit-base-patch16-224-in21k", + label2id=label2id, + id2label=id2label, + ignore_mismatched_sizes=True, +) +``` + +### PEFT configuration and model + +Every PEFT method requires a configuration that holds all the parameters specifying how the PEFT method should be applied. Once the configuration is setup, pass it to the [`~peft.get_peft_model`] function along with the base model to create a trainable [`PeftModel`]. + +> [!TIP] +> Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of parameters of [`PeftModel`] versus the number of parameters in the base model! + + + + +[LoRA](../conceptual_guides/adapter#low-rank-adaptation-lora) decomposes the weight update matrix into *two* smaller matrices. 
The size of these low-rank matrices is determined by its *rank* or `r`. A higher rank means the model has more parameters to train, but it also means the model has more learning capacity. You'll also want to specify the `target_modules` which determine where the smaller matrices are inserted. For this guide, you'll target the *query* and *value* matrices of the attention blocks. Other important parameters to set are `lora_alpha` (scaling factor), `bias` (whether `none`, `all` or only the LoRA bias parameters should be trained), and `modules_to_save` (the modules apart from the LoRA layers to be trained and saved). All of these parameters - and more - are found in the [`LoraConfig`]. + +```py +from peft import LoraConfig, get_peft_model + +config = LoraConfig( + r=16, + lora_alpha=16, + target_modules=["query", "value"], + lora_dropout=0.1, + bias="none", + modules_to_save=["classifier"], +) +model = get_peft_model(model, config) +model.print_trainable_parameters() +"trainable params: 667,493 || all params: 86,543,818 || trainable%: 0.7712775047664294" +``` + + + + +[LoHa](../conceptual_guides/adapter#low-rank-hadamard-product-loha) decomposes the weight update matrix into *four* smaller matrices and each pair of smaller matrices is combined with the Hadamard product. This allows the weight update matrix to keep the same number of trainable parameters when compared to LoRA, but with a higher rank (`r^2` for LoHA when compared to `2*r` for LoRA). The size of the smaller matrices is determined by its *rank* or `r`. You'll also want to specify the `target_modules` which determines where the smaller matrices are inserted. For this guide, you'll target the *query* and *value* matrices of the attention blocks. Other important parameters to set are `alpha` (scaling factor), and `modules_to_save` (the modules apart from the LoHa layers to be trained and saved). All of these parameters - and more - are found in the [`LoHaConfig`]. 
+ +```py +from peft import LoHaConfig, get_peft_model + +config = LoHaConfig( + r=16, + alpha=16, + target_modules=["query", "value"], + module_dropout=0.1, + modules_to_save=["classifier"], +) +model = get_peft_model(model, config) +model.print_trainable_parameters() +"trainable params: 1,257,317 || all params: 87,133,642 || trainable%: 1.4429753779831676" +``` + + + + +[LoKr](../conceptual_guides/adapter#low-rank-kronecker-product-lokr) expresses the weight update matrix as a decomposition of a Kronecker product, creating a block matrix that is able to preserve the rank of the original weight matrix. The size of the smaller matrices are determined by its *rank* or `r`. You'll also want to specify the `target_modules` which determines where the smaller matrices are inserted. For this guide, you'll target the *query* and *value* matrices of the attention blocks. Other important parameters to set are `alpha` (scaling factor), and `modules_to_save` (the modules apart from the LoKr layers to be trained and saved). All of these parameters - and more - are found in the [`LoKrConfig`]. + +```py +from peft import LoKrConfig, get_peft_model + +config = LoKrConfig( + r=16, + alpha=16, + target_modules=["query", "value"], + module_dropout=0.1, + modules_to_save=["classifier"], +) +model = get_peft_model(model, config) +model.print_trainable_parameters() +"trainable params: 116,069 || all params: 87,172,042 || trainable%: 0.13314934162033282" +``` + + + + +[AdaLoRA](../conceptual_guides/adapter#adaptive-low-rank-adaptation-adalora) efficiently manages the LoRA parameter budget by assigning important weight matrices more parameters and pruning less important ones. In contrast, LoRA evenly distributes parameters across all modules. You can control the average desired *rank* or `r` of the matrices, and which modules to apply AdaLoRA to with `target_modules`. 
Other important parameters to set are `lora_alpha` (scaling factor), and `modules_to_save` (the modules apart from the AdaLoRA layers to be trained and saved). All of these parameters - and more - are found in the [`AdaLoraConfig`]. + +```py +from peft import AdaLoraConfig, get_peft_model + +config = AdaLoraConfig( + r=8, + init_r=12, + tinit=200, + tfinal=1000, + deltaT=10, + target_modules=["query", "value"], + modules_to_save=["classifier"], +) +model = get_peft_model(model, config) +model.print_trainable_parameters() +"trainable params: 520,325 || all params: 87,614,722 || trainable%: 0.5938785036606062" +``` + + + + +### Training + +For training, let's use the [`~transformers.Trainer`] class from Transformers. The [`Trainer`] contains a PyTorch training loop, and when you're ready, call [`~transformers.Trainer.train`] to start training. To customize the training run, configure the training hyperparameters in the [`~transformers.TrainingArguments`] class. With LoRA-like methods, you can afford to use a higher batch size and learning rate. + +> [!WARNING] +> AdaLoRA has an [`~AdaLoraModel.update_and_allocate`] method that should be called at each training step to update the parameter budget and mask, otherwise the adaptation step is not performed. This requires writing a custom training loop or subclassing the [`~transformers.Trainer`] to incorporate this method. As an example, take a look at this [custom training loop](https://github.com/huggingface/peft/blob/912ad41e96e03652cabf47522cd876076f7a0c4f/examples/conditional_generation/peft_adalora_seq2seq.py#L120). 
+ +```py +from transformers import TrainingArguments, Trainer + +account = "stevhliu" +peft_model_id = f"{account}/vit-base-patch16-224-in21k-lora" +batch_size = 128 + +args = TrainingArguments( + peft_model_id, + remove_unused_columns=False, + eval_strategy="epoch", + save_strategy="epoch", + learning_rate=5e-3, + per_device_train_batch_size=batch_size, + gradient_accumulation_steps=4, + per_device_eval_batch_size=batch_size, + fp16=True, + num_train_epochs=5, + logging_steps=10, + load_best_model_at_end=True, + label_names=["labels"], +) +``` + +Begin training with [`~transformers.Trainer.train`]. + +```py +trainer = Trainer( + model, + args, + train_dataset=train_ds, + eval_dataset=val_ds, + processing_class=image_processor, + data_collator=collate_fn, +) +trainer.train() +``` + +## Share your model + +Once training is complete, you can upload your model to the Hub with the [`~transformers.PreTrainedModel.push_to_hub`] method. You’ll need to log in to your Hugging Face account first and enter your token when prompted. + +```py +from huggingface_hub import notebook_login + +notebook_login() +``` + +Call [`~transformers.PreTrainedModel.push_to_hub`] to save your model to your repository. + +```py +model.push_to_hub(peft_model_id) +``` + +## Inference + +Let's load the model from the Hub and test it out on a food image. + +```py +from peft import PeftConfig, PeftModel +from transformers import AutoImageProcessor +from PIL import Image +import requests + +config = PeftConfig.from_pretrained("stevhliu/vit-base-patch16-224-in21k-lora") +model = AutoModelForImageClassification.from_pretrained( + config.base_model_name_or_path, + label2id=label2id, + id2label=id2label, + ignore_mismatched_sizes=True, +) +model = PeftModel.from_pretrained(model, "stevhliu/vit-base-patch16-224-in21k-lora") + +url = "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/beignets.jpeg" +image = Image.open(requests.get(url, stream=True).raw) +image +``` + +
+ +
+ +Convert the image to RGB and return the underlying PyTorch tensors. + +```py +encoding = image_processor(image.convert("RGB"), return_tensors="pt") +``` + +Now run the model and return the predicted class! + +```py +with torch.no_grad(): + outputs = model(**encoding) + logits = outputs.logits + +predicted_class_idx = logits.argmax(-1).item() +print("Predicted class:", model.config.id2label[predicted_class_idx]) +"Predicted class: beignets" +``` diff --git a/peft/docs/source/task_guides/prompt_based_methods.md b/peft/docs/source/task_guides/prompt_based_methods.md new file mode 100644 index 0000000000000000000000000000000000000000..eab59066b341febe6c7aa14246fe51ec3a153455 --- /dev/null +++ b/peft/docs/source/task_guides/prompt_based_methods.md @@ -0,0 +1,305 @@ + + +# Prompt-based methods + +A prompt can describe a task or provide an example of a task you want the model to learn. Instead of manually creating these prompts, soft prompting methods add learnable parameters to the input embeddings that can be optimized for a specific task while keeping the pretrained model's parameters frozen. This makes it both faster and easier to finetune large language models (LLMs) for new downstream tasks. + +The PEFT library supports several types of prompting methods (p-tuning, prefix tuning, prompt tuning) and you can learn more about how these methods work conceptually in the [Soft prompts](../conceptual_guides/prompting) guide. If you're interested in applying these methods to other tasks and use cases, take a look at our [notebook collection](https://huggingface.co/spaces/PEFT/soft-prompting)! + +This guide will show you how to train a causal language model - with a soft prompting method - to *generate a classification* for whether a tweet is a complaint or not. + +> [!TIP] +> Some familiarity with the general process of training a causal language model would be really helpful and allow you to focus on the soft prompting methods. 
If you're new, we recommend taking a look at the [Causal language modeling](https://huggingface.co/docs/transformers/tasks/language_modeling) guide first from the Transformers documentation. When you're ready, come back and see how easy it is to drop PEFT in to your training! + +Before you begin, make sure you have all the necessary libraries installed. + +```bash +pip install -q peft transformers datasets +``` + +## Dataset + +For this guide, you'll use the `twitter_complaints` subset of the [RAFT](https://huggingface.co/datasets/ought/raft) dataset. The `twitter_complaints` subset contains tweets labeled as `complaint` and `no complaint` and you can check out the [dataset viewer](https://huggingface.co/datasets/ought/raft/viewer/twitter_complaints) for a better idea of what the data looks like. + +Use the [`~datasets.load_dataset`] function to load the dataset and create a new `text_label` column so it is easier to understand what the `Label` values, `1` and `2` mean. + +```py +from datasets import load_dataset + +ds = load_dataset( + "parquet", + data_files={ + "train": "hf://datasets/ought/raft@refs/convert/parquet/twitter_complaints/train/0000.parquet", + "test": "hf://datasets/ought/raft@refs/convert/parquet/twitter_complaints/test/0000.parquet" + } +) + +classes = [k.replace("_", " ") for k in ds["train"].features["Label"].names] +ds = ds.map( + lambda x: {"text_label": [classes[label] for label in x["Label"]]}, + batched=True, + num_proc=1, +) +ds["train"][0] +{"Tweet text": "@HMRCcustomers No this is my first job", "ID": 0, "Label": 2, "text_label": "no complaint"} +``` + +Load a tokenizer, define the padding token to use, and determine the maximum length of the tokenized label. 
+ +```py +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m") +if tokenizer.pad_token_id is None: + tokenizer.pad_token_id = tokenizer.eos_token_id +target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in classes]) +print(target_max_length) +``` + +Create a preprocessing function that tokenizes the tweet text and labels, pad the inputs and labels in each batch, create an attention mask, and truncate sequences to the `max_length`. Then convert the `input_ids`, `attention_mask`, and `labels` to PyTorch tensors. + +```py +import torch + +max_length = 64 + +def preprocess_function(examples, text_column="Tweet text", label_column="text_label"): + batch_size = len(examples[text_column]) + inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]] + targets = [str(x) for x in examples[label_column]] + model_inputs = tokenizer(inputs) + labels = tokenizer(targets) + classes = [k.replace("_", " ") for k in ds["train"].features["Label"].names] + for i in range(batch_size): + sample_input_ids = model_inputs["input_ids"][i] + label_input_ids = labels["input_ids"][i] + model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * ( + max_length - len(sample_input_ids) + ) + sample_input_ids + model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[ + "attention_mask" + ][i] + labels["input_ids"][i] = [-100] * (max_length - len(label_input_ids)) + label_input_ids + model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length]) + model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length]) + labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length]) + model_inputs["labels"] = labels["input_ids"] + return model_inputs +``` + +Apply the preprocessing function to the entire dataset with the [`~datasets.Dataset.map`] function, and remove the unprocessed columns because the 
model won't need them. + +```py +processed_ds = ds.map( + preprocess_function, + batched=True, + num_proc=1, + remove_columns=ds["train"].column_names, + load_from_cache_file=False, + desc="Running tokenizer on dataset", +) +``` + +Finally, create a training and evaluation [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader). You can set `pin_memory=True` to speed up the data transfer to the GPU during training if the samples in your dataset are on a CPU. + +```py +from torch.utils.data import DataLoader +from transformers import default_data_collator + +train_ds = processed_ds["train"] +eval_ds = processed_ds["test"] + +batch_size = 16 + +train_dataloader = DataLoader(train_ds, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True) +eval_dataloader = DataLoader(eval_ds, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True) +``` + +## Model + +Now let's load a pretrained model to use as the base model for the soft prompt method. This guide uses the [bigscience/bloomz-560m](https://huggingface.co/bigscience/bloomz-560m) model, but you can use any causal language model you want. + +```py +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m") +``` + +### PEFT configuration and model + +For any PEFT method, you'll need to create a configuration which contains all the parameters that specify how the PEFT method should be applied. Once the configuration is setup, pass it to the [`~peft.get_peft_model`] function along with the base model to create a trainable [`PeftModel`]. + +> [!TIP] +> Call the [`~PeftModel.print_trainable_parameters`] method to compare the number of trainable parameters of [`PeftModel`] versus the number of parameters in the base model! + + + + +[P-tuning](../conceptual_guides/prompting#p-tuning) adds a trainable embedding tensor where the prompt tokens can be added anywhere in the input sequence. 
Create a [`PromptEncoderConfig`] with the task type, the number of virtual tokens to add and learn, and the hidden size of the encoder for learning the prompt parameters. + +```py +from peft import PromptEncoderConfig, get_peft_model + +peft_config = PromptEncoderConfig(task_type="CAUSAL_LM", num_virtual_tokens=20, encoder_hidden_size=128) +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +"trainable params: 300,288 || all params: 559,514,880 || trainable%: 0.05366935013417338" +``` + + + + +[Prefix tuning](../conceptual_guides/prompting#prefix-tuning) adds task-specific parameters in all of the model layers, which are optimized by a separate feed-forward network. Create a [`PrefixTuningConfig`] with the task type and number of virtual tokens to add and learn. + +```py +from peft import PrefixTuningConfig, get_peft_model + +peft_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20) +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +"trainable params: 983,040 || all params: 560,197,632 || trainable%: 0.1754809274167014" +``` + + + + +[Prompt tuning](../conceptual_guides/prompting#prompt-tuning) formulates all tasks as a *generation* task and it adds a task-specific prompt to the input which is updated independently. The `prompt_tuning_init_text` parameter specifies how to finetune the model (in this case, it is classifying whether tweets are complaints or not). For the best results, the `prompt_tuning_init_text` should have the same number of tokens that should be predicted. To do this, you can set `num_virtual_tokens` to the number of tokens of the `prompt_tuning_init_text`. + +Create a [`PromptTuningConfig`] with the task type, the initial prompt tuning text to train the model with, the number of virtual tokens to add and learn, and a tokenizer. 
+ +```py +from peft import PromptTuningConfig, PromptTuningInit, get_peft_model + +prompt_tuning_init_text = "Classify if the tweet is a complaint or no complaint.\n" +peft_config = PromptTuningConfig( + task_type="CAUSAL_LM", + prompt_tuning_init=PromptTuningInit.TEXT, + num_virtual_tokens=len(tokenizer(prompt_tuning_init_text)["input_ids"]), + prompt_tuning_init_text=prompt_tuning_init_text, + tokenizer_name_or_path="bigscience/bloomz-560m", +) +model = get_peft_model(model, peft_config) +model.print_trainable_parameters() +"trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358" +``` + + + + +### Training + +Set up an optimizer and learning rate scheduler. + +```py +from transformers import get_linear_schedule_with_warmup + +lr = 3e-2 +num_epochs = 50 + +optimizer = torch.optim.AdamW(model.parameters(), lr=lr) +lr_scheduler = get_linear_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=(len(train_dataloader) * num_epochs), +) +``` + +Move the model to the GPU and create a training loop that reports the loss and perplexity for each epoch. 
+ +```py +from tqdm import tqdm + +device = "cuda" +model = model.to(device) + +for epoch in range(num_epochs): + model.train() + total_loss = 0 + for step, batch in enumerate(tqdm(train_dataloader)): + batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss + total_loss += loss.detach().float() + loss.backward() + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + model.eval() + eval_loss = 0 + eval_preds = [] + for step, batch in enumerate(tqdm(eval_dataloader)): + batch = {k: v.to(device) for k, v in batch.items()} + with torch.no_grad(): + outputs = model(**batch) + loss = outputs.loss + eval_loss += loss.detach().float() + eval_preds.extend( + tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True) + ) + + eval_epoch_loss = eval_loss / len(eval_dataloader) + eval_ppl = torch.exp(eval_epoch_loss) + train_epoch_loss = total_loss / len(train_dataloader) + train_ppl = torch.exp(train_epoch_loss) + print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}") +``` + +## Share your model + +Once training is complete, you can upload your model to the Hub with the [`~transformers.PreTrainedModel.push_to_hub`] method. You'll need to login to your Hugging Face account first and enter your token when prompted. + +```py +from huggingface_hub import notebook_login + +account = +peft_model_id = f"{account}/bloomz-560-m-peft-method" +model.push_to_hub(peft_model_id) +``` + +If you check the model file size in the repository, you’ll see that it is a lot smaller than a full sized model! + +
+ +
For example, the adapter weights for an opt-350m model stored on the Hub are only ~6MB compared to the full model size which can be ~700MB.
+
+ +## Inference + +Let's load the model for inference and test it out on a tweet! + +```py +from peft import AutoPeftModelForCausalLM + +model = AutoPeftModelForCausalLM.from_pretrained("peft_model_id").to("cuda") +tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m") + +i = 15 +inputs = tokenizer(f'{text_column} : {ds["test"][i]["Tweet text"]} Label : ', return_tensors="pt") +print(ds["test"][i]["Tweet text"]) +"@NYTsupport i have complained a dozen times & yet my papers are still thrown FAR from my door. Why is this so hard to resolve?" +``` + +Call the [`~transformers.GenerationMixin.generate`] method to generate the predicted classification label. + +```py +with torch.no_grad(): + inputs = {k: v.to(device) for k, v in inputs.items()} + outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10) + print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)) +"['Tweet text : @NYTsupport i have complained a dozen times & yet my papers are still thrown FAR from my door. Why is this so hard to resolve? Label : complaint']" +``` diff --git a/peft/docs/source/tutorial/peft_integrations.md b/peft/docs/source/tutorial/peft_integrations.md new file mode 100644 index 0000000000000000000000000000000000000000..6892fddbcb051f8d92ea3aa841e5a427dc0586b4 --- /dev/null +++ b/peft/docs/source/tutorial/peft_integrations.md @@ -0,0 +1,152 @@ + + +# PEFT integrations + +PEFT's practical benefits extends to other Hugging Face libraries like [Diffusers](https://hf.co/docs/diffusers) and [Transformers](https://hf.co/docs/transformers). One of the main benefits of PEFT is that an adapter file generated by a PEFT method is a lot smaller than the original model, which makes it super easy to manage and use multiple adapters. You can use one pretrained base model for multiple tasks by simply loading a new adapter finetuned for the task you're solving. 
Or you can combine multiple adapters with a text-to-image diffusion model to create new effects. + +This tutorial will show you how PEFT can help you manage adapters in Diffusers and Transformers. + +## Diffusers + +Diffusers is a generative AI library for creating images and videos from text or images with diffusion models. LoRA is an especially popular training method for diffusion models because you can very quickly train and share diffusion models to generate images in new styles. To make it easier to use and try multiple LoRA models, Diffusers uses the PEFT library to help manage different adapters for inference. + +For example, load a base model and then load the [artificialguybr/3DRedmond-V1](https://huggingface.co/artificialguybr/3DRedmond-V1) adapter for inference with the [`load_lora_weights`](https://huggingface.co/docs/diffusers/v0.24.0/en/api/loaders/lora#diffusers.loaders.LoraLoaderMixin.load_lora_weights) method. The `adapter_name` argument in the loading method is enabled by PEFT and allows you to set a name for the adapter so it is easier to reference. + +```py +import torch +from diffusers import DiffusionPipeline + +pipeline = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 +).to("cuda") +pipeline.load_lora_weights( + "peft-internal-testing/artificialguybr__3DRedmond-V1", + weight_name="3DRedmond-3DRenderStyle-3DRenderAF.safetensors", + adapter_name="3d" +) +image = pipeline("sushi rolls shaped like kawaii cat faces").images[0] +image +``` + +
+ +
+ +Now let's try another cool LoRA model, [ostris/super-cereal-sdxl-lora](https://huggingface.co/ostris/super-cereal-sdxl-lora). All you need to do is load and name this new adapter with `adapter_name`, and use the [`set_adapters`](https://huggingface.co/docs/diffusers/api/loaders/unet#diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters) method to set it as the currently active adapter. + +```py +pipeline.load_lora_weights( + "ostris/super-cereal-sdxl-lora", + weight_name="cereal_box_sdxl_v1.safetensors", + adapter_name="cereal" +) +pipeline.set_adapters("cereal") +image = pipeline("sushi rolls shaped like kawaii cat faces").images[0] +image +``` + +
+ +
+ +Finally, you can call the [`disable_lora`](https://huggingface.co/docs/diffusers/api/loaders/unet#diffusers.loaders.UNet2DConditionLoadersMixin.disable_lora) method to restore the base model. + +```py +pipeline.disable_lora() +``` + +Learn more about how PEFT supports Diffusers in the [Inference with PEFT](https://huggingface.co/docs/diffusers/tutorials/using_peft_for_inference) tutorial. + +## Transformers + +🤗 [Transformers](https://hf.co/docs/transformers) is a collection of pretrained models for all types of tasks in all modalities. You can load these models for training or inference. Many of the models are large language models (LLMs), so it makes sense to integrate PEFT with Transformers to manage and train adapters. + +Load a base pretrained model to train. + +```py +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m") +``` + +Next, add an adapter configuration to specify how to adapt the model parameters. Call the [`~PeftModel.add_adapter`] method to add the configuration to the base model. + +```py +from peft import LoraConfig + +peft_config = LoraConfig( + lora_alpha=16, + lora_dropout=0.1, + r=64, + bias="none", + task_type="CAUSAL_LM" +) +model.add_adapter(peft_config) +``` + +Now you can train the model with Transformer's [`~transformers.Trainer`] class or whichever training framework you prefer. + +To use the newly trained model for inference, the [`~transformers.AutoModel`] class uses PEFT on the backend to load the adapter weights and configuration file into a base pretrained model. 
 + +```py +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("peft-internal-testing/opt-350m-lora") +``` + +Alternatively, you can use Transformers [Pipelines](https://huggingface.co/docs/transformers/en/main_classes/pipelines) to load the model for conveniently running inference: + +```py +from transformers import pipeline + +model = pipeline("text-generation", "peft-internal-testing/opt-350m-lora") +print(model("Hello World")) +``` + +If you're interested in comparing or using more than one adapter, you can call the [`~PeftModel.add_adapter`] method to add the adapter configuration to the base model. The only requirement is the adapter type must be the same (you can't mix a LoRA and LoHa adapter). + +```py +from transformers import AutoModelForCausalLM +from peft import LoraConfig + +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m") +model.add_adapter(lora_config_1, adapter_name="adapter_1") +``` + +Call [`~PeftModel.add_adapter`] again to attach a new adapter to the base model. + +```py +model.add_adapter(lora_config_2, adapter_name="adapter_2") +``` + +Then you can use [`~PeftModel.set_adapter`] to set the currently active adapter. + +```py +model.set_adapter("adapter_1") +output = model.generate(**inputs) +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + +To disable the adapter, call the [disable_adapters](https://github.com/huggingface/transformers/blob/4e3490f79b40248c53ee54365a9662611e880892/src/transformers/integrations/peft.py#L313) method. + +```py +model.disable_adapters() +``` + +The [enable_adapters](https://github.com/huggingface/transformers/blob/4e3490f79b40248c53ee54365a9662611e880892/src/transformers/integrations/peft.py#L336) method can be used to enable the adapters again. + +If you're curious, check out the [Load and train adapters with PEFT](https://huggingface.co/docs/transformers/main/peft) tutorial to learn more. 
diff --git a/peft/docs/source/tutorial/peft_model_config.md b/peft/docs/source/tutorial/peft_model_config.md new file mode 100644 index 0000000000000000000000000000000000000000..83aa7705da57a7feae5aa0bb6dc4d26cef18c9d8 --- /dev/null +++ b/peft/docs/source/tutorial/peft_model_config.md @@ -0,0 +1,179 @@ + + +# PEFT configurations and models + +The sheer size of today's large pretrained models - which commonly have billions of parameters - presents a significant training challenge because they require more storage space and more computational power to crunch all those calculations. You'll need access to powerful GPUs or TPUs to train these large pretrained models which is expensive, not widely accessible to everyone, not environmentally friendly, and not very practical. PEFT methods address many of these challenges. There are several types of PEFT methods (soft prompting, matrix decomposition, adapters), but they all focus on the same thing, reduce the number of trainable parameters. This makes it more accessible to train and store large models on consumer hardware. + +The PEFT library is designed to help you quickly train large models on free or low-cost GPUs, and in this tutorial, you'll learn how to setup a configuration to apply a PEFT method to a pretrained base model for training. Once the PEFT configuration is setup, you can use any training framework you like (Transformer's [`~transformers.Trainer`] class, [Accelerate](https://hf.co/docs/accelerate), a custom PyTorch training loop). + +## PEFT configurations + +> [!TIP] +> Learn more about the parameters you can configure for each PEFT method in their respective API reference page. + +A configuration stores important parameters that specify how a particular PEFT method should be applied. 
+ +For example, take a look at the following [`LoraConfig`](https://huggingface.co/ybelkada/opt-350m-lora/blob/main/adapter_config.json) for applying LoRA and [`PromptEncoderConfig`](https://huggingface.co/smangrul/roberta-large-peft-p-tuning/blob/main/adapter_config.json) for applying p-tuning (these configuration files are already JSON-serialized). Whenever you load a PEFT adapter, it is a good idea to check whether it has an associated adapter_config.json file which is required. + + + + +```json +{ + "base_model_name_or_path": "facebook/opt-350m", #base model to apply LoRA to + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", #PEFT method type + "r": 16, + "revision": null, + "target_modules": [ + "q_proj", #model modules to apply LoRA to (query and value projection layers) + "v_proj" + ], + "task_type": "CAUSAL_LM" #type of task to train model on +} +``` + +You can create your own configuration for training by initializing a [`LoraConfig`]. + +```py +from peft import LoraConfig, TaskType + +lora_config = LoraConfig( + r=16, + target_modules=["q_proj", "v_proj"], + task_type=TaskType.CAUSAL_LM, + lora_alpha=32, + lora_dropout=0.05 +) +``` + + + + +```json +{ + "base_model_name_or_path": "roberta-large", #base model to apply p-tuning to + "encoder_dropout": 0.0, + "encoder_hidden_size": 128, + "encoder_num_layers": 2, + "encoder_reparameterization_type": "MLP", + "inference_mode": true, + "num_attention_heads": 16, + "num_layers": 24, + "num_transformer_submodules": 1, + "num_virtual_tokens": 20, + "peft_type": "P_TUNING", #PEFT method type + "task_type": "SEQ_CLS", #type of task to train model on + "token_dim": 1024 +} +``` + +You can create your own configuration for training by initializing a [`PromptEncoderConfig`]. 
+ +```py +from peft import PromptEncoderConfig, TaskType + +p_tuning_config = PromptEncoderConfig( + encoder_reparameterization_type="MLP", + encoder_hidden_size=128, + num_attention_heads=16, + num_layers=24, + num_transformer_submodules=1, + num_virtual_tokens=20, + token_dim=1024, + task_type=TaskType.SEQ_CLS +) +``` + + + + +## PEFT models + +With a PEFT configuration in hand, you can now apply it to any pretrained model to create a [`PeftModel`]. Choose from any of the state-of-the-art models from the [Transformers](https://hf.co/docs/transformers) library, a custom model, and even new and unsupported transformer architectures. + +For this tutorial, load a base [facebook/opt-350m](https://huggingface.co/facebook/opt-350m) model to finetune. + +```py +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m") +``` + +Use the [`get_peft_model`] function to create a [`PeftModel`] from the base facebook/opt-350m model and the `lora_config` you created earlier. + +```py +from peft import get_peft_model + +lora_model = get_peft_model(model, lora_config) +lora_model.print_trainable_parameters() +"trainable params: 1,572,864 || all params: 332,769,280 || trainable%: 0.472659014678278" +``` + +> [!WARNING] +> When calling [`get_peft_model`], the base model will be modified *in-place*. That means, when calling [`get_peft_model`] on a model that was already modified in the same way before, this model will be further mutated. Therefore, if you would like to modify your PEFT configuration after having called [`get_peft_model()`] before, you would first have to unload the model with [`~LoraModel.unload`] and then call [`get_peft_model()`] with your new configuration. Alternatively, you can re-initialize the model to ensure a fresh, unmodified state before applying a new PEFT configuration. + +Now you can train the [`PeftModel`] with your preferred training framework! 
After training, you can save your model locally with [`~PeftModel.save_pretrained`] or upload it to the Hub with the [`~transformers.PreTrainedModel.push_to_hub`] method. + +```py +# save locally +lora_model.save_pretrained("your-name/opt-350m-lora") + +# push to Hub +lora_model.push_to_hub("your-name/opt-350m-lora") +``` + +To load a [`PeftModel`] for inference, you'll need to provide the [`PeftConfig`] used to create it and the base model it was trained from. + +```py +from peft import PeftModel, PeftConfig + +config = PeftConfig.from_pretrained("ybelkada/opt-350m-lora") +model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path) +lora_model = PeftModel.from_pretrained(model, "ybelkada/opt-350m-lora") +``` + +> [!TIP] +> By default, the [`PeftModel`] is set for inference, but if you'd like to train the adapter some more you can set `is_trainable=True`. +> +> ```py +> lora_model = PeftModel.from_pretrained(model, "ybelkada/opt-350m-lora", is_trainable=True) +> ``` + +The [`PeftModel.from_pretrained`] method is the most flexible way to load a [`PeftModel`] because it doesn't matter what model framework was used (Transformers, timm, a generic PyTorch model). Other classes, like [`AutoPeftModel`], are just a convenient wrapper around the base [`PeftModel`], and makes it easier to load PEFT models directly from the Hub or locally where the PEFT weights are stored. + +```py +from peft import AutoPeftModelForCausalLM + +lora_model = AutoPeftModelForCausalLM.from_pretrained("ybelkada/opt-350m-lora") +``` + +Take a look at the [AutoPeftModel](package_reference/auto_class) API reference to learn more about the [`AutoPeftModel`] classes. + +## Next steps + +With the appropriate [`PeftConfig`], you can apply it to any pretrained model to create a [`PeftModel`] and train large powerful models faster on freely available GPUs! 
To learn more about PEFT configurations and models, the following guide may be helpful: + +* Learn how to configure a PEFT method for models that aren't from Transformers in the [Working with custom models](../developer_guides/custom_models) guide. diff --git a/peft/examples/alora_finetuning/README.md b/peft/examples/alora_finetuning/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e6b8da0bcd8a68a3665b7d4a999f9c6983ab08bf --- /dev/null +++ b/peft/examples/alora_finetuning/README.md @@ -0,0 +1,76 @@ +# Activated LoRA (aLoRA) + +## Introduction +Activated LoRA (aLoRA) is an adapter that selectively activates its weights only after a given invocation sequence, ensuring that hidden states match the base model prior to this point. This allows reusing the base model KVs (stored in the KV cache) for tokens before the invocation, +enabling much faster real-world inference (e.g. vLLM) when switching between generation with the base model and generation with adapters. +See the [paper](https://huggingface.co/papers/2504.12397) for more details. 
+ +## Quick start (shown for Mistral 7B) +```python +import torch +from peft import LoraConfig, get_peft_model +from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, DataCollatorForLanguageModeling +from datasets import load_dataset + +model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3", device_map="cuda") +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3") +dataset = load_dataset("Lots-of-LoRAs/task1660_super_glue_question_generation", split="train") + +invocation_string = "[/INST]" # End of user turn in Mistral chat template +invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False) + +lora_config = LoraConfig( + task_type="CAUSAL_LM", + alora_invocation_tokens=invocation_tokens, + r=32, + target_modules=["q_proj", "k_proj", "v_proj"], +) + +peft_model = get_peft_model(model, lora_config) +data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) +trainer = Trainer( + model=peft_model, + train_dataset=dataset, + dataset_text_field="text", + max_seq_length=2048, + tokenizer=tokenizer, + data_collator=data_collator, +) +trainer.train() +peft_model.save_pretrained("alora-mistral-7b") +``` + +### Use the training example script directly +Pass the invocation string with `--invocation_string` when running the training example +script. 
For Mistral 7B, do: +```bash +python examples/alora_finetuning/alora_finetuning.py --base_model mistralai/Mistral-7B-Instruct-v0.3 --data_path Lots-of-LoRAs/task1660_super_glue_question_generation --invocation_string "[/INST]" +``` +and similarly for Llama-3.2-3B-Instruct: +```bash +python examples/alora_finetuning/alora_finetuning.py --base_model meta-llama/Llama-3.2-3B-Instruct --data_path Lots-of-LoRAs/task1660_super_glue_question_generation --invocation_string "<|start_header_id|>assistant<|end_header_id|>" +``` + +### Full example of the script +```bash +python alora_finetuning.py \ + --base_model "PATH_TO_MODEL" \ + --data_path "PATH_TO_DATASET" \ + --output_dir "PATH_TO_OUTPUT_DIR" \ + --batch_size 1 \ + --num_epochs 3 \ + --learning_rate 3e-4 \ + --cutoff_len 512 \ + --val_set_size 500 \ + --invocation_string "[/INST]" \ + --quantize \ + --eval_step 10 \ + --save_step 100 \ + --device "cuda:0" \ + --lora_r 32 \ + --lora_alpha 32 \ + --lora_dropout 0.05 \ + --lora_target_modules "q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj" \ + --hub_model_id "YOUR_HF_REPO" \ + --push_to_hub +``` diff --git a/peft/examples/alora_finetuning/alora_finetuning.py b/peft/examples/alora_finetuning/alora_finetuning.py new file mode 100644 index 0000000000000000000000000000000000000000..fb7073d6f02dfd468ce794d41faa66b324f926b7 --- /dev/null +++ b/peft/examples/alora_finetuning/alora_finetuning.py @@ -0,0 +1,251 @@ +import os + +import torch +from datasets import load_dataset +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + DataCollatorForLanguageModeling, + Trainer, + TrainingArguments, +) + +from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training + + +def train_model( + base_model: str, + data_path: str, + output_dir: str, + batch_size: int, + num_epochs: int, + learning_rate: float, + cutoff_len: int, + val_set_size: int, + invocation_string: str, + quantize: bool, + eval_step: int, + 
save_step: int, + device: str, + lora_r: int, + lora_alpha: int, + lora_dropout: float, + lora_target_modules: str, + hub_model_id: str, + push_to_hub: bool, +): + os.environ["TOKENIZERS_PARALLELISM"] = "false" + hf_token = os.getenv("HF_TOKEN") + + device = torch.device(device) + print(f"Using device: {device}") + + tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token) + tokenizer.pad_token = tokenizer.unk_token + invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False) + + if quantize: + model = AutoModelForCausalLM.from_pretrained( + base_model, + token=hf_token, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=( + torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16 + ), + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ), + ) + model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True) + else: + model = AutoModelForCausalLM.from_pretrained(base_model, token=hf_token) + + lora_config = LoraConfig( + task_type="CAUSAL_LM", + alora_invocation_tokens=invocation_tokens, + r=lora_r, + lora_alpha=lora_alpha, + target_modules=(lora_target_modules.split(",") if lora_target_modules else ["q_proj", "k_proj", "v_proj"]), + lora_dropout=lora_dropout, + bias="none", + ) + + model = get_peft_model(model, lora_config) + + model.to(device) + tokenizer.pad_token = tokenizer.eos_token + + dataset = load_dataset(data_path) + + def tokenize_function(examples): + formatted_texts = [ + tokenizer.apply_chat_template( + [ + {"role": "user", "content": user_msg}, + {"role": "assistant", "content": assistant_msg}, + ], + tokenize=False, # get plain text first + add_generation_prompt=False, + ) + for user_msg, assistant_msg in zip(examples["input"], examples["output"]) + ] + + # 2) Tokenize those texts + model_inputs = tokenizer( + formatted_texts, + padding="max_length", + truncation=True, + max_length=cutoff_len, + ) + + labels 
= [] + for ids in model_inputs["input_ids"]: + labels.append([(token_id if token_id != tokenizer.pad_token_id else -100) for token_id in ids]) + model_inputs["labels"] = labels + + return model_inputs + + # Tokenize the dataset and prepare for training + tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names) + + # Data collator to dynamically pad the batched examples + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) + + training_args = TrainingArguments( + output_dir=output_dir, + num_train_epochs=num_epochs, + per_device_train_batch_size=batch_size, + per_device_eval_batch_size=batch_size, + warmup_steps=100, + weight_decay=0.01, + logging_dir="./logs", + logging_steps=eval_step, + save_steps=save_step, + save_total_limit=2, + push_to_hub=push_to_hub, + hub_model_id=hub_model_id, + gradient_accumulation_steps=16, + fp16=True, + learning_rate=learning_rate, + hub_token=hf_token, + ) + + torch.cuda.empty_cache() + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["test"], + data_collator=data_collator, + ) + + trainer.train() + + if push_to_hub: + trainer.push_to_hub(commit_message="Fine-tuned model") + + model.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) + + +def model_inference(model_path: str, adapter_path: str, prompt: str = None, data_path: str = None): + """ + Simple inference with the tuned aLoRA adapter. Optionally (reuse_cache = True) demonstrates + that the aLoRA adapter can (but does not need to) use KV cache created by the base model, + perhaps during a prior generation turn. + + Purely for demonstration purposes. See the [paper](https://huggingface.co/papers/2504.12397) + for realistic multiturn cache reuse examples. 
+ """ + if prompt is None: + # Use first row of test data + dataset = load_dataset(data_path) + prompt = dataset["test"][0]["input"] + tokenizer = AutoTokenizer.from_pretrained(model_path) + base_model = AutoModelForCausalLM.from_pretrained(model_path) + alora_model = PeftModel.from_pretrained(base_model, adapter_path) + chat = [{"role": "user", "content": prompt}] + text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True) + inputs = tokenizer(text, return_tensors="pt").to(base_model.device) + + # Generate answer with adapter + output_dict = alora_model.generate(**inputs, return_dict_in_generate=True, max_new_tokens=20) + alora_outputs = output_dict.sequences + + # Print results + print(f"Prompt: {text}") + response = tokenizer.decode(alora_outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True) + print(f"Trained adapter response: {response}") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Fine-tune Mistral with Activated LoRA") + parser.add_argument( + "--base_model", type=str, default="mistralai/Mistral-7B-Instruct-v0.3", help="Base model path or name" + ) + parser.add_argument( + "--data_path", + type=str, + default="Lots-of-LoRAs/task1660_super_glue_question_generation", + help="Dataset path or name", + ) + parser.add_argument( + "--output_dir", type=str, default="path/to/output", help="Output directory for the fine-tuned model" + ) + parser.add_argument("--batch_size", type=int, default=2, help="Batch size") + parser.add_argument("--num_epochs", type=int, default=1, help="Number of training epochs") + parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate") + parser.add_argument("--cutoff_len", type=int, default=2048, help="Cutoff length for tokenization") + parser.add_argument("--val_set_size", type=int, default=500, help="Validation set size") + parser.add_argument( + "--invocation_string", + type=str, + default="[/INST]", + 
help="String that activates the aLoRA adapter. Model dependent.", + ) + parser.add_argument("--quantize", action="store_true", help="Use quantization") + parser.add_argument("--eval_step", type=int, default=10, help="Evaluation step interval") + parser.add_argument("--save_step", type=int, default=100, help="Save step interval") + parser.add_argument("--device", type=str, default="cuda:0", help="Device to use for training") + parser.add_argument("--lora_r", type=int, default=32, help="LoRA rank") + parser.add_argument("--lora_alpha", type=int, default=32, help="LoRA alpha") + parser.add_argument("--lora_dropout", type=float, default=0.05, help="LoRA dropout rate") + parser.add_argument( + "--lora_target_modules", type=str, default=None, help="Comma-separated list of target modules for LoRA" + ) + parser.add_argument( + "--hub_model_id", + type=str, + default="path/to/repo", + help="Repository name to push the model on the Hugging Face Hub", + ) + parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to Hugging Face Hub") + args = parser.parse_args() + train_model( + base_model=args.base_model, + data_path=args.data_path, + output_dir=args.output_dir, + batch_size=args.batch_size, + num_epochs=args.num_epochs, + learning_rate=args.learning_rate, + cutoff_len=args.cutoff_len, + val_set_size=args.val_set_size, + invocation_string=args.invocation_string, + quantize=args.quantize, + eval_step=args.eval_step, + save_step=args.save_step, + device=args.device, + lora_r=args.lora_r, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + lora_target_modules=args.lora_target_modules, + hub_model_id=args.hub_model_id, + push_to_hub=args.push_to_hub, + ) + print("Model trained. 
Running test inference.") + model_inference(model_path=args.base_model, adapter_path=args.output_dir, data_path=args.data_path) diff --git a/peft/examples/arrow_multitask/arrow_phi3_mini.py b/peft/examples/arrow_multitask/arrow_phi3_mini.py new file mode 100644 index 0000000000000000000000000000000000000000..a249f7c67585478e04fbc25c3b9833fe0d7384cf --- /dev/null +++ b/peft/examples/arrow_multitask/arrow_phi3_mini.py @@ -0,0 +1,375 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This script provides a simple evaluation pipeline for multiple-choice reasoning datasets +(e.g., BoolQ, HellaSwag, ARC, OpenBookQA, Winogrande) with different composition strategies. 
+ +Usage examples: + python arrow_phi3_mini.py --strategy base --ds_name arc-challenge + python arrow_phi3_mini.py --strategy arrow --ds_name boolq + python arrow_phi3_mini.py --strategy gks --ds_name hswag + +Key features: +- Supports three strategies: + • "base" → Evaluate the quantized base model directly + • "arrow" → Use Arrow modular routing with task-specific adapters + • "gks" → Use Arrow + GenKnowSub (subtracting general-domain knowledge) +- Loads evaluation datasets from the Hugging Face Hub +- Implements a batched evaluation loop that computes per-option likelihoods and selects + the answer with the lowest average loss +- Reports simple accuracy + +Implementation details: +- The base model is quantized to 4-bit using `BitsAndBytesConfig` (nf4, bf16 compute). +- For Arrow and GKS, task-specific adapters are loaded from the Hugging Face Hub: + TahaBa/phi3-mini-clustered-flan/ts_expert_i +- Task-specific adapters were trained on 10 clusters of FLAN tasks. +- The clusters were created using Model-Based Clustering (MBC): + 1. Train a LoRA adapter for each individual task. + 2. Apply k-means clustering to group tasks based on these adapters. + 3. Train a LoRA adapter for each resulting cluster. +For more details, see the Arrow paper: https://huggingface.co/papers/2405.11157 + +- For GKS, general adapters are loaded from: + TahaBa/phi3-mini-general-adapters/... +- These adapters were trained on English, French, and German Wikipedia data + using a causal language modeling objective with (507-token context → 5-token completion) pairs. +- This setup encodes general knowledge into the LoRA space, which can then be + subtracted from task-specific adapters during inference to isolate and purify them. +For more details, see the GenKnowSub paper: https://huggingface.co/papers/2505.10939 + +- `evaluate_on_multi_choice_batched` handles tokenization, masking context tokens, + and computing per-choice log-likelihoods for fair comparison. 
+- Accuracy is printed at the end for the selected dataset. + +This script is mainly meant for demonstration purposes and lightweight evaluation, +not full-scale benchmarking (batch size / max length can be tuned). + +======================================================================================= + +Results (evaluated with microsoft/Phi-3-mini-4k-instruct, 4-bit quantization): + +| Dataset | Base Acc. | Arrow Acc. | Arrow+GKS Acc. | +|--------------|-----------|------------|----------------| +| ARC-Challenge| 0.4515 | 0.5418 | 0.5585 | +| ARC-Easy | 0.6894 | 0.8404 | 0.8473 | +| Winogrande | 0.5769 | 0.6550 | 0.6724 | +| BoolQ | 0.8146 | 0.8030 | 0.8247 | +| OpenBookQA | 0.43 | 0.448 | 0.472 | +| HellaSwag | 0.7318 | 0.7150 | 0.7376 | + +Observations: +- Arrow generally improves over the base model by routing tokens to the most relevant task adapters. +- Applying GKS (general knowledge subtraction) consistently gives further gains compared to Arrow and Base. + +These numbers are not meant as leaderboard results, but as a sanity check +to verify that the implementation works as expected and demonstrates +the benefits of Arrow and GenKnowSub. 
def read_test_dataset(ds_name):
    """Load the validation split of one of the supported multiple-choice datasets.

    Args:
        ds_name: Short dataset name, one of "boolq", "hswag", "arc-challenge", "arc-easy",
            "oqa" or "wg".

    Returns:
        The dataset object returned by `load_dataset` for the "validation" split.

    Raises:
        ValueError: If `ds_name` is not one of the supported names.
    """
    # Positional arguments for `load_dataset`: hub path, optionally followed by a config name
    hub_locations = {
        "boolq": ("google/boolq",),
        "hswag": ("Rowan/hellaswag",),
        "arc-challenge": ("allenai/ai2_arc", "ARC-Challenge"),
        "arc-easy": ("allenai/ai2_arc", "ARC-Easy"),
        "oqa": ("allenai/openbookqa",),
        "wg": ("allenai/winogrande", "winogrande_xl"),
    }

    try:
        location = hub_locations[ds_name]
    except KeyError:
        # Raising a bare string is itself a TypeError; raise a real exception instead
        raise ValueError(f"Dataset {ds_name} is not supported yet.") from None

    return load_dataset(*location, split="validation", trust_remote_code=True)
def evaluate_on_multi_choice_batched(
    eval_dataset, model, tokenizer, ds_name, labels, predictions, args, batch_size=32, max_length=512, device="cuda"
):
    """Score every answer option of every example with the model and print the accuracy.

    For each example, one text per answer option is built with `create_multi_choice_options`.
    All option texts of a batch are tokenized together, the leading context tokens and the
    padding positions are set to -100 in the labels, and `compute_loglike_loss` is called on the
    model logits. Within each example, the option with the smallest loss becomes the prediction.

    Args:
        eval_dataset: Dataset supporting `len()` and integer indexing.
        model: Model called with `input_ids` and `attention_mask`; its output must have `.logits`.
        tokenizer: Tokenizer used for both the context prompt and the option texts.
        ds_name: Dataset name forwarded to the option/label helper functions.
        labels: List that the gold option indices are appended to (mutated in place).
        predictions: List that the predicted option indices are appended to (mutated in place).
        args: Object with `ds_name` and `strategy` attributes; only used for the printed message.
        batch_size: Number of examples (not option texts) processed per forward pass.
        max_length: Truncation length passed to the tokenizer.
        device: Device the tokenized tensors are moved to.
    """
    # Switch the model to evaluation mode before scoring
    model.eval()

    for start in tqdm(
        range(0, len(eval_dataset), batch_size), total=(len(eval_dataset) + batch_size - 1) // batch_size
    ):
        rows = [eval_dataset[i] for i in range(start, min(start + batch_size, len(eval_dataset)))]

        # Build the flattened option texts for this batch
        all_texts = []
        options_per_sample = []  # number of options for each sample
        ctx_lens_per_option = []  # context length replicated per option

        for row in rows:
            # options: ["<|user|>...<|assistant|>choiceA<|end|>", ...]
            options = create_multi_choice_options(row, ds_name)
            options_per_sample.append(len(options))

            # Compute the context length once per sample: one less than the number of tokens of
            # the context prompt.
            # NOTE(review): the original comment says the -1 is there to "align with your -1
            # shift"; the exact reason is not visible in this function - confirm.
            content = extract_input_content(ds_name, row)
            context_prompt = f"<|user|>\n{content}<|end|>\n<|assistant|>"
            ctx_len = len(tokenizer.encode(context_prompt)) - 1

            all_texts.extend(options)
            ctx_lens_per_option.extend([ctx_len] * len(options))

            # collect gold label
            labels.append(extract_multi_choice_target_index(row, ds_name))

        # Tokenize all options in one go
        tokenized = tokenizer(
            all_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        tokenized = {k: v.to(device) for k, v in tokenized.items()}

        # Create masked labels: ignore context and padding
        masked_labels = tokenized["input_ids"].clone()
        for i, ctx_len in enumerate(ctx_lens_per_option):
            masked_labels[i, :ctx_len] = -100
        masked_labels[tokenized["attention_mask"] == 0] = -100

        with torch.no_grad():
            logits = model(input_ids=tokenized["input_ids"], attention_mask=tokenized["attention_mask"]).logits
            # per-sequence losses
            losses = compute_loglike_loss(logits, masked_labels, reduction="none").detach().cpu()

        # Reduce per sample (argmin across its options); `idx` walks through the flattened
        # losses in steps of the number of options of each sample
        idx = 0
        for n_opt in options_per_sample:
            pred = torch.argmin(losses[idx : idx + n_opt]).item()
            predictions.append(pred)
            idx += n_opt

    print(
        f"Accuracy for dataset {args.ds_name} and strategy {args.strategy} is: {accuracy_score(labels, predictions)}"
    )
bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_use_double_quant=False, + ) + + # Loading the model + base_model = AutoModelForCausalLM.from_pretrained( + MODEL_NAME, + torch_dtype=torch.bfloat16, + device_map="auto", + quantization_config=bnb_config, + ) + + # Loading the test dataset + test_dataset = read_test_dataset(args.ds_name) + print(f"{args.ds_name} is loaded with size: {len(test_dataset)}.") + + labels, predictions = [], [] + if args.strategy == "base": + # Batch-wise inference + with torch.no_grad(): + evaluate_on_multi_choice_batched( + test_dataset, + base_model, + tokenizer, + args.ds_name, + labels, + predictions, + args, + batch_size=64, # tune this + max_length=512, # tune if options are long + device="cuda", + ) + else: + general_adapter_paths = [] + if args.strategy == "gks": + arrow_config = ArrowConfig( + top_k=3, + router_temperature=1.0, + use_gks=True, + ) + # General adapter paths from the hub + general_adapter_paths = [ + "TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langen/checkpoint-17", + "TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langfr/checkpoint-35", + "TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langger/checkpoint-17", + ] + else: + arrow_config = ArrowConfig( + top_k=3, + router_temperature=1.0, + ) + + # Task-specific adapter paths from the hub + task_specific_adapter_paths = [f"TahaBa/phi3-mini-clustered-flan/ts_expert_{i}" for i in range(10)] + + # Creating the Arrow model + model = create_arrow_model( + base_model=base_model, + task_specific_adapter_paths=task_specific_adapter_paths, + general_adapter_paths=general_adapter_paths, + arrow_config=arrow_config, + ) + + # Batch-wise inference + with torch.no_grad(): + evaluate_on_multi_choice_batched( + test_dataset, + model, + tokenizer, + args.ds_name, + labels, + predictions, + args, + batch_size=32, # tune this + max_length=512, # tune if options are long + device="cuda", + ) diff --git 
a/peft/examples/arrow_multitask/requirements.txt b/peft/examples/arrow_multitask/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fb1d2c4152f707035f9609a6d86319deda58e1c --- /dev/null +++ b/peft/examples/arrow_multitask/requirements.txt @@ -0,0 +1,8 @@ +torch +transformers +accelerate +datasets +scikit-learn +tqdm +numpy +bitsandbytes diff --git a/peft/examples/boft_controlnet/__init__.py b/peft/examples/boft_controlnet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/peft/examples/boft_controlnet/boft_controlnet.md b/peft/examples/boft_controlnet/boft_controlnet.md new file mode 100644 index 0000000000000000000000000000000000000000..57777f18591e5e4aff0fe312dff636c687cd4ef4 --- /dev/null +++ b/peft/examples/boft_controlnet/boft_controlnet.md @@ -0,0 +1,177 @@ + + + +# Fine-tuning for controllable generation with BOFT (ControlNet) + +This guide demonstrates how to use BOFT, an orthogonal fine-tuning method, to fine-tune Stable Diffusion with either `stabilityai/stable-diffusion-2-1` or `runwayml/stable-diffusion-v1-5` model for controllable generation. + +By using BOFT from 🤗 PEFT, we can significantly reduce the number of trainable parameters while still achieving impressive results in various fine-tuning tasks across different foundation models. BOFT enhances model efficiency by integrating full-rank orthogonal matrices with a butterfly structure into specific model blocks, such as attention blocks, mirroring the approach used in LoRA. During fine-tuning, only these inserted matrices are trained, leaving the original model parameters untouched. During inference, the trainable BOFT parameters can be merged into the original model, eliminating any additional computational costs. + +As a member of the **orthogonal finetuning** class, BOFT presents a systematic and principled method for fine-tuning. 
It possesses several unique properties and has demonstrated superior performance compared to LoRA in a variety of scenarios. For further details on BOFT, please consult the [PEFT's GitHub repo's concept guide OFT](https://huggingface.co/docs/peft/index), the [original BOFT paper](https://huggingface.co/papers/2311.06243) and the [original OFT paper](https://huggingface.co/papers/2306.07280). + +In this guide we provide a controllable generation (ControlNet) fine-tuning script that is available in [PEFT's GitHub repo examples](https://github.com/huggingface/peft/tree/main/examples/boft_controlnet). This implementation is adapted from [diffusers's ControlNet](https://github.com/huggingface/diffusers/tree/main/examples/controlnet) and [Hecong Wu's ControlLoRA](https://github.com/HighCWu/ControlLoRA). You can try it out and finetune on your custom images. + +## Set up your environment +Start by cloning the PEFT repository: + +```bash +git clone https://github.com/huggingface/peft +``` + +Navigate to the directory containing the training scripts for fine-tuning Stable Diffusion with BOFT for controllable generation: +```bash +cd peft/examples/boft_controlnet +``` + +Set up your environment: install PEFT, and all the required libraries. At the time of writing this guide we recommend installing PEFT from source. + +```bash +conda create --name peft python=3.10 +conda activate peft +conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=11.8 -c pytorch -c nvidia +conda install xformers -c xformers +pip install -r requirements.txt +pip install git+https://github.com/huggingface/peft +``` + +## Data + +We use the [control-celeba-hq](https://huggingface.co/datasets/oftverse/control-celeba-hq) dataset for landmark-to-face controllable generation. We also provide evaluation scripts to evaluate the controllable generation performance. This task can be used to quantitatively compare different fine-tuning techniques. 
+ +```bash +export DATASET_NAME="oftverse/control-celeba-hq" +``` + +## Train controllable generation (ControlNet) with BOFT + +Start with setting some hyperparameters for BOFT: +```bash +PEFT_TYPE="boft" +BLOCK_NUM=8 +BLOCK_SIZE=0 +N_BUTTERFLY_FACTOR=0 +``` + +Here: + + +Navigate to the directory containing the training scripts for fine-tuning Stable Diffusion with BOFT for controllable generation: + +```bash +./train_controlnet.sh +``` +or +```bash +export MODEL_NAME="stabilityai/stable-diffusion-2-1" +# export MODEL_NAME="runwayml/stable-diffusion-v1-5" + +export DATASET_NAME="oftverse/control-celeba-hq" +export PROJECT_NAME="controlnet_${PEFT_TYPE}" +export RUN_NAME="${PEFT_TYPE}_${BLOCK_NUM}${BLOCK_SIZE}${N_BUTTERFLY_FACTOR}" +export CONTROLNET_PATH="" +export OUTPUT_DIR="./output/${DATASET_NAME}/${RUN_NAME}" + +accelerate launch train_controlnet.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --resume_from_checkpoint=$RESUME_PATH \ + --controlnet_model_name_or_path=$CONTROLNET_PATH \ + --output_dir=$OUTPUT_DIR \ + --report_to="wandb" \ + --dataset_name=$DATASET_NAME \ + --resolution=512 \ + --learning_rate=1e-5 \ + --checkpointing_steps=5000 \ + --max_train_steps=50000 \ + --validation_steps=2000 \ + --num_validation_images=12 \ + --train_batch_size=4 \ + --dataloader_num_workers=2 \ + --seed="0" \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --wandb_project_name=$PROJECT_NAME \ + --wandb_run_name=$RUN_NAME \ + --enable_xformers_memory_efficient_attention \ + --use_boft \ + --boft_block_num=$BLOCK_NUM \ + --boft_block_size=$BLOCK_SIZE \ + --boft_n_butterfly_factor=$N_BUTTERFLY_FACTOR \ + --boft_dropout=0.1 \ + --boft_bias="boft_only" \ + --report_to="wandb" \ +``` + +Run inference on the saved model to sample new images from the validation set: + +```bash +./test_controlnet.sh +``` +or +```bash +ITER_NUM=50000 + +export MODEL_NAME="stabilityai/stable-diffusion-2-1" +# export MODEL_NAME="runwayml/stable-diffusion-v1-5" + +export 
RUN_NAME="${PEFT_TYPE}_${BLOCK_NUM}${BLOCK_SIZE}${N_BUTTERFLY_FACTOR}" +export DATASET_NAME="oftverse/control-celeba-hq" +export CKPT_NAME="checkpoint-${ITER_NUM}" +export OUTPUT_DIR="./output/${DATASET_NAME}/${RUN_NAME}/${CKPT_NAME}" +export CONTROLNET_PATH="${OUTPUT_DIR}/controlnet/model.safetensors" +export UNET_PATH="${OUTPUT_DIR}/unet/${RUN_NAME}" +export RESULTS_PATH="${OUTPUT_DIR}/results" + +accelerate launch test_controlnet.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$DATASET_NAME \ + --controlnet_path=$CONTROLNET_PATH \ + --unet_path=$UNET_PATH \ + --adapter_name=$RUN_NAME \ + --output_dir=$RESULTS_PATH \ + --dataset_name=$DATASET_NAME \ + +``` + +Run evaluation on the sampled images to evaluate the landmark reprojection error: + +```bash +./eval.sh +``` +or +```bash +ITER_NUM=50000 + +export MODEL_NAME="stabilityai/stable-diffusion-2-1" +# export MODEL_NAME="runwayml/stable-diffusion-v1-5" + +export RUN_NAME="${PEFT_TYPE}_${BLOCK_NUM}${BLOCK_SIZE}${N_BUTTERFLY_FACTOR}" +export DATASET_NAME="oftverse/control-celeba-hq" +export CKPT_NAME="checkpoint-${ITER_NUM}" +export OUTPUT_DIR="./output/${DATASET_NAME}/${RUN_NAME}/${CKPT_NAME}" +export CONTROLNET_PATH="${OUTPUT_DIR}/controlnet/model.safetensors" +export UNET_PATH="${OUTPUT_DIR}/unet/${RUN_NAME}" + +accelerate launch eval.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$DATASET_NAME \ + --controlnet_path=$CONTROLNET_PATH \ + --unet_path=$UNET_PATH \ + --adapter_name=$RUN_NAME \ + --output_dir=$OUTPUT_DIR \ + --dataset_name=$DATASET_NAME \ + --vis_overlays \ +``` diff --git a/peft/examples/boft_controlnet/eval.py b/peft/examples/boft_controlnet/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..ea53af7698e8195cc165a3726c7a5ab34f5d3bb4 --- /dev/null +++ b/peft/examples/boft_controlnet/eval.py @@ -0,0 +1,206 @@ +# Copyright 2023-present the HuggingFace Inc. team. 
def plot_kpts(image, kpts, color="g"):
    """Draw key points, joined by line segments, on a copy of an image.

    Args:
        image: the input image; it is copied, the input is not modified.
        kpts: key points, one row per point with x and y in the first two columns, e.g. (68, 3).
            If there are exactly 4 columns, the last column overrides `color` per point:
            values above 0.5 are drawn with (0, 255, 0), all others with (0, 0, 255).
        color: "r", "g" or "b"; color of the points when `kpts` does not have 4 columns.

    Returns:
        The copy of `image` with points and connecting lines drawn on it.

    Raises:
        ValueError: If `color` is not supported and `kpts` carries no per-point column.
    """
    # "b" used to map to the same tuple as "r".
    # NOTE(review): the tuples are written in RGB order, while OpenCV interprets colors as BGR
    # on images loaded with cv2.imread - confirm the intended channel order.
    palette = {"r": (255, 0, 0), "g": (0, 255, 0), "b": (0, 0, 255)}
    per_point_color = kpts.shape[1] == 4
    c = palette.get(color)
    if c is None and not per_point_color:
        raise ValueError(f"Unsupported color {color!r}, expected one of 'r', 'g' or 'b'.")

    image = image.copy()
    kpts = kpts.copy()
    radius = max(int(min(image.shape[0], image.shape[1]) / 200), 1)
    num_kpts = kpts.shape[0]
    for i in range(num_kpts):
        st = kpts[i, :2]
        if per_point_color:
            c = (0, 255, 0) if kpts[i, 3] > 0.5 else (0, 0, 255)
        image = cv2.circle(image, (int(st[0]), int(st[1])), radius, c, radius * 2)
        # No segment is drawn from the last point of a contour (indices in `end_list`), nor from
        # the very last point, which has no successor to connect to
        if i in end_list or i + 1 >= num_kpts:
            continue
        ed = kpts[i + 1, :2]
        image = cv2.line(image, (int(st[0]), int(st[1])), (int(ed[0]), int(ed[1])), (255, 255, 255), radius)
    return image
gt_lmk_image = cv2.imread(gt_lmk_path) + + # visualize predicted landmarks + vis_path = os.path.join(pred_lmk_dir, f"{idx}_overlay.jpg") + image = cv2.imread(imagepath) + image_point = plot_kpts(image, pred_kpt) + cv2.imwrite(vis_path, np.concatenate([image_point, gt_lmk_image], axis=1)) + + # visualize gt landmarks + vis_path = os.path.join(gt_lmk_dir, f"{idx}_overlay.jpg") + image = cv2.imread(gt_img_path) + image_point = plot_kpts(image, gt_kpt) + cv2.imwrite(vis_path, np.concatenate([image_point, gt_lmk_image], axis=1)) + + +def landmark_comparison(val_dataset, lmk_dir, gt_lmk_dir): + print("Calculating reprojection error") + lmk_err = [] + + pbar = tqdm(range(len(val_dataset))) + for i in pbar: + # line = val_dataset[i] + # img_name = line["image"].split(".")[0] + lmk1_path = os.path.join(gt_lmk_dir, f"{i}.txt") + lmk1 = np.loadtxt(lmk1_path) + lmk2_path = os.path.join(lmk_dir, f"{i}.txt") + + if not os.path.exists(lmk2_path): + print(f"{lmk2_path} not exist") + continue + + lmk2 = np.loadtxt(lmk2_path) + lmk_err.append(np.mean(np.linalg.norm(lmk1 - lmk2, axis=1))) + pbar.set_description(f"lmk_err: {np.mean(lmk_err):.5f}") + + print("Reprojection error:", np.mean(lmk_err)) + np.save(os.path.join(lmk_dir, "lmk_err.npy"), lmk_err) + + +def main(args): + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_dir=logging_dir, + ) + + # Load the tokenizer + if args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False) + elif args.pretrained_model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="tokenizer", + revision=args.revision, + use_fast=False, + ) + + val_dataset = make_dataset(args, tokenizer, accelerator, "test") + + gt_lmk_dir = os.path.join(args.output_dir, 
"gt_lmk") + if not os.path.exists(gt_lmk_dir): + os.makedirs(gt_lmk_dir, exist_ok=True) + + pred_lmk_dir = os.path.join(args.output_dir, "pred_lmk") + if not os.path.exists(pred_lmk_dir): + os.makedirs(pred_lmk_dir, exist_ok=True) + + input_dir = os.path.join(args.output_dir, "results") + + generate_landmark2d(val_dataset, input_dir, pred_lmk_dir, gt_lmk_dir, args.vis_overlays) + + if count_txt_files(pred_lmk_dir) == len(val_dataset) and count_txt_files(gt_lmk_dir) == len(val_dataset): + landmark_comparison(val_dataset, pred_lmk_dir, gt_lmk_dir) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/peft/examples/boft_controlnet/eval.sh b/peft/examples/boft_controlnet/eval.sh new file mode 100644 index 0000000000000000000000000000000000000000..d5ed282ea1dab4af179220a4af85873cc6e6db61 --- /dev/null +++ b/peft/examples/boft_controlnet/eval.sh @@ -0,0 +1,29 @@ +PEFT_TYPE="boft" +BLOCK_NUM=8 +BLOCK_SIZE=0 +N_BUTTERFLY_FACTOR=1 +ITER_NUM=50000 + +export RUN_NAME="${PEFT_TYPE}_${BLOCK_NUM}${BLOCK_SIZE}${N_BUTTERFLY_FACTOR}" + +export MODEL_NAME="stabilityai/stable-diffusion-2-1" +# export MODEL_NAME="runwayml/stable-diffusion-v1-5" + +export DATASET_NAME="oftverse/control-celeba-hq" +export CKPT_NAME="checkpoint-${ITER_NUM}" +export OUTPUT_DIR="./output/${DATASET_NAME}/${RUN_NAME}/${CKPT_NAME}" +export CONTROLNET_PATH="${OUTPUT_DIR}/controlnet/model.safetensors" +export UNET_PATH="${OUTPUT_DIR}/unet/${RUN_NAME}" + + +accelerate launch eval.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$DATASET_NAME \ + --controlnet_path=$CONTROLNET_PATH \ + --unet_path=$UNET_PATH \ + --adapter_name=$RUN_NAME \ + --output_dir=$OUTPUT_DIR \ + --dataset_name=$DATASET_NAME \ + --vis_overlays \ + + diff --git a/peft/examples/boft_controlnet/requirements.txt b/peft/examples/boft_controlnet/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..b03040914e9d647b604fc6451970cc66502bde48 --- /dev/null +++ 
b/peft/examples/boft_controlnet/requirements.txt @@ -0,0 +1,10 @@ +datasets==2.16.1 +diffusers==0.34.0 +transformers==4.54.0 +accelerate==1.9.0 +wandb==0.16.1 +scikit-image==0.22.0 +opencv-python==4.9.0.80 +git+https://github.com/1adrianb/face-alignment.git +huggingface_hub==0.34.3 +numpy<2.0.0 \ No newline at end of file diff --git a/peft/examples/boft_controlnet/test_controlnet.py b/peft/examples/boft_controlnet/test_controlnet.py new file mode 100644 index 0000000000000000000000000000000000000000..2080deb0a7c59f519673df09909d29075d08a992 --- /dev/null +++ b/peft/examples/boft_controlnet/test_controlnet.py @@ -0,0 +1,134 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The implementation is based on "Parameter-Efficient Orthogonal Finetuning +# via Butterfly Factorization" (https://huggingface.co/papers/2311.06243) in ICLR 2024. 
def main(args):
    """Generate one image per test sample that has no output file yet.

    Loads the ControlNet weights from ``args.controlnet_path`` and the PEFT
    adapter from ``args.unet_path``, builds a ``LightControlNetPipeline`` with
    a DDIM scheduler, and writes ``pred_img_<idx>.png`` into
    ``args.output_dir`` for every dataset index not already present there.

    Raises:
        ValueError: If neither ``args.tokenizer_name`` nor
            ``args.pretrained_model_name_or_path`` is set.
    """
    logging_dir = Path(args.output_dir, args.logging_dir)

    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
        log_with=args.report_to,
        project_dir=logging_dir,
    )

    # Load the tokenizer
    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False)
    elif args.pretrained_model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            args.pretrained_model_name_or_path,
            subfolder="tokenizer",
            revision=args.revision,
            use_fast=False,
        )
    else:
        raise ValueError("Either tokenizer_name or pretrained_model_name_or_path must be provided.")

    val_dataset = make_dataset(args, tokenizer, accelerator, "test")

    controlnet_path = args.controlnet_path
    unet_path = args.unet_path

    controlnet = ControlNetModel()
    controlnet.load_state_dict(load_file(controlnet_path))
    unet = UNet2DConditionNewModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
    unet = PeftModel.from_pretrained(unet, unet_path, adapter_name=args.adapter_name)

    pipe = LightControlNetPipeline.from_pretrained(
        args.pretrained_model_name_or_path,
        controlnet=controlnet,
        unet=unet.model,
        torch_dtype=torch.float32,
        requires_safety_checker=False,
    ).to(device)

    pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)

    os.makedirs(args.output_dir, exist_ok=True)

    # Collect indices that already have an output. Only names this script
    # itself writes ("pred_img_<digits>.png") are parsed, so unrelated files
    # or sub-directories in the output directory cannot raise ValueError.
    prefix, suffix = "pred_img_", ".png"
    done = set()
    for name in os.listdir(args.output_dir):
        if name.startswith(prefix) and name.endswith(suffix):
            digits = name[len(prefix) : -len(suffix)]
            if digits.isdigit():
                done.add(int(digits))

    # Set membership keeps this linear in the dataset size.
    idx_lst = [idx for idx in range(len(val_dataset)) if idx not in done]

    print("Number of images to be processed: ", len(idx_lst))

    # NOTE(review): the time-based seed presumably lets concurrently running
    # workers visit indices in different orders -- confirm intent.
    np.random.seed(seed=int(time.time()))
    np.random.shuffle(idx_lst)

    negative_prompt = "low quality, blurry, unfinished"

    for idx in tqdm(idx_lst):
        output_path = os.path.join(args.output_dir, f"pred_img_{idx:04d}.png")

        # Re-check on disk: the file may have appeared since the listing above.
        if os.path.exists(output_path):
            continue

        data = val_dataset[idx]

        with torch.no_grad():
            pred_img = pipe(
                data["text"],
                [data["conditioning_pixel_values"]],
                num_inference_steps=50,
                guidance_scale=7,
                negative_prompt=negative_prompt,
            ).images[0]

        pred_img.save(output_path)


if __name__ == "__main__":
    args = parse_args()
    main(args)
export MODEL_NAME="runwayml/stable-diffusion-v1-5" + +export DATASET_NAME="oftverse/control-celeba-hq" +export CKPT_NAME="checkpoint-${ITER_NUM}" +export OUTPUT_DIR="./output/${DATASET_NAME}/${RUN_NAME}/${CKPT_NAME}" +export CONTROLNET_PATH="${OUTPUT_DIR}/controlnet/model.safetensors" +export UNET_PATH="${OUTPUT_DIR}/unet" +export RESULTS_PATH="${OUTPUT_DIR}/results" + + +accelerate launch test_controlnet.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$DATASET_NAME \ + --controlnet_path=$CONTROLNET_PATH \ + --unet_path=$UNET_PATH \ + --adapter_name=$RUN_NAME \ + --output_dir=$RESULTS_PATH \ + --dataset_name=$DATASET_NAME \ + + diff --git a/peft/examples/boft_controlnet/train_controlnet.py b/peft/examples/boft_controlnet/train_controlnet.py new file mode 100644 index 0000000000000000000000000000000000000000..f085549060b718a1bc392ff976b5adb47bdaad80 --- /dev/null +++ b/peft/examples/boft_controlnet/train_controlnet.py @@ -0,0 +1,545 @@ +#!/usr/bin/env python +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The implementation is based on "Parameter-Efficient Orthogonal Finetuning +# via Butterfly Factorization" (https://huggingface.co/papers/2311.06243) in ICLR 2024. 
+ +import itertools +import logging +import math +import os +from pathlib import Path + +import datasets +import diffusers +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import set_seed +from diffusers import ( + AutoencoderKL, + DDIMScheduler, +) +from diffusers.optimization import get_scheduler +from diffusers.utils import check_min_version +from diffusers.utils.import_utils import is_xformers_available +from packaging import version +from tqdm.auto import tqdm +from transformers import AutoTokenizer +from utils.args_loader import ( + import_model_class_from_model_name_or_path, + parse_args, +) +from utils.dataset import collate_fn, log_validation, make_dataset +from utils.light_controlnet import ControlNetModel +from utils.tracemalloc import TorchTracemalloc, b2mb +from utils.unet_2d_condition import UNet2DConditionNewModel + +from peft import BOFTConfig, get_peft_model +from peft.peft_model import PeftModel + + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
@torch.no_grad()
def save_adaptor(accelerator, output_dir, nets_dict):
    """Save every network in ``nets_dict`` to ``<output_dir>/<key>``.

    ``PeftModel`` instances are written with ``save_pretrained``; any other
    module is written with ``accelerator.save_model``. Both use safetensors.

    Args:
        accelerator: The ``Accelerator`` used to unwrap and serialize models.
        output_dir: Parent directory; one sub-directory is created per key.
        nets_dict: Mapping from sub-directory name to (possibly wrapped) model.
    """
    for net_key, net_model in nets_dict.items():
        unwrapped_net = accelerator.unwrap_model(net_model)
        target_dir = os.path.join(output_dir, net_key)

        if isinstance(unwrapped_net, PeftModel):
            unwrapped_net.save_pretrained(
                target_dir,
                state_dict=accelerator.get_state_dict(net_model),
                safe_serialization=True,
            )
        else:
            accelerator.save_model(
                unwrapped_net,
                target_dir,
                safe_serialization=True,
            )


def _sorted_checkpoints(names, suffix=""):
    """Return entries starting with "checkpoint" and ending in ``suffix``, ordered by step."""
    picked = [d for d in names if d.startswith("checkpoint") and d.endswith(suffix)]
    return sorted(picked, key=lambda d: int(d.split("-")[1]))


def main(args):
    """Train the light ControlNet, optionally with BOFT adapters on the UNet.

    Sets up the accelerator, tokenizer, models, optimizer, data and LR
    scheduler, optionally resumes from a checkpoint, then runs the training
    loop with periodic validation and checkpointing.

    Raises:
        ValueError: If no tokenizer source is given, xformers is requested but
            unavailable, a trainable model is not float32, or the scheduler's
            prediction type is unknown.
        ImportError: If 8-bit Adam is requested without bitsandbytes.
    """
    logging_dir = Path(args.output_dir, args.logging_dir)

    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
        log_with=args.report_to,
        project_dir=logging_dir,
    )

    # Always bound: init_trackers() below receives it whatever tracker is
    # selected, so leaving it undefined for non-wandb runs raised NameError.
    wandb_init = {}
    if args.report_to == "wandb":
        wandb_init = {
            "wandb": {
                "name": args.wandb_run_name,
                "mode": "online",
            }
        }

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    logger.info(accelerator.state, main_process_only=False)

    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_warning()
        diffusers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
        diffusers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)

    # Load the tokenizer
    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False)
    elif args.pretrained_model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            args.pretrained_model_name_or_path,
            subfolder="tokenizer",
            revision=args.revision,
            use_fast=False,
        )
    else:
        raise ValueError("Either tokenizer_name or pretrained_model_name_or_path must be provided.")

    # import correct text encoder class
    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)

    # Load scheduler and models
    noise_scheduler = DDIMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")

    text_encoder = text_encoder_cls.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
    )
    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
    unet = UNet2DConditionNewModel.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="unet",
        revision=args.revision,
    )

    controlnet = ControlNetModel()

    # Truthiness covers both "" and None, so an unset path is never handed
    # to torch.load.
    if args.controlnet_model_name_or_path:
        logger.info(f"Loading existing controlnet weights from {args.controlnet_model_name_or_path}")
        controlnet.load_state_dict(torch.load(args.controlnet_model_name_or_path))

    if args.use_boft:
        config = BOFTConfig(
            boft_block_size=args.boft_block_size,
            boft_block_num=args.boft_block_num,
            boft_n_butterfly_factor=args.boft_n_butterfly_factor,
            target_modules=UNET_TARGET_MODULES,
            boft_dropout=args.boft_dropout,
            bias=args.boft_bias,
        )
        unet = get_peft_model(unet, config)
        unet.print_trainable_parameters()

    vae.requires_grad_(False)
    controlnet.requires_grad_(True)

    if not args.train_text_encoder:
        text_encoder.requires_grad_(False)

    unet.train()
    controlnet.train()

    if args.train_text_encoder and args.use_boft:
        config = BOFTConfig(
            boft_block_size=args.boft_block_size,
            boft_block_num=args.boft_block_num,
            boft_n_butterfly_factor=args.boft_n_butterfly_factor,
            target_modules=TEXT_ENCODER_TARGET_MODULES,
            boft_dropout=args.boft_dropout,
            bias=args.boft_bias,
        )
        text_encoder = get_peft_model(text_encoder, config, adapter_name=args.wandb_run_name)
        text_encoder.print_trainable_parameters()

    if args.train_text_encoder:
        text_encoder.train()

    # For mixed precision training we cast the text_encoder and vae weights to half-precision
    # as these models are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    # Move unet, vae and text_encoder to device and cast to weight_dtype
    # NOTE(review): unet and controlnet are cast to weight_dtype here, yet the
    # float32 checks further down raise for any other dtype, so fp16/bf16
    # runs appear to stop there -- confirm whether that is intended.
    unet.to(accelerator.device, dtype=weight_dtype)
    vae.to(accelerator.device, dtype=weight_dtype)
    controlnet.to(accelerator.device, dtype=weight_dtype)

    if not args.train_text_encoder:
        text_encoder.to(accelerator.device, dtype=weight_dtype)

    if args.enable_xformers_memory_efficient_attention:
        if accelerator.device.type == "xpu":
            logger.warning("XPU doesn't support xformers yet, xformers is not applied.")
        elif is_xformers_available():
            import xformers

            xformers_version = version.parse(xformers.__version__)
            if xformers_version == version.parse("0.0.16"):
                logger.warning(
                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
                )
            unet.enable_xformers_memory_efficient_attention()
            controlnet.enable_xformers_memory_efficient_attention()
            if args.train_text_encoder and not (args.use_lora or args.use_boft or args.use_oft):
                text_encoder.enable_xformers_memory_efficient_attention()
        else:
            raise ValueError("xformers is not available. Make sure it is installed correctly")

    if args.gradient_checkpointing:
        controlnet.enable_gradient_checkpointing()
        unet.enable_gradient_checkpointing()
        if args.train_text_encoder and not (args.use_lora or args.use_boft or args.use_oft):
            text_encoder.gradient_checkpointing_enable()

    # Check that all trainable models are in full precision
    low_precision_error_string = (
        " Please make sure to always have all model weights in full float32 precision when starting training - even if"
        " doing mixed precision training, copy of the weights should still be float32."
    )

    if accelerator.unwrap_model(controlnet).dtype != torch.float32:
        raise ValueError(
            f"Controlnet loaded as datatype {accelerator.unwrap_model(controlnet).dtype}. {low_precision_error_string}"
        )

    if accelerator.unwrap_model(unet).dtype != torch.float32:
        raise ValueError(
            f"UNet loaded as datatype {accelerator.unwrap_model(unet).dtype}. {low_precision_error_string}"
        )

    # Enable TF32 for faster training on Ampere GPUs,
    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
    if args.allow_tf32:
        torch.backends.cuda.matmul.allow_tf32 = True

    if args.scale_lr:
        args.learning_rate = (
            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
        )

    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
    if args.use_8bit_adam:
        try:
            import bitsandbytes as bnb
        except ImportError:
            raise ImportError(
                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
            )

        optimizer_class = bnb.optim.AdamW8bit
    else:
        optimizer_class = torch.optim.AdamW

    params_to_optimize = [param for param in controlnet.parameters() if param.requires_grad]
    params_to_optimize += [param for param in unet.parameters() if param.requires_grad]

    if args.train_text_encoder:
        params_to_optimize += [param for param in text_encoder.parameters() if param.requires_grad]

    # Optimizer creation
    optimizer = optimizer_class(
        params_to_optimize,
        lr=args.learning_rate,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        eps=args.adam_epsilon,
    )

    # Load the dataset
    train_dataset = make_dataset(args, tokenizer, accelerator, "train")
    val_dataset = make_dataset(args, tokenizer, accelerator, "test")

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        shuffle=True,
        collate_fn=collate_fn,
        batch_size=args.train_batch_size,
        num_workers=args.dataloader_num_workers,
    )

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True

    lr_scheduler = get_scheduler(
        args.lr_scheduler,
        optimizer=optimizer,
        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
        num_cycles=args.lr_num_cycles,
        power=args.lr_power,
    )

    # Prepare everything with our `accelerator`.
    # NOTE(review): unet is optimized but is not passed to prepare() here --
    # confirm this is intended for multi-process runs.
    controlnet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        controlnet, optimizer, train_dataloader, lr_scheduler
    )

    if args.train_text_encoder:
        text_encoder = accelerator.prepare(text_encoder)

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # We need to initialize the trackers we use, and also store our configuration.
    # The trackers initializes automatically on the main process.
    if accelerator.is_main_process:
        accelerator.init_trackers(args.wandb_project_name, config=vars(args), init_kwargs=wandb_init)

    # Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len(train_dataset)}")
    logger.info(f" Num batches each epoch = {len(train_dataloader)}")
    logger.info(f" Num Epochs = {args.num_train_epochs}")
    logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
    logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Total optimization steps = {args.max_train_steps}")

    global_step = 0
    first_epoch = 0

    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        if args.resume_from_checkpoint != "latest":
            path = os.path.basename(args.resume_from_checkpoint)
        else:
            # Get the most recent checkpoint
            entries = os.listdir(args.output_dir)
            if "checkpoint-current" in entries:
                path = "checkpoint-current"
            else:
                dirs = _sorted_checkpoints(entries)
                path = dirs[-1] if dirs else None

        if path is None:
            accelerator.print(
                f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
            )
            args.resume_from_checkpoint = None
            initial_global_step = 0
        else:
            accelerator.print(f"Resuming from checkpoint {path}")
            accelerator.load_state(os.path.join(args.output_dir, path))
            if path.split("-")[1] == "current":
                # "checkpoint-current" has no step in its name; take the step
                # from the newest numbered checkpoint whose name ends in "0".
                # Listing here (not only in the "latest" branch) keeps an
                # explicit "checkpoint-current" path from hitting an
                # undefined variable, and an empty list falls back to 0.
                rounded = _sorted_checkpoints(os.listdir(args.output_dir), suffix="0")
                global_step = int(rounded[-1].split("-")[1]) if rounded else 0
            else:
                global_step = int(path.split("-")[1])

            initial_global_step = global_step
            resume_global_step = global_step * args.gradient_accumulation_steps
            first_epoch = global_step // num_update_steps_per_epoch
            resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
    else:
        initial_global_step = 0

    progress_bar = tqdm(
        range(0, args.max_train_steps),
        initial=initial_global_step,
        desc="Steps",
        disable=not accelerator.is_local_main_process,
    )

    progress_bar.set_description("Steps")

    for epoch in range(first_epoch, args.num_train_epochs):
        with TorchTracemalloc() as tracemalloc:
            for step, batch in enumerate(train_dataloader):
                # Skip steps until we reach the resumed step
                if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
                    if step % args.gradient_accumulation_steps == 0:
                        progress_bar.update(1)
                        if args.report_to == "wandb":
                            accelerator.print(progress_bar)
                    continue

                with accelerator.accumulate(controlnet), accelerator.accumulate(unet):
                    # Convert images to latent space
                    latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
                    latents = latents * vae.config.scaling_factor

                    # Sample noise that we'll add to the latents
                    noise = torch.randn_like(latents)
                    bsz = latents.shape[0]

                    # Sample a random timestep for each image
                    timesteps = torch.randint(
                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
                    )
                    timesteps = timesteps.long()

                    # Add noise to the latents according to the noise magnitude at each timestep
                    # (this is the forward diffusion process)
                    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

                    # Get the text embedding for conditioning
                    encoder_hidden_states = text_encoder(batch["input_ids"])[0]

                    controlnet_image = batch["conditioning_pixel_values"].to(dtype=weight_dtype)

                    # Get the guided hint for the UNet (320 dim)
                    guided_hint = controlnet(
                        controlnet_cond=controlnet_image,
                    )

                    # Predict the noise residual
                    model_pred = unet(
                        noisy_latents,
                        timesteps,
                        guided_hint=guided_hint,
                        encoder_hidden_states=encoder_hidden_states,
                    ).sample

                    # Get the target for loss depending on the prediction type
                    if noise_scheduler.config.prediction_type == "epsilon":
                        target = noise
                    elif noise_scheduler.config.prediction_type == "v_prediction":
                        target = noise_scheduler.get_velocity(latents, noise, timesteps)
                    else:
                        raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")

                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")

                    accelerator.backward(loss)

                    if accelerator.sync_gradients:
                        # NOTE(review): unet parameters are optimized but are
                        # not part of the clipped set -- confirm intent.
                        params_to_clip = (
                            itertools.chain(controlnet.parameters(), text_encoder.parameters())
                            if args.train_text_encoder
                            else itertools.chain(
                                controlnet.parameters(),
                            )
                        )

                        accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)

                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad(set_to_none=args.set_grads_to_none)

                # Checks if the accelerator has performed an optimization step behind the scenes
                if accelerator.sync_gradients:
                    progress_bar.update(1)
                    if args.report_to == "wandb":
                        accelerator.print(progress_bar)
                    global_step += 1

                    step_save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")

                    if accelerator.is_main_process:
                        if global_step % args.validation_steps == 0 or global_step == 1:
                            logger.info(f"Running validation... \n Generating {args.num_validation_images} images.")
                            logger.info("Running validation... ")

                            with torch.no_grad():
                                log_validation(val_dataset, text_encoder, unet, controlnet, args, accelerator)

                        if global_step % args.checkpointing_steps == 0:
                            save_adaptor(accelerator, step_save_path, {"controlnet": controlnet, "unet": unet})

                            # save text_encoder if any
                            if args.train_text_encoder:
                                save_adaptor(accelerator, step_save_path, {"text_encoder": text_encoder})

                            accelerator.save_state(step_save_path)

                            logger.info(f"Saved {global_step} state to {step_save_path}")
                            logger.info(f"Saved current state to {step_save_path}")

                logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
                progress_bar.set_postfix(**logs)
                accelerator.log(logs, step=global_step)

                if global_step >= args.max_train_steps:
                    break

        # Printing the GPU memory usage details such as allocated memory, peak memory, and total memory usage
        accelerator.print(
            f"{accelerator.device.type.upper()} Memory before entering the train : {b2mb(tracemalloc.begin)}"
        )
        accelerator.print(
            f"{accelerator.device.type.upper()} Memory consumed at the end of the train (end-begin): {tracemalloc.used}"
        )
        accelerator.print(
            f"{accelerator.device.type.upper()} Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}"
        )
        accelerator.print(
            f"{accelerator.device.type.upper()} Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
        )

        accelerator.print(f"CPU Memory before entering the train : {b2mb(tracemalloc.cpu_begin)}")
        accelerator.print(f"CPU Memory consumed at the end of the train (end-begin): {tracemalloc.cpu_used}")
        accelerator.print(f"CPU Peak Memory consumed during the train (max-begin): {tracemalloc.cpu_peaked}")
        accelerator.print(
            f"CPU Total Peak Memory consumed during the train (max): {tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin)}"
        )

    # Make sure every process has finished, then close the trackers.
    accelerator.wait_for_everyone()
    accelerator.end_training()


if __name__ == "__main__":
    args = parse_args()
    main(args)
def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
    """Return ``"<namespace>/<model_id>"`` for a Hub repository.

    The namespace is ``organization`` when given; otherwise it is the
    ``"name"`` field returned by ``whoami`` for the token. When ``token`` is
    ``None`` it is read via ``HfFolder.get_token()``.
    """
    if token is None:
        token = HfFolder.get_token()

    namespace = organization if organization is not None else whoami(token)["name"]
    return f"{namespace}/{model_id}"


def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
    """Return the text-encoder class named in the model's ``text_encoder`` config.

    Reads the first entry of ``architectures`` from the ``text_encoder``
    sub-folder config and imports the matching class lazily.

    Raises:
        ValueError: If the architecture is neither ``CLIPTextModel`` nor
            ``RobertaSeriesModelWithTransformation``.
    """
    encoder_config = PretrainedConfig.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="text_encoder",
        revision=revision,
    )
    architecture = encoder_config.architectures[0]

    if model_is_clip := (architecture == "CLIPTextModel"):
        from transformers import CLIPTextModel

        return CLIPTextModel

    if architecture == "RobertaSeriesModelWithTransformation":
        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import (
            RobertaSeriesModelWithTransformation,
        )

        return RobertaSeriesModelWithTransformation

    raise ValueError(f"{architecture} is not supported.")
model or model identifier from huggingface.co/models." + " If not specified controlnet weights are initialized from unet.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help=( + "Revision of pretrained model identifier from huggingface.co/models. Trainable model components should be" + " float32 precision." + ), + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--output_dir", + type=str, + default="controlnet-model", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--cache_dir", + type=str, + default=None, + help="The directory where the downloaded models and datasets will be stored.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder") + + parser.add_argument( + "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument( + "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." + ) + + parser.add_argument("--num_train_epochs", type=int, default=1) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. 
" + "In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference." + "Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components." + "See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step" + "instructions." + ), + ) + parser.add_argument( + "--checkpoints_total_limit", + type=int, + default=None, + help=("Max number of checkpoints to store."), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' + ), + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-6, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." 
+ ) + parser.add_argument( + "--lr_num_cycles", + type=int, + default=1, + help="Number of hard resets of the lr in cosine_with_restarts scheduler.", + ) + parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." + ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=0, + help=( + "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." + ), + ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. 
For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="wandb", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' + ' `"wandb"` (default) and `"comet_ml"`. Use `"all"` to report to all integrations.' + ), + ) + parser.add_argument( + "--wandb_key", + type=str, + default=None, + help=("If report to option is set to wandb, api-key for wandb used for login to wandb "), + ) + parser.add_argument( + "--wandb_project_name", + type=str, + default=None, + help=("If report to option is set to wandb, project name in wandb for log tracking "), + ) + parser.add_argument( + "--wandb_run_name", + type=str, + default=None, + help=("If report to option is set to wandb, run name in wandb for log tracking "), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10 and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." + ) + parser.add_argument( + "--set_grads_to_none", + action="store_true", + help=( + "Save more memory by setting grads to None instead of zero. Be aware, that this changes certain" + " behaviors, so disable this argument if it causes any problems. 
More info:" + " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html" + ), + ) + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help=( + "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," + " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," + " or to a folder containing files that 🤗 Datasets can understand." + ), + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The config of the Dataset, leave as None if there's only one config.", + ) + parser.add_argument( + "--train_data_dir", + type=str, + default=None, + help=( + "A folder containing the training data. Folder contents must follow the structure described in" + " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" + " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." + ), + ) + parser.add_argument( + "--image_column", type=str, default="image", help="The column of the dataset containing the target image." + ) + parser.add_argument( + "--conditioning_image_column", + type=str, + default="conditioning_image", + help="The column of the dataset containing the controlnet conditioning image.", + ) + parser.add_argument( + "--caption_column", + type=str, + default="text", + help="The column of the dataset containing a caption or a list of captions.", + ) + parser.add_argument( + "--max_train_samples", + type=int, + default=None, + help=( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + ), + ) + parser.add_argument( + "--proportion_empty_prompts", + type=float, + default=0, + help="Proportion of image prompts to be replaced with empty strings. 
Defaults to 0 (no prompt replacement).", + ) + parser.add_argument( + "--validation_prompt", + type=str, + default=None, + nargs="+", + help=( + "A set of prompts evaluated every `--validation_steps` and logged to `--report_to`." + " Provide either a matching number of `--validation_image`s, a single `--validation_image`" + " to be used with all prompts, or a single prompt that will be used with all `--validation_image`s." + ), + ) + parser.add_argument( + "--validation_image", + type=str, + default=None, + nargs="+", + help=( + "A set of paths to the controlnet conditioning images to be evaluated every `--validation_steps`" + " and logged to `--report_to`. Provide either a matching number of `--validation_prompt`s," + " a single `--validation_prompt` to be used with all `--validation_image`s, or a single" + " `--validation_image` that will be used with all `--validation_prompt`s." + ), + ) + parser.add_argument( + "--num_validation_images", + type=int, + default=4, + help="Number of images to be generated for each `--validation_image`, `--validation_prompt` pair", + ) + parser.add_argument( + "--validation_steps", + type=int, + default=100, + help=( + "Run validation every X steps. Validation consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`" + " and logging the images." 
+ ), + ) + parser.add_argument( + "--tracker_project_name", + type=str, + default="train_controlnet", + help=( + "The `project_name` argument passed to Accelerator.init_trackers. For" + " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator" + ), + ) + + # evaluation arguments + parser.add_argument("--controlnet_path", type=str, default=None, help="Path to pretrained controlnet.") + parser.add_argument("--unet_path", type=str, default=None, help="Path to pretrained unet.") + parser.add_argument("--adapter_name", type=str, default=None, help="Name of the adapter to use.") + parser.add_argument("--vis_overlays", action="store_true", help="Whether to visualize the landmarks.") + + # self-invented arguments + + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + + parser.add_argument( + "--name", + type=str, + help=("The name of the current experiment run, consists of [data]-[prompt]"), + ) + + # BOFT args + parser.add_argument("--use_boft", action="store_true", help="Whether to use BOFT for parameter efficient tuning") + parser.add_argument("--boft_block_num", type=int, default=8, help="The number of BOFT blocks") + parser.add_argument("--boft_block_size", type=int, default=0, help="The size of BOFT blocks") + parser.add_argument("--boft_n_butterfly_factor", type=int, default=0, help="The number of butterfly factors") + parser.add_argument("--boft_dropout", type=float, default=0.1, help="BOFT dropout, only used if use_boft is True") + parser.add_argument( + "--boft_bias", + type=str, + default="none", + help="Bias type for BOFT. 
Can be 'none', 'all' or 'boft_only', only used if use_boft is True", + ) + + if input_args is not None: + args = parser.parse_args(input_args) + else: + args = parser.parse_args() + + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + if args.dataset_name is None and args.train_data_dir is None: + raise ValueError("Specify either `--dataset_name` or `--train_data_dir`") + + if args.dataset_name is not None and args.train_data_dir is not None: + raise ValueError("Specify only one of `--dataset_name` or `--train_data_dir`") + + if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1: + raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].") + + if args.validation_prompt is not None and args.validation_image is None: + raise ValueError("`--validation_image` must be set if `--validation_prompt` is set") + + if args.validation_prompt is None and args.validation_image is not None: + raise ValueError("`--validation_prompt` must be set if `--validation_image` is set") + + if ( + args.validation_image is not None + and args.validation_prompt is not None + and len(args.validation_image) != 1 + and len(args.validation_prompt) != 1 + and len(args.validation_image) != len(args.validation_prompt) + ): + raise ValueError( + "Must provide either 1 `--validation_image`, 1 `--validation_prompt`," + " or the same number of `--validation_prompt`s and `--validation_image`s" + ) + + if args.resolution % 8 != 0: + raise ValueError( + "`--resolution` must be divisible by 8 for consistently sized encoded images between the VAE and the controlnet encoder." 
+ ) + + return args diff --git a/peft/examples/boft_controlnet/utils/dataset.py b/peft/examples/boft_controlnet/utils/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..1de3c8cc3647e434d664f1553bf71921d101ca6a --- /dev/null +++ b/peft/examples/boft_controlnet/utils/dataset.py @@ -0,0 +1,207 @@ +import random + +import numpy as np +import torch +import wandb +from datasets import load_dataset +from diffusers import DDIMScheduler +from PIL import Image +from torchvision import transforms +from utils.pipeline_controlnet import LightControlNetPipeline + + +def image_grid(imgs, rows, cols): + assert len(imgs) == rows * cols + + w, h = imgs[0].size + grid = Image.new("RGB", size=(cols * w, rows * h)) + + for i, img in enumerate(imgs): + grid.paste(img, box=(i % cols * w, i // cols * h)) + return grid + + +def log_validation(val_dataset, text_encoder, unet, controlnet, args, accelerator): + pipeline = LightControlNetPipeline.from_pretrained( + args.pretrained_model_name_or_path, + controlnet=accelerator.unwrap_model(controlnet, keep_fp32_wrapper=True), + unet=accelerator.unwrap_model(unet, keep_fp32_wrapper=True).model, + text_encoder=accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True), + safety_checker=None, + revision=args.revision, + ) + + pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config) + pipeline = pipeline.to(accelerator.device) + + pipeline.set_progress_bar_config(disable=True) + + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) + + image_logs = [] + + for idx in range(args.num_validation_images): + data = val_dataset[idx] + validation_prompt = data["text"] + validation_image = data["conditioning_pixel_values"] + + image = pipeline( + validation_prompt, + [validation_image], + num_inference_steps=50, + generator=generator, + )[0][0] + + image_logs.append( + { + "validation_image": validation_image, + "image": image, + "validation_prompt": validation_prompt, + } + ) + + 
for tracker in accelerator.trackers: + formatted_images = [] + + for log in image_logs: + image = log["image"] + validation_prompt = log["validation_prompt"] + validation_image = log["validation_image"] + + formatted_images.append(wandb.Image(validation_image, caption="Controlnet conditioning")) + + image = wandb.Image(image, caption=validation_prompt) + formatted_images.append(image) + + tracker.log({"validation": formatted_images}) + + del pipeline + torch.cuda.empty_cache() + + +def make_dataset(args, tokenizer, accelerator, split="train"): + # Get the datasets: you can either provide your own training and evaluation files (see below) + # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). + + # In distributed training, the load_dataset function guarantees that only one local process can concurrently + # download the dataset. + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + dataset = load_dataset( + args.dataset_name, + args.dataset_config_name, + cache_dir=args.cache_dir, + ) + else: + if args.train_data_dir is not None: + dataset = load_dataset( + args.train_data_dir, + cache_dir=args.cache_dir, + ) + # See more about loading custom images at + # https://huggingface.co/docs/datasets/v2.0.0/en/dataset_script + + # Preprocessing the datasets. + # We need to tokenize inputs and targets. + column_names = dataset[split].column_names + + # Get the column names for input/target. + if args.image_column is None: + image_column = column_names[0] + else: + image_column = args.image_column + if image_column not in column_names: + raise ValueError( + f"`--image_column` value '{args.image_column}' not found in dataset columns. 
Dataset columns are: {', '.join(column_names)}" + ) + + if args.caption_column is None: + caption_column = column_names[1] + else: + caption_column = args.caption_column + if caption_column not in column_names: + raise ValueError( + f"`--caption_column` value '{args.caption_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}" + ) + + if args.conditioning_image_column is None: + conditioning_image_column = column_names[2] + else: + conditioning_image_column = args.conditioning_image_column + if conditioning_image_column not in column_names: + raise ValueError( + f"`--conditioning_image_column` value '{args.conditioning_image_column}' not found in dataset columns. Dataset columns are: {', '.join(column_names)}" + ) + + def tokenize_captions(examples, is_train=True): + captions = [] + for caption in examples[caption_column]: + if random.random() < args.proportion_empty_prompts: + captions.append("") + elif isinstance(caption, str): + captions.append(caption) + elif isinstance(caption, (list, np.ndarray)): + # take a random caption if there are multiple + captions.append(random.choice(caption) if is_train else caption[0]) + else: + raise ValueError( + f"Caption column `{caption_column}` should contain either strings or lists of strings." 
+ ) + inputs = tokenizer( + captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt" + ) + return inputs.input_ids + + image_transforms = transforms.Compose( + [ + transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(args.resolution), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + conditioning_image_transforms = transforms.Compose( + [ + transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(args.resolution), + transforms.ToTensor(), + ] + ) + + def preprocess_train(examples): + images = [image.convert("RGB") for image in examples[image_column]] + images = [image_transforms(image) for image in images] + + conditioning_images = [image.convert("RGB") for image in examples[conditioning_image_column]] + conditioning_images = [conditioning_image_transforms(image) for image in conditioning_images] + + examples["pixel_values"] = images + examples["conditioning_pixel_values"] = conditioning_images + examples["input_ids"] = tokenize_captions(examples) + + return examples + + with accelerator.main_process_first(): + if args.max_train_samples is not None: + dataset[split] = dataset[split].shuffle(seed=args.seed).select(range(args.max_train_samples)) + # Set the training transforms + split_dataset = dataset[split].with_transform(preprocess_train) + + return split_dataset + + +def collate_fn(examples): + pixel_values = torch.stack([example["pixel_values"] for example in examples]) + pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() + + conditioning_pixel_values = torch.stack([example["conditioning_pixel_values"] for example in examples]) + conditioning_pixel_values = conditioning_pixel_values.to(memory_format=torch.contiguous_format).float() + + input_ids = torch.stack([example["input_ids"] for example in examples]) + + return { + "pixel_values": 
pixel_values, + "conditioning_pixel_values": conditioning_pixel_values, + "input_ids": input_ids, + } diff --git a/peft/examples/boft_controlnet/utils/light_controlnet.py b/peft/examples/boft_controlnet/utils/light_controlnet.py new file mode 100644 index 0000000000000000000000000000000000000000..69919e53f7af7bffbec0cdc87362183d0850364c --- /dev/null +++ b/peft/examples/boft_controlnet/utils/light_controlnet.py @@ -0,0 +1,263 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from dataclasses import dataclass +from typing import Optional, Union + +import torch +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor +from diffusers.models.modeling_utils import ModelMixin +from diffusers.models.unets.unet_2d_blocks import ( + CrossAttnDownBlock2D, + DownBlock2D, +) +from diffusers.utils import BaseOutput, logging +from torch import nn +from torch.nn import functional as F + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class ControlNetOutput(BaseOutput): + down_block_res_samples: tuple[torch.Tensor] + mid_block_res_sample: torch.Tensor + + +class ControlNetConditioningEmbedding(nn.Module): + """ + Quoting from https://huggingface.co/papers/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN + [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized + training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the + convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides + (activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full + model) to encode image-space conditions ... into feature maps ..." 
+ """ + + def __init__( + self, + conditioning_embedding_channels: int, + conditioning_channels: int = 3, + block_out_channels: tuple[int] = (16, 32, 96, 256), + ): + super().__init__() + + self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) + + self.blocks = nn.ModuleList([]) + + for i in range(len(block_out_channels) - 1): + channel_in = block_out_channels[i] + channel_out = block_out_channels[i + 1] + self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1)) + self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2)) + + self.conv_out = zero_module( + nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1) + ) + + def forward(self, conditioning): + embedding = self.conv_in(conditioning) + embedding = F.silu(embedding) + + for block in self.blocks: + embedding = block(embedding) + embedding = F.silu(embedding) + + embedding = self.conv_out(embedding) + + return embedding + + +class ControlNetModel(ModelMixin, ConfigMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + in_channels: int = 4, + out_channels: int = 320, + controlnet_conditioning_channel_order: str = "rgb", + conditioning_embedding_out_channels: Optional[tuple[int]] = (16, 32, 96, 256), + ): + super().__init__() + + # for control image + self.controlnet_cond_embedding = ControlNetConditioningEmbedding( + conditioning_embedding_channels=out_channels, + block_out_channels=conditioning_embedding_out_channels, + ) + + @property + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. 
+ """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: dict[str, AttentionProcessor]): + if hasattr(module, "set_processor"): + processors[f"{name}.processor"] = module.processor + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, dict[str, AttentionProcessor]]): + r""" + Parameters: + `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + of **all** `Attention` layers. + In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.: + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+ ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + self.set_attn_processor(AttnProcessor()) + + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice + def set_attention_slice(self, slice_size): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module will split the input tensor in slices, to compute attention + in several steps. This is useful to save some memory in exchange for a small speed decrease. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If + `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. 
+ """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_sliceable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_sliceable_dims(module) + + num_sliceable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_sliceable_layers * [1] + + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. 
+ # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: list[int]): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): + module.gradient_checkpointing = value + + def forward( + self, + controlnet_cond: torch.FloatTensor, + ) -> Union[ControlNetOutput, tuple]: + # check channel order + channel_order = self.config.controlnet_conditioning_channel_order + + if channel_order == "rgb": + # in rgb order by default + ... + elif channel_order == "bgr": + controlnet_cond = torch.flip(controlnet_cond, dims=[1]) + else: + raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}") + + # 2. pre-process + + controlnet_cond = self.controlnet_cond_embedding(controlnet_cond) + + return controlnet_cond + + +def zero_module(module): + for p in module.parameters(): + nn.init.zeros_(p) + return module diff --git a/peft/examples/boft_controlnet/utils/pipeline_controlnet.py b/peft/examples/boft_controlnet/utils/pipeline_controlnet.py new file mode 100644 index 0000000000000000000000000000000000000000..d4f5f35ed87d31a36a5e149e7bacf2cdcafecddd --- /dev/null +++ b/peft/examples/boft_controlnet/utils/pipeline_controlnet.py @@ -0,0 +1,455 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Any, Callable, Optional, Union + +import numpy as np +import PIL.Image +import torch +from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel +from diffusers.pipelines.controlnet.pipeline_controlnet import StableDiffusionControlNetPipeline +from diffusers.utils import BaseOutput, logging +from torch.nn import functional as F +from utils.light_controlnet import ControlNetModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class LightControlNetPipelineOutput(BaseOutput): + """ + Output class for Stable Diffusion pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + nsfw_content_detected (`List[bool]`) + List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, or `None` if safety checking could not be performed. 
class LightControlNetPipeline(StableDiffusionControlNetPipeline):
    """Stable Diffusion pipeline that feeds a ControlNet hint straight into the UNet.

    In the denoising loop `self.controlnet` is called with the conditioning image only, and its output is passed
    to `self.unet` as the `guided_hint` keyword argument.
    """

    _optional_components = ["safety_checker", "feature_extractor"]

    def check_inputs(
        self,
        prompt,
        image,
        callback_steps,
        negative_prompt=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        controlnet_conditioning_scale=1.0,
    ):
        """Validate the arguments of `__call__` before any expensive work is done.

        Args:
            prompt: `str`, `list` or `None`; mutually exclusive with `prompt_embeds`.
            image: conditioning image, or a flat list with one image per ControlNet for a `MultiControlNetModel`.
            callback_steps: must be a positive `int`.
            negative_prompt: mutually exclusive with `negative_prompt_embeds`.
            prompt_embeds: pre-computed embeddings; must match the shape of `negative_prompt_embeds` if both are
                given.
            negative_prompt_embeds: pre-computed negative embeddings.
            controlnet_conditioning_scale: a `float` for a single ControlNet; a `float` or a flat list with one
                entry per ControlNet for a `MultiControlNetModel`.

        Raises:
            ValueError: on inconsistent prompt/embedding arguments, an invalid `callback_steps`, nested
                conditioning lists, or lists whose length differs from the number of ControlNets.
            TypeError: when `image` or `controlnet_conditioning_scale` has the wrong type.
            AssertionError: when `self.controlnet` is neither a `ControlNetModel` nor a `MultiControlNetModel`.
        """
        # `None` is not an `int`, so this single test also rejects a missing value.
        if not isinstance(callback_steps, int) or callback_steps <= 0:
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        elif prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        elif prompt is not None and not isinstance(prompt, (str, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if negative_prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        if prompt_embeds is not None and negative_prompt_embeds is not None:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )

        # `prompt` needs more sophisticated handling when there are multiple
        # conditionings.
        if isinstance(self.controlnet, MultiControlNetModel):
            if isinstance(prompt, list):
                logger.warning(
                    f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}"
                    " prompts. The conditionings will be fixed across the prompts."
                )

        # A compiled controlnet is wrapped in an `OptimizedModule`; the real model is then found on `_orig_mod`.
        is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance(
            self.controlnet, torch._dynamo.eval_frame.OptimizedModule
        )
        is_single = isinstance(self.controlnet, ControlNetModel) or (
            is_compiled and isinstance(self.controlnet._orig_mod, ControlNetModel)
        )
        is_multi = isinstance(self.controlnet, MultiControlNetModel) or (
            is_compiled and isinstance(self.controlnet._orig_mod, MultiControlNetModel)
        )

        # Check `image`
        if is_single:
            self.check_image(image, prompt, prompt_embeds)
        elif is_multi:
            if not isinstance(image, list):
                raise TypeError("For multiple controlnets: `image` must be type `list`")
            # When `image` is a nested list:
            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
            if any(isinstance(i, list) for i in image):
                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
            if len(image) != len(self.controlnet.nets):
                raise ValueError(
                    "For multiple controlnets: `image` must have the same length as the number of controlnets."
                )

            for image_ in image:
                self.check_image(image_, prompt, prompt_embeds)
        else:
            # Raised explicitly (instead of `assert False`) so the check survives `python -O`.
            raise AssertionError(f"Unsupported controlnet type: {type(self.controlnet)}")

        # Check `controlnet_conditioning_scale`
        if is_single:
            if not isinstance(controlnet_conditioning_scale, float):
                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
        elif isinstance(controlnet_conditioning_scale, list):
            if any(isinstance(i, list) for i in controlnet_conditioning_scale):
                raise ValueError("A single batch of multiple conditionings are supported at the moment.")
            # The length check used to sit in an unreachable `elif`; it must run for every flat list.
            if len(controlnet_conditioning_scale) != len(self.controlnet.nets):
                raise ValueError(
                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
                    " the same length as the number of controlnets"
                )

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, list[str]] = None,
        image: Union[
            torch.FloatTensor,
            PIL.Image.Image,
            np.ndarray,
            list[torch.FloatTensor],
            list[PIL.Image.Image],
            list[np.ndarray],
        ] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, list[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, list[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[dict[str, Any]] = None,
        controlnet_conditioning_scale: Union[float, list[float]] = 1.0,
        guess_mode: bool = False,
    ):
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
                instead.
            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`,
                    `List[PIL.Image.Image]` or `List[np.ndarray]`):
                The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If
                the type is specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can
                also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If
                height and/or width are passed, `image` is resized according to them. If multiple ControlNets are
                specified in init, images must be passed as a list such that each element of the list can be correctly
                batched for input to a single controlnet.
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://huggingface.co/papers/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                not greater than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. With `"latent"` the
                VAE decoding and the safety checker are skipped and the latents are post-processed directly.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a `LightControlNetPipelineOutput` instead of a plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
            cross_attention_kwargs (`dict`, *optional*):
                Only the `"scale"` entry is read here; it is forwarded to `_encode_prompt` as `lora_scale`. The
                dictionary is not passed on to the UNet by this pipeline.
            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
                Validated by `check_inputs` (and broadcast to one value per ControlNet for a `MultiControlNetModel`).
                NOTE: the denoising loop of this pipeline does not apply the value to the ControlNet output.
            guess_mode (`bool`, *optional*, defaults to `False`):
                Forwarded to `prepare_image`. In this mode, the ControlNet encoder will try best to recognize the
                content of the input image even if you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is
                recommended.

        Returns:
            `LightControlNetPipelineOutput` or `tuple`:
            `LightControlNetPipelineOutput` if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the
            first element is a list with the generated images, and the second element is a list of `bool`s denoting
            whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according
            to the `safety_checker` (`None` when `output_type="latent"`).
        """

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
            image,
            callback_steps,
            negative_prompt,
            prompt_embeds,
            negative_prompt_embeds,
            controlnet_conditioning_scale,
        )

        # 2. Define call parameters
        if isinstance(prompt, str):
            batch_size = 1
        elif isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            # `check_inputs` guarantees that `prompt_embeds` is set when `prompt` is None.
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # Unwrap a `torch.compile`d controlnet so the isinstance checks below see the real model.
        controlnet = self.controlnet._orig_mod if hasattr(self.controlnet, "_orig_mod") else self.controlnet

        if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
            controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets)

        # 3. Encode input prompt
        text_encoder_lora_scale = (
            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
        )
        prompt_embeds = self._encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
        )

        # 4. Prepare image
        if isinstance(controlnet, ControlNetModel):
            image = self.prepare_image(
                image=image,
                width=width,
                height=height,
                batch_size=batch_size * num_images_per_prompt,
                num_images_per_prompt=num_images_per_prompt,
                device=device,
                dtype=controlnet.dtype,
                do_classifier_free_guidance=do_classifier_free_guidance,
                guess_mode=guess_mode,
            )
            height, width = image.shape[-2:]
        elif isinstance(controlnet, MultiControlNetModel):
            images = []

            for image_ in image:
                image_ = self.prepare_image(
                    image=image_,
                    width=width,
                    height=height,
                    batch_size=batch_size * num_images_per_prompt,
                    num_images_per_prompt=num_images_per_prompt,
                    device=device,
                    dtype=controlnet.dtype,
                    do_classifier_free_guidance=do_classifier_free_guidance,
                    guess_mode=guess_mode,
                )

                images.append(image_)

            image = images
            height, width = image[0].shape[-2:]
        else:
            # Raised explicitly (instead of `assert False`) so the check survives `python -O`.
            raise AssertionError(f"Unsupported controlnet type: {type(controlnet)}")

        # 5. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 6. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 8. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # Get the guided hint for the UNet (320 dim). The controlnet only receives the conditioning
                # image, so no separate latent input has to be prepared for it.
                guided_hint = self.controlnet(
                    controlnet_cond=image,
                )

                # Predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    guided_hint=guided_hint,
                    encoder_hidden_states=prompt_embeds,
                )[0]

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, latents)

        # If we do sequential model offloading, let's offload unet and controlnet
        # manually for max memory savings
        if getattr(self, "final_offload_hook", None) is not None:
            self.unet.to("cpu")
            self.controlnet.to("cpu")
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            # `torch.xpu` is missing on some torch builds, so probe for it before calling into it.
            elif hasattr(torch, "xpu") and torch.xpu.is_available():
                torch.xpu.empty_cache()

        if output_type != "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
        else:
            image = latents
            has_nsfw_concept = None

        # Images flagged by the safety checker are not denormalized.
        if has_nsfw_concept is None:
            do_denormalize = [True] * image.shape[0]
        else:
            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

        # Offload last model to CPU
        if getattr(self, "final_offload_hook", None) is not None:
            self.final_offload_hook.offload()

        if not return_dict:
            return (image, has_nsfw_concept)

        return LightControlNetPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
# Converting Bytes to Megabytes
def b2mb(x):
    """Convert a byte count to whole mebibytes (2**20 bytes), truncating toward zero."""
    return int(x / 2**20)


# This context manager is used to track the peak memory usage of the process
class TorchTracemalloc:
    """Context manager that measures accelerator and CPU memory used by the enclosed code.

    After the `with` block exits, the following attributes are available (all in MB, relative to the values
    sampled on entry): `used` and `peaked` for the accelerator, `cpu_used` and `cpu_peaked` for the resident set
    size of the current process. The CPU peak is sampled by a background thread that polls without sleeping.
    """

    def __enter__(self):
        # `torch.accelerator.current_accelerator()` returns None when no accelerator is available; fall back to
        # "cuda" in that case, exactly as is done for torch builds without `torch.accelerator`.
        accelerator = torch.accelerator.current_accelerator() if hasattr(torch, "accelerator") else None
        self.device_type = accelerator.type if accelerator is not None else "cuda"
        self.device_module = getattr(torch, self.device_type, torch.cuda)
        gc.collect()
        self.device_module.empty_cache()
        self.device_module.reset_peak_memory_stats()  # reset the peak gauge to zero
        self.begin = self.device_module.memory_allocated()
        self.process = psutil.Process()

        self.cpu_begin = self.cpu_mem_used()
        # Initialise before the thread starts so `cpu_peak` always exists, even if `__exit__` runs immediately.
        self.cpu_peak = -1
        self.peak_monitoring = True
        self._peak_monitor_thread = threading.Thread(target=self.peak_monitor_func, daemon=True)
        self._peak_monitor_thread.start()
        return self

    def cpu_mem_used(self):
        """get resident set size memory for the current process"""
        return self.process.memory_info().rss

    def peak_monitor_func(self):
        """Poll the process RSS and keep the maximum in `cpu_peak` until `peak_monitoring` is cleared."""
        self.cpu_peak = -1

        while True:
            self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak)

            # can't sleep or will not catch the peak right (this comment is here on purpose)
            # time.sleep(0.001)  # 1msec

            if not self.peak_monitoring:
                break

    def __exit__(self, *exc):
        self.peak_monitoring = False
        # Wait for the monitor to take its final sample and stop, so `cpu_peak` is complete and no thread leaks.
        monitor_thread = getattr(self, "_peak_monitor_thread", None)
        if monitor_thread is not None:
            monitor_thread.join()

        gc.collect()
        self.device_module.empty_cache()
        self.end = self.device_module.memory_allocated()
        self.peak = self.device_module.max_memory_allocated()
        self.used = b2mb(self.end - self.begin)
        self.peaked = b2mb(self.peak - self.begin)

        self.cpu_end = self.cpu_mem_used()
        self.cpu_used = b2mb(self.cpu_end - self.cpu_begin)
        self.cpu_peaked = b2mb(self.cpu_peak - self.cpu_begin)
@dataclass
class UNet2DConditionOutput(BaseOutput):
    """
    Args:
        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model.
    """

    sample: torch.FloatTensor


class UNet2DConditionNewModel(UNet2DConditionModel):
    """`UNet2DConditionModel` whose forward pass accepts an extra `guided_hint` tensor.

    The hint, when given, is added to the output of `conv_in` before the down blocks run.
    """

    def forward(
        self,
        sample: torch.FloatTensor,
        timestep: Union[torch.Tensor, float, int],
        encoder_hidden_states: torch.Tensor,
        guided_hint: Optional[torch.Tensor] = None,
        class_labels: Optional[torch.Tensor] = None,
        timestep_cond: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        cross_attention_kwargs: Optional[dict[str, Any]] = None,
        added_cond_kwargs: Optional[dict[str, torch.Tensor]] = None,
        down_block_additional_residuals: Optional[tuple[torch.Tensor]] = None,
        mid_block_additional_residual: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ) -> Union[UNet2DConditionOutput, tuple]:
        r"""
        Args:
            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
            timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
            encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
            guided_hint (`torch.Tensor`, *optional*):
                Conditioning tensor that is added to the output of `conv_in`. It must therefore be broadcastable to
                that output. Skipped when `None`.
            class_labels (`torch.Tensor`, *optional*):
                Class labels; required when the model has a class embedding.
            timestep_cond (`torch.Tensor`, *optional*):
                Extra conditioning passed to `time_embedding` together with the projected timesteps.
            attention_mask (`torch.Tensor`, *optional*):
                (batch, key_tokens) mask, 1 = keep, 0 = discard. Converted into an additive bias.
            encoder_attention_mask (`torch.Tensor`):
                (batch, sequence_length) cross-attention mask, applied to encoder_hidden_states. True = keep, False =
                discard. Mask will be converted into a bias, which adds large negative values to attention scores
                corresponding to "discard" tokens.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
            added_cond_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified includes additional conditions that can be used for additional
                time embeddings or encoder hidden states projections. See the configurations `encoder_hid_dim_type`
                and `addition_embed_type` for more information. `None` is treated as an empty dictionary.
            down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
                Residuals added element-wise to the down-block residual samples.
            mid_block_additional_residual (`torch.Tensor`, *optional*):
                Residual added to the output of the mid block.

        Returns:
            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
            returning a tuple, the first element is the sample tensor.

        Raises:
            ValueError: if `class_labels` is missing although the model has a class embedding, or if
                `image_embeds` is missing from `added_cond_kwargs` although the configuration requires it.
        """
        # The parameter defaults to None, but the membership tests below need a mapping; normalising here turns
        # an opaque `TypeError` into the intended, descriptive `ValueError`.
        if added_cond_kwargs is None:
            added_cond_kwargs = {}

        # By default samples have to be AT least a multiple of the overall upsampling factor.
        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
        # However, the upsampling interpolation output size can be forced to fit any upsampling size
        # on the fly if necessary.
        default_overall_up_factor = 2**self.num_upsamplers

        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
        forward_upsample_size = False
        upsample_size = None

        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
            logger.info("Forward upsample size to force interpolation output size.")
            forward_upsample_size = True

        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
        # expects mask of shape:
        #   [batch, key_tokens]
        # adds singleton query_tokens dimension:
        #   [batch,                    1, key_tokens]
        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
        if attention_mask is not None:
            # assume that mask is expressed as:
            #   (1 = keep,      0 = discard)
            # convert mask into a bias that can be added to attention scores:
            #       (keep = +0,     discard = -10000.0)
            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
            attention_mask = attention_mask.unsqueeze(1)

        # convert encoder_attention_mask to a bias the same way we do for attention_mask
        if encoder_attention_mask is not None:
            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)

        # 0. center input if necessary
        if self.config.center_input_sample:
            sample = 2 * sample - 1.0

        # 1. time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == "mps"
            if isinstance(timestep, float):
                dtype = torch.float32 if is_mps else torch.float64
            else:
                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        timesteps = timesteps.expand(sample.shape[0])

        t_emb = self.time_proj(timesteps)

        # `Timesteps` does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=sample.dtype)

        emb = self.time_embedding(t_emb, timestep_cond)

        if self.class_embedding is not None:
            if class_labels is None:
                raise ValueError("class_labels should be provided when num_class_embeds > 0")

            if self.config.class_embed_type == "timestep":
                class_labels = self.time_proj(class_labels)

                # `Timesteps` does not contain any weights and will always return f32 tensors
                # there might be better ways to encapsulate this.
                class_labels = class_labels.to(dtype=sample.dtype)

            class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)

            if self.config.class_embeddings_concat:
                emb = torch.cat([emb, class_emb], dim=-1)
            else:
                emb = emb + class_emb

        if self.config.addition_embed_type == "text":
            aug_emb = self.add_embedding(encoder_hidden_states)
            emb = emb + aug_emb
        elif self.config.addition_embed_type == "text_image":
            # Kandinsky 2.1 - style
            if "image_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
                )

            image_embs = added_cond_kwargs.get("image_embeds")
            text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)

            aug_emb = self.add_embedding(text_embs, image_embs)
            emb = emb + aug_emb

        if self.time_embed_act is not None:
            emb = self.time_embed_act(emb)

        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
            # Kandinsky 2.1 - style
            if "image_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
                )

            image_embeds = added_cond_kwargs.get("image_embeds")
            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)

        # 2. pre-process and insert conditioning (ControlNet)
        # Note: the added "guided_hint" is the only difference between this implementation and the original UNet2DConditionModel
        sample = self.conv_in(sample)
        sample = guided_hint + sample if guided_hint is not None else sample

        # 3. down
        down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=attention_mask,
                    cross_attention_kwargs=cross_attention_kwargs,
                    encoder_attention_mask=encoder_attention_mask,
                )
            else:
                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)

            down_block_res_samples += res_samples

        if down_block_additional_residuals is not None:
            new_down_block_res_samples = ()

            for down_block_res_sample, down_block_additional_residual in zip(
                down_block_res_samples, down_block_additional_residuals
            ):
                down_block_res_sample = down_block_res_sample + down_block_additional_residual
                new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)

            down_block_res_samples = new_down_block_res_samples

        # 4. mid
        if self.mid_block is not None:
            sample = self.mid_block(
                sample,
                emb,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                cross_attention_kwargs=cross_attention_kwargs,
                encoder_attention_mask=encoder_attention_mask,
            )

        if mid_block_additional_residual is not None:
            sample = sample + mid_block_additional_residual

        # 5. up
        for i, upsample_block in enumerate(self.up_blocks):
            is_final_block = i == len(self.up_blocks) - 1

            # Each up block consumes as many residuals (from the end of the stack) as it has resnets.
            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

            # if we have not reached the final block and need to forward the
            # upsample size, we do it here
            if not is_final_block and forward_upsample_size:
                upsample_size = down_block_res_samples[-1].shape[2:]

            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    encoder_hidden_states=encoder_hidden_states,
                    cross_attention_kwargs=cross_attention_kwargs,
                    upsample_size=upsample_size,
                    attention_mask=attention_mask,
                    encoder_attention_mask=encoder_attention_mask,
                )
            else:
                sample = upsample_block(
                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
                )

        # 6. post-process
        if self.conv_norm_out:
            sample = self.conv_norm_out(sample)
            sample = self.conv_act(sample)
        sample = self.conv_out(sample)

        if not return_dict:
            return (sample,)

        return UNet2DConditionOutput(sample=sample)
+ +As a member of the **orthogonal finetuning** class, BOFT presents a systematic and principled method for fine-tuning. It possesses several unique properties and has demonstrated superior performance compared to LoRA in a variety of scenarios. For further details on BOFT, please consult the [OFT concept guide in the PEFT documentation](https://huggingface.co/docs/peft/index), the [original BOFT paper](https://huggingface.co/papers/2311.06243) and the [original OFT paper](https://huggingface.co/papers/2306.07280). + +In this guide we provide a Dreambooth fine-tuning script that is available in [PEFT's GitHub repo examples](https://github.com/huggingface/peft/tree/main/examples/boft_dreambooth). This implementation is adapted from [peft's lora_dreambooth](https://github.com/huggingface/peft/tree/main/examples/lora_dreambooth). You can try it out and finetune on your custom images. + +## Set up your environment + +Start by cloning the PEFT repository: + +```bash +git clone --recursive https://github.com/huggingface/peft +``` + +Navigate to the directory containing the training scripts for fine-tuning Dreambooth with BOFT: + +```bash +cd peft/examples/boft_dreambooth +``` + +Set up your environment: install PEFT and all the required libraries. At the time of writing this guide we recommend installing PEFT from source.
The following environment setup should work on A100 and H100: + +### CUDA +```bash +conda create --name peft python=3.10 +conda activate peft +conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=11.8 -c pytorch -c nvidia +conda install xformers -c xformers +pip install -r requirements.txt +pip install git+https://github.com/huggingface/peft +``` +The following environment setup has been validated to work on Intel XPU: + +### Intel XPU +```bash +conda create --name peft python=3.10 +conda activate peft +pip install torch==2.8.0.dev20250615+xpu torchvision==0.23.0.dev20250615+xpu torchaudio==2.8.0.dev20250615+xpu --index-url https://download.pytorch.org/whl/nightly/xpu --no-cache-dir +pip install -r requirements.txt +pip install git+https://github.com/huggingface/peft +``` + +## Download the data + +The [dreambooth](https://github.com/google/dreambooth) dataset is cloned automatically into the following structure when you run the training script. + +``` +boft_dreambooth +├── data +│ ├── data_dir +│ └── dreambooth +│ └── data +│ ├── backpack +│ └── backpack_dog +│ ... +``` + +You can also put your custom images into `boft_dreambooth/data/dreambooth`. + +## Finetune Dreambooth with BOFT + +```bash +./train_dreambooth.sh +``` + +or using the following script arguments: + +```bash +export MODEL_NAME="runwayml/stable-diffusion-v1-5" +export INSTANCE_DIR="path-to-instance-images" +export CLASS_DIR="path-to-class-images" +export OUTPUT_DIR="path-to-save-model" +``` + +Here: + +- `INSTANCE_DIR`: The directory containing the images that you intend to use for training your model. +- `CLASS_DIR`: The directory containing class-specific images. In this example, we use prior preservation to avoid overfitting and language-drift. For prior preservation, you need other images of the same class as part of the training process. However, these images can be generated and the training script will save them to a local path you specify here.
+- `OUTPUT_DIR`: The destination folder for storing the trained model's weights. + +To learn more about DreamBooth fine-tuning with prior-preserving loss, check out the [Diffusers documentation](https://huggingface.co/docs/diffusers/training/dreambooth#finetuning-with-priorpreserving-loss). + +Launch the training script with `accelerate` and pass it the hyperparameters, as well as BOFT-specific arguments such as: + +- `use_boft`: Enables BOFT in the training script. +- `boft_block_size`: the BOFT matrix block size across different layers, expressed as an `int`. Smaller block size results in sparser update matrices with fewer trainable parameters. **Note**, please choose a value that evenly divides the `in_features` dimension of most layers, e.g., 4, 8, 16. Also, you can only specify either `boft_block_size` or `boft_block_num`, but not both simultaneously, because `boft_block_size` x `boft_block_num` = layer dimension. +- `boft_block_num`: the number of BOFT matrix blocks across different layers, expressed as an `int`. Fewer blocks result in sparser update matrices with fewer trainable parameters. **Note**, please choose a value that evenly divides the `in_features` dimension of most layers, e.g., 4, 8, 16. Also, you can only specify either `boft_block_size` or `boft_block_num`, but not both simultaneously, because `boft_block_size` x `boft_block_num` = layer dimension. +- `boft_n_butterfly_factor`: the number of butterfly factors. **Note**, for `boft_n_butterfly_factor=1`, BOFT is the same as vanilla OFT; for `boft_n_butterfly_factor=2`, the effective block size of OFT becomes twice as big and the number of blocks becomes half. +- `boft_bias`: specify if the `bias` parameters should be trained. Can be `none`, `all` or `boft_only`. +- `boft_dropout`: specify the probability of multiplicative dropout. 
+ +Here's what the full set of script arguments may look like: + +```bash +PEFT_TYPE="boft" +BLOCK_NUM=8 +BLOCK_SIZE=0 +N_BUTTERFLY_FACTOR=1 + +VALIDATION_PROMPT=${PROMPT_LIST[@]} +INSTANCE_PROMPT="a photo of ${UNIQUE_TOKEN} ${CLASS_TOKEN}" +CLASS_PROMPT="a photo of ${CLASS_TOKEN}" + +export MODEL_NAME="stabilityai/stable-diffusion-2-1" +# export MODEL_NAME="runwayml/stable-diffusion-v1-5" +export PROJECT_NAME="dreambooth_${PEFT_TYPE}" +export RUN_NAME="${SELECTED_SUBJECT}_${PEFT_TYPE}_${BLOCK_NUM}${BLOCK_SIZE}${N_BUTTERFLY_FACTOR}" +export INSTANCE_DIR="./data/dreambooth/dataset/${SELECTED_SUBJECT}" +export CLASS_DIR="./data/class_data/${CLASS_TOKEN}" +export OUTPUT_DIR="./data/output/${PEFT_TYPE}" + + +accelerate launch train_dreambooth.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --class_data_dir="$CLASS_DIR" \ + --output_dir=$OUTPUT_DIR \ + --wandb_project_name=$PROJECT_NAME \ + --wandb_run_name=$RUN_NAME \ + --with_prior_preservation --prior_loss_weight=1.0 \ + --instance_prompt="$INSTANCE_PROMPT" \ + --validation_prompt="$VALIDATION_PROMPT" \ + --class_prompt="$CLASS_PROMPT" \ + --resolution=512 \ + --train_batch_size=1 \ + --num_dataloader_workers=2 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --num_class_images=200 \ + --use_boft \ + --boft_block_num=$BLOCK_NUM \ + --boft_block_size=$BLOCK_SIZE \ + --boft_n_butterfly_factor=$N_BUTTERFLY_FACTOR \ + --boft_dropout=0.1 \ + --boft_bias="boft_only" \ + --learning_rate=3e-5 \ + --max_train_steps=1010 \ + --checkpointing_steps=200 \ + --validation_steps=200 \ + --enable_xformers_memory_efficient_attention \ + --report_to="wandb" +``` + +or use this training script: + +```bash +./train_dreambooth.sh $idx +``` + +where `$idx` is an index that selects which subject to train on. + +If you are running this script on Windows, you may need to set the `--num_dataloader_workers` to 0. 
+ +## Inference with a single adapter + +To run inference with the fine-tuned model, simply run the jupyter notebook `dreambooth_inference.ipynb` for visualization with `jupyter notebook` under `./examples/boft_dreambooth`. diff --git a/peft/examples/boft_dreambooth/dreambooth_inference.ipynb b/peft/examples/boft_dreambooth/dreambooth_inference.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..3fa2bbbf3c933eea891c710e79c29a9ffdfffde1 --- /dev/null +++ b/peft/examples/boft_dreambooth/dreambooth_inference.ipynb @@ -0,0 +1,180 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "acab479f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import torch\n", + "from accelerate.logging import get_logger\n", + "from diffusers import StableDiffusionPipeline\n", + "from diffusers.utils import check_min_version\n", + "\n", + "from peft import PeftModel\n", + "\n", + "# Will error if the minimal version of diffusers is not installed. 
Remove at your own risks.\n", + "check_min_version(\"0.10.0.dev0\")\n", + "\n", + "logger = get_logger(__name__)\n", + "\n", + "MODEL_NAME = \"stabilityai/stable-diffusion-2-1\"\n", + "# MODEL_NAME=\"runwayml/stable-diffusion-v1-5\"\n", + "\n", + "PEFT_TYPE=\"boft\"\n", + "BLOCK_NUM=8\n", + "BLOCK_SIZE=0\n", + "N_BUTTERFLY_FACTOR=1\n", + "SELECTED_SUBJECT=\"backpack\"\n", + "EPOCH_IDX = 200\n", + "\n", + "PROJECT_NAME=f\"dreambooth_{PEFT_TYPE}\"\n", + "RUN_NAME=f\"{SELECTED_SUBJECT}_{PEFT_TYPE}_{BLOCK_NUM}{BLOCK_SIZE}{N_BUTTERFLY_FACTOR}\"\n", + "OUTPUT_DIR=f\"./data/output/{PEFT_TYPE}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06cfd506", + "metadata": {}, + "outputs": [], + "source": [ + "def get_boft_sd_pipeline(\n", + " ckpt_dir, base_model_name_or_path=None, epoch=int, dtype=torch.float32, device=\"auto\", adapter_name=\"default\"\n", + "):\n", + " if device == \"auto\":\n", + " device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "\n", + " if base_model_name_or_path is None:\n", + " raise ValueError(\"Please specify the base model name or path\")\n", + "\n", + " pipe = StableDiffusionPipeline.from_pretrained(\n", + " base_model_name_or_path, torch_dtype=dtype, requires_safety_checker=False\n", + " ).to(device)\n", + " \n", + " load_adapter(pipe, ckpt_dir, epoch, adapter_name)\n", + "\n", + " if dtype in (torch.float16, torch.bfloat16):\n", + " pipe.unet.half()\n", + " pipe.text_encoder.half()\n", + "\n", + " pipe.to(device)\n", + " return pipe\n", + "\n", + "\n", + "def load_adapter(pipe, ckpt_dir, epoch, adapter_name=\"default\"):\n", + " \n", + " unet_sub_dir = os.path.join(ckpt_dir, f\"unet/{epoch}\", adapter_name)\n", + " text_encoder_sub_dir = os.path.join(ckpt_dir, f\"text_encoder/{epoch}\", adapter_name)\n", + " \n", + " if isinstance(pipe.unet, PeftModel):\n", + " pipe.unet.load_adapter(unet_sub_dir, adapter_name=adapter_name)\n", + " else:\n", + " pipe.unet = 
PeftModel.from_pretrained(pipe.unet, unet_sub_dir, adapter_name=adapter_name)\n", + " \n", + " if os.path.exists(text_encoder_sub_dir):\n", + " if isinstance(pipe.text_encoder, PeftModel):\n", + " pipe.text_encoder.load_adapter(text_encoder_sub_dir, adapter_name=adapter_name)\n", + " else:\n", + " pipe.text_encoder = PeftModel.from_pretrained(pipe.text_encoder, text_encoder_sub_dir, adapter_name=adapter_name)\n", + " \n", + "\n", + "def set_adapter(pipe, adapter_name):\n", + " pipe.unet.set_adapter(adapter_name)\n", + " if isinstance(pipe.text_encoder, PeftModel):\n", + " pipe.text_encoder.set_adapter(adapter_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98a0d8ac", + "metadata": {}, + "outputs": [], + "source": [ + "prompt = \"a photo of sks backpack on a wooden floor\"\n", + "negative_prompt = \"low quality, blurry, unfinished\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4e888d2", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "pipe = get_boft_sd_pipeline(OUTPUT_DIR, MODEL_NAME, EPOCH_IDX, adapter_name=RUN_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1c1a1c0", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "image = pipe(prompt, num_inference_steps=50, guidance_scale=7, negative_prompt=negative_prompt).images[0]\n", + "image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a1aafdf-8cf7-4e47-9471-26478034245e", + "metadata": {}, + "outputs": [], + "source": [ + "# load and reset another adapter\n", + "# WARNING: requires training DreamBooth with `boft_bias=None`\n", + "\n", + "SELECTED_SUBJECT=\"dog\"\n", + "EPOCH_IDX = 200\n", + "RUN_NAME=f\"{SELECTED_SUBJECT}_{PEFT_TYPE}_{BLOCK_NUM}{BLOCK_SIZE}{N_BUTTERFLY_FACTOR}\"\n", + "\n", + "load_adapter(pipe, OUTPUT_DIR, epoch=EPOCH_IDX, adapter_name=RUN_NAME)\n", + "set_adapter(pipe, adapter_name=RUN_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "c7091ad0-2005-4528-afc1-4f9d70a9a535", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "prompt = \"a photo of sks dog running on the beach\"\n", + "negative_prompt = \"low quality, blurry, unfinished\"\n", + "image = pipe(prompt, num_inference_steps=50, guidance_scale=7, negative_prompt=negative_prompt).images[0]\n", + "image" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:peft] *", + "language": "python", + "name": "conda-env-peft-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/boft_dreambooth/requirements.txt b/peft/examples/boft_dreambooth/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..67eac706b928821dd8ba69cdf71d9bf3bfd4a49d --- /dev/null +++ b/peft/examples/boft_dreambooth/requirements.txt @@ -0,0 +1,13 @@ +transformers==4.54.0 +accelerate==1.9.0 +evaluate +tqdm +datasets==4.0.0 +diffusers==0.34.0 +Pillow +huggingface_hub +safetensors +nb_conda_kernels +ipykernel +ipywidgets +wandb==0.21.0 diff --git a/peft/examples/boft_dreambooth/train_dreambooth.py b/peft/examples/boft_dreambooth/train_dreambooth.py new file mode 100644 index 0000000000000000000000000000000000000000..0416328ff114ce42db1ab7eac2be48fbc3709224 --- /dev/null +++ b/peft/examples/boft_dreambooth/train_dreambooth.py @@ -0,0 +1,625 @@ +#!/usr/bin/env python +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The implementation is based on "Parameter-Efficient Orthogonal Finetuning +# via Butterfly Factorization" (https://huggingface.co/papers/2311.06243) in ICLR 2024. + +import hashlib +import itertools +import logging +import math +import os +from contextlib import nullcontext +from pathlib import Path + +import datasets +import diffusers +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import ProjectConfiguration, set_seed +from diffusers import ( + AutoencoderKL, + DDIMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + UNet2DConditionModel, +) +from diffusers.optimization import get_scheduler +from diffusers.utils import check_min_version +from diffusers.utils.import_utils import is_xformers_available +from huggingface_hub import Repository +from tqdm.auto import tqdm +from transformers import AutoTokenizer +from utils.args_loader import ( + get_full_repo_name, + import_model_class_from_model_name_or_path, + parse_args, +) +from utils.dataset import DreamBoothDataset, PromptDataset, collate_fn +from utils.tracemalloc import TorchTracemalloc, b2mb + +from peft import BOFTConfig, get_peft_model + + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
+check_min_version("0.16.0.dev0") + +logger = get_logger(__name__) + +UNET_TARGET_MODULES = ["to_q", "to_v", "to_k", "query", "value", "key", "to_out.0", "add_k_proj", "add_v_proj"] +TEXT_ENCODER_TARGET_MODULES = ["q_proj", "v_proj"] + + +def save_adaptor(accelerator, step, unet, text_encoder, args): + unwarpped_unet = accelerator.unwrap_model(unet) + unwarpped_unet.save_pretrained( + os.path.join(args.output_dir, f"unet/{step}"), state_dict=accelerator.get_state_dict(unet) + ) + if args.train_text_encoder: + unwarpped_text_encoder = accelerator.unwrap_model(text_encoder) + unwarpped_text_encoder.save_pretrained( + os.path.join(args.output_dir, f"text_encoder/{step}"), + state_dict=accelerator.get_state_dict(text_encoder), + ) + + +def main(args): + validation_prompts = list(filter(None, args.validation_prompt[0].split("."))) + + logging_dir = Path(args.output_dir, args.logging_dir) + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_dir=accelerator_project_config, + ) + if args.report_to == "wandb": + import wandb + + wandb_init = { + "wandb": { + "name": args.wandb_run_name, + "mode": "online", + } + } + + # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate + # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models. + # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate. + if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1: + raise ValueError( + "Gradient accumulation is not supported when training the text encoder in distributed training. " + "Please set gradient_accumulation_steps to 1. 
This feature will be supported in the future." + ) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + global_seed = hash(args.wandb_run_name) % (2**32) + set_seed(global_seed) + + # Generate class images if prior preservation is enabled. + if args.with_prior_preservation: + class_images_dir = Path(args.class_data_dir) + if not class_images_dir.exists(): + class_images_dir.mkdir(parents=True) + cur_class_images = len(list(class_images_dir.iterdir())) + + if cur_class_images < args.num_class_images: + torch_dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32 + if args.prior_generation_precision == "fp32": + torch_dtype = torch.float32 + elif args.prior_generation_precision == "fp16": + torch_dtype = torch.float16 + elif args.prior_generation_precision == "bf16": + torch_dtype = torch.bfloat16 + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + torch_dtype=torch_dtype, + safety_checker=None, + revision=args.revision, + ) + pipeline.set_progress_bar_config(disable=True) + + num_new_images = args.num_class_images - cur_class_images + logger.info(f"Number of class images to sample: {num_new_images}.") + + sample_dataset = PromptDataset(args.class_prompt, num_new_images) + sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) + + sample_dataloader = 
accelerator.prepare(sample_dataloader) + pipeline.to(accelerator.device) + + for example in tqdm( + sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process + ): + images = pipeline(example["prompt"]).images + + for i, image in enumerate(images): + hash_image = hashlib.sha1(image.tobytes()).hexdigest() + image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" + image.save(image_filename) + + del pipeline + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + if args.hub_model_id is None: + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) + else: + repo_name = args.hub_model_id + repo = Repository(args.output_dir, clone_from=repo_name) # noqa: F841 + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + # Load the tokenizer + if args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False) + elif args.pretrained_model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="tokenizer", + revision=args.revision, + use_fast=False, + ) + + # import correct text encoder class + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision) + + # Load scheduler and models + noise_scheduler = DDIMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + + text_encoder = text_encoder_cls.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", 
revision=args.revision + ) + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision) + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision + ) + + if args.use_boft: + config = BOFTConfig( + boft_block_size=args.boft_block_size, + boft_block_num=args.boft_block_num, + boft_n_butterfly_factor=args.boft_n_butterfly_factor, + target_modules=UNET_TARGET_MODULES, + boft_dropout=args.boft_dropout, + bias=args.boft_bias, + ) + unet = get_peft_model(unet, config, adapter_name=args.wandb_run_name) + unet.print_trainable_parameters() + + vae.requires_grad_(False) + unet.train() + + if args.train_text_encoder and args.use_boft: + config = BOFTConfig( + boft_block_size=args.boft_block_size, + boft_block_num=args.boft_block_num, + boft_n_butterfly_factor=args.boft_n_butterfly_factor, + target_modules=TEXT_ENCODER_TARGET_MODULES, + boft_dropout=args.boft_dropout, + bias=args.boft_bias, + ) + text_encoder = get_peft_model(text_encoder, config, adapter_name=args.wandb_run_name) + text_encoder.print_trainable_parameters() + text_encoder.train() + else: + text_encoder.requires_grad_(False) + + # For mixed precision training we cast the text_encoder and vae weights to half-precision + # as these models are only used for inference, keeping weights in full precision is not required. 
+ weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Move unet, vae and text_encoder to device and cast to weight_dtype + unet.to(accelerator.device, dtype=weight_dtype) + vae.to(accelerator.device, dtype=weight_dtype) + text_encoder.to(accelerator.device, dtype=weight_dtype) + + if args.enable_xformers_memory_efficient_attention: + if accelerator.device.type == "xpu": + logger.warn("XPU hasn't support xformers yet, ignore it.") + elif is_xformers_available(): + unet.enable_xformers_memory_efficient_attention() + else: + raise ValueError("xformers is not available. Make sure it is installed correctly") + + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + # below fails when using boft so commenting it out + if args.train_text_encoder and not args.use_boft: + text_encoder.gradient_checkpointing_enable() + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if args.allow_tf32 and torch.cuda.is_available(): + torch.backends.cuda.matmul.allow_tf32 = True + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." 
+ ) + + optimizer_class = bnb.optim.AdamW8bit + else: + optimizer_class = torch.optim.AdamW + + # Optimizer creation + params_to_optimize = [param for param in unet.parameters() if param.requires_grad] + + if args.train_text_encoder: + params_to_optimize += [param for param in text_encoder.parameters() if param.requires_grad] + + optimizer = optimizer_class( + params_to_optimize, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + # Download the official dreambooth dataset from the official repository: https://github.com/google/dreambooth.git + data_path = os.path.join(os.getcwd(), "data", "dreambooth") + if not os.path.exists(data_path): + os.makedirs(os.path.join(os.getcwd(), "data"), exist_ok=True) + os.system(f"git clone https://github.com/google/dreambooth.git '{data_path}'") + + # Dataset and DataLoaders creation: + train_dataset = DreamBoothDataset( + instance_data_root=args.instance_data_dir, + instance_prompt=args.instance_prompt, + class_data_root=args.class_data_dir if args.with_prior_preservation else None, + class_prompt=args.class_prompt, + tokenizer=tokenizer, + size=args.resolution, + center_crop=args.center_crop, + ) + + train_dataloader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.train_batch_size, + shuffle=True, + collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation), + num_workers=args.num_dataloader_workers, + ) + + # Scheduler and math around the number of training steps. 
+ overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + num_cycles=args.lr_num_cycles, + power=args.lr_power, + ) + + # Prepare everything with our `accelerator`. + if args.train_text_encoder: + unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, text_encoder, optimizer, train_dataloader, lr_scheduler + ) + else: + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, optimizer, train_dataloader, lr_scheduler + ) + + # For mixed precision training we cast the text_encoder and vae weights to half-precision + # as these models are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Move vae and text_encoder to device and cast to weight_dtype + vae.to(accelerator.device, dtype=weight_dtype) + if not args.train_text_encoder: + text_encoder.to(accelerator.device, dtype=weight_dtype) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. 
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + accelerator.init_trackers(args.wandb_project_name, config=vars(args), init_kwargs=wandb_init) + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num batches each epoch = {len(train_dataloader)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + global_step = 0 + first_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint != "latest": + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + path = dirs[-1] if len(dirs) > 0 else None + accelerator.print(f"Resuming from checkpoint {path}") + accelerator.load_state(os.path.join(args.output_dir, path)) + global_step = int(path.split("-")[1]) + + resume_global_step = global_step * args.gradient_accumulation_steps + first_epoch = resume_global_step // num_update_steps_per_epoch + resume_step = resume_global_step % num_update_steps_per_epoch + + # Only show the progress bar once on each machine. 
+ progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process) + progress_bar.set_description("Steps") + + if args.train_text_encoder: + text_encoder.train() + + for epoch in range(first_epoch, args.num_train_epochs): + unet.train() + + with TorchTracemalloc() if not args.no_tracemalloc else nullcontext() as tracemalloc: + for step, batch in enumerate(train_dataloader): + # Skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: + if step % args.gradient_accumulation_steps == 0: + progress_bar.update(1) + if args.report_to == "wandb": + accelerator.print(progress_bar) + continue + + with accelerator.accumulate(unet): + # Convert images to latent space + latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample() + latents = latents * vae.config.scaling_factor + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device + ) + timesteps = timesteps.long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Get the text embedding for conditioning + encoder_hidden_states = text_encoder(batch["input_ids"])[0] + + # Predict the noise residual + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + # Get the target for loss depending on the prediction type + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + + if 
args.with_prior_preservation: + # Chunk the noise and model_pred into two parts and compute the loss on each part separately. + model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) + target, target_prior = torch.chunk(target, 2, dim=0) + + # Compute instance loss + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + + # Compute prior loss + prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") + + # Add the prior loss to the instance loss. + loss = loss + args.prior_loss_weight * prior_loss + else: + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + + accelerator.backward(loss) + + if accelerator.sync_gradients: + params_to_clip = ( + itertools.chain(unet.parameters(), text_encoder.parameters()) + if args.train_text_encoder + else unet.parameters() + ) + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + if args.report_to == "wandb": + accelerator.print(progress_bar) + global_step += 1 + + if global_step % args.checkpointing_steps == 0 and global_step != 0: + if accelerator.is_main_process: + save_adaptor(accelerator, global_step, unet, text_encoder, args) + + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + if ( + args.validation_prompt is not None + and (step + num_update_steps_per_epoch * epoch) % args.validation_steps == 0 + and global_step > 10 + ): + unet.eval() + + logger.info( + f"Running validation... \n Generating {len(validation_prompts)} images with prompt:" + f" {validation_prompts[0]}, ......" 
+ ) + # create pipeline + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + safety_checker=None, + revision=args.revision, + ) + # set `keep_fp32_wrapper` to True because we do not want to remove + # mixed precision hooks while we are still training + pipeline.unet = accelerator.unwrap_model(unet, keep_fp32_wrapper=True) + pipeline.text_encoder = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + # run inference + if args.seed is not None: + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) + else: + generator = None + # images = [] + # for _ in range(args.num_validation_images): + # image = pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0] + # images.append(image) + + images = [] + val_img_dir = os.path.join( + args.output_dir, + f"validation/{global_step}", + args.wandb_run_name, + ) + os.makedirs(val_img_dir, exist_ok=True) + + for val_promot in validation_prompts: + image = pipeline(val_promot, num_inference_steps=50, generator=generator).images[0] + image.save(os.path.join(val_img_dir, f"{'_'.join(val_promot.split(' '))}.png"[1:])) + images.append(image) + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + import wandb + + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{i}: {validation_prompts[i]}") + for i, image in enumerate(images) + ] + } + ) + + del pipeline + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() + + if global_step >= args.max_train_steps: + break + + # Printing the 
#!/usr/bin/env bash
# Fine-tune Stable Diffusion with BOFT on one DreamBooth subject.
#
# Usage: bash train_dreambooth.sh [IDX]
#   IDX  integer selecting the subject/class pair (taken modulo 30). When it is
#        omitted the script behaves as if 0 had been passed.

IDX=${1:-0}
PROMPT_IDX=$((IDX % 25))  # NOTE(review): not referenced anywhere below
CLASS_IDX=$((IDX % 30))

# Define the UNIQUE_TOKEN, CLASS_TOKENs, and SUBJECT_NAMES
UNIQUE_TOKEN="qwe"

SUBJECT_NAMES=(
    "backpack" "backpack_dog" "bear_plushie" "berry_bowl" "can"
    "candle" "cat" "cat2" "clock" "colorful_sneaker"
    "dog" "dog2" "dog3" "dog5" "dog6"
    "dog7" "dog8" "duck_toy" "fancy_boot" "grey_sloth_plushie"
    "monster_toy" "pink_sunglasses" "poop_emoji" "rc_car" "red_cartoon"
    "robot_toy" "shiny_sneaker" "teapot" "vase" "wolf_plushie"
)

CLASS_TOKENs=(
    "backpack" "backpack" "stuffed animal" "bowl" "can"
    "candle" "cat" "cat" "clock" "sneaker"
    "dog" "dog" "dog" "dog" "dog"
    "dog" "dog" "toy" "boot" "stuffed animal"
    "toy" "glasses" "toy" "toy" "cartoon"
    "toy" "sneaker" "teapot" "vase" "stuffed animal"
)

CLASS_TOKEN=${CLASS_TOKENs[$CLASS_IDX]}
SELECTED_SUBJECT=${SUBJECT_NAMES[$CLASS_IDX]}

# Prompt endings used for every subject (prompts 1-10).
SHARED_SUFFIXES=(
    "in the jungle"
    "in the snow"
    "on the beach"
    "on a cobblestone street"
    "on top of pink fabric"
    "on top of a wooden floor"
    "with a city in the background"
    "with a mountain in the background"
    "with a blue house in the background"
    "on top of a purple rug in a forest"
)

# Prompts 11-20 for the class indices matched by the regex below.
OBJECT_SUFFIXES=(
    "with a wheat field in the background"
    "with a tree and autumn leaves in the background"
    "with the Eiffel Tower in the background"
    "floating on top of water"
    "floating in an ocean of milk"
    "on top of green grass with sunflowers around it"
    "on top of a mirror"
    "on top of the sidewalk in a crowded street"
    "on top of a dirt road"
    "on top of a white rug"
)

# Prompts 11-20 for all remaining class indices (the cat and dog subjects).
LIVE_SUFFIXES=(
    "wearing a red hat"
    "wearing a santa hat"
    "wearing a rainbow scarf"
    "wearing a black top hat and a monocle"
    "in a chef outfit"
    "in a firefighter outfit"
    "in a police outfit"
    "wearing pink glasses"
    "wearing a yellow shirt"
    "in a purple wizard outfit"
)

# Attribute prompts used for every subject (prompts 21-25).
ADJECTIVES=("red" "purple" "shiny" "wet" "cube shaped")

if [[ $CLASS_IDX =~ ^(0|1|2|3|4|5|8|9|17|18|19|20|21|22|23|24|25|26|27|28|29)$ ]]; then
    SUBJECT_SUFFIXES=("${OBJECT_SUFFIXES[@]}")
else
    SUBJECT_SUFFIXES=("${LIVE_SUFFIXES[@]}")
fi

# PROMPT_LIST holds the prompts with the unique token and a trailing period;
# prompt_test_list holds the same prompts without the token and without the period.
PROMPT_LIST=()
prompt_test_list=()
for suffix in "${SHARED_SUFFIXES[@]}" "${SUBJECT_SUFFIXES[@]}"; do
    PROMPT_LIST+=("a ${UNIQUE_TOKEN} ${CLASS_TOKEN} ${suffix}.")
    prompt_test_list+=("a ${CLASS_TOKEN} ${suffix}")
done
for adjective in "${ADJECTIVES[@]}"; do
    PROMPT_LIST+=("a ${adjective} ${UNIQUE_TOKEN} ${CLASS_TOKEN}.")
    prompt_test_list+=("a ${adjective} ${CLASS_TOKEN}")
done

# All prompts are handed over as ONE space-joined string; [*] makes the join explicit.
# NOTE(review): presumably the training script splits this string on "." - confirm.
VALIDATION_PROMPT="${PROMPT_LIST[*]}"
INSTANCE_PROMPT="a photo of ${UNIQUE_TOKEN} ${CLASS_TOKEN}"
CLASS_PROMPT="a photo of ${CLASS_TOKEN}"

export MODEL_NAME="stabilityai/stable-diffusion-2-1"
# export MODEL_NAME="runwayml/stable-diffusion-v1-5"

PEFT_TYPE="boft"
BLOCK_NUM=8
BLOCK_SIZE=0
N_BUTTERFLY_FACTOR=1

export PROJECT_NAME="dreambooth_${PEFT_TYPE}"
export RUN_NAME="${SELECTED_SUBJECT}_${PEFT_TYPE}_${BLOCK_NUM}${BLOCK_SIZE}${N_BUTTERFLY_FACTOR}"
export INSTANCE_DIR="./data/dreambooth/dataset/${SELECTED_SUBJECT}"
export CLASS_DIR="./data/class_data/${CLASS_TOKEN}"
export OUTPUT_DIR="./data/output/${PEFT_TYPE}"

# Every expansion is quoted because class tokens such as "stuffed animal" contain spaces.
# The final argument deliberately has no trailing line continuation.
accelerate launch train_dreambooth.py \
    --pretrained_model_name_or_path="$MODEL_NAME" \
    --instance_data_dir="$INSTANCE_DIR" \
    --class_data_dir="$CLASS_DIR" \
    --output_dir="$OUTPUT_DIR" \
    --wandb_project_name="$PROJECT_NAME" \
    --wandb_run_name="$RUN_NAME" \
    --with_prior_preservation --prior_loss_weight=1.0 \
    --instance_prompt="$INSTANCE_PROMPT" \
    --validation_prompt="$VALIDATION_PROMPT" \
    --class_prompt="$CLASS_PROMPT" \
    --resolution=512 \
    --train_batch_size=1 \
    --num_dataloader_workers=2 \
    --lr_scheduler="constant" \
    --lr_warmup_steps=0 \
    --num_class_images=200 \
    --use_boft \
    --boft_block_num="$BLOCK_NUM" \
    --boft_block_size="$BLOCK_SIZE" \
    --boft_n_butterfly_factor="$N_BUTTERFLY_FACTOR" \
    --boft_dropout=0.1 \
    --boft_bias="boft_only" \
    --learning_rate=3e-5 \
    --max_train_steps=1010 \
    --checkpointing_steps=200 \
    --validation_steps=200 \
    --enable_xformers_memory_efficient_attention \
    --report_to="wandb"
def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
    """Return the ``"<namespace>/<model_id>"`` repository name.

    Args:
        model_id: Name of the repository without a namespace.
        organization: Namespace to use. When given, it is used as-is and no
            token lookup or ``whoami`` request is made.
        token: Token passed to ``whoami`` to resolve the user name. Only used
            when ``organization`` is ``None``; if it is also ``None``, the value
            returned by ``HfFolder.get_token()`` is used instead.

    Returns:
        ``f"{organization}/{model_id}"`` when an organization is given, otherwise
        ``f"{username}/{model_id}"`` with the name reported by ``whoami``.
    """
    # Resolve the cheap case first: an explicit organization needs no credentials.
    if organization is not None:
        return f"{organization}/{model_id}"

    if token is None:
        token = HfFolder.get_token()
    username = whoami(token)["name"]
    return f"{username}/{model_id}"
"--instance_data_dir", + type=str, + default=None, + required=True, + help="A folder containing the training data of instance images.", + ) + parser.add_argument( + "--class_data_dir", + type=str, + default=None, + required=False, + help="A folder containing the training data of class images.", + ) + parser.add_argument( + "--instance_prompt", + type=str, + default=None, + required=True, + help="The prompt with identifier specifying the instance", + ) + parser.add_argument( + "--class_prompt", + type=str, + default=None, + help="The prompt to specify images in the same class as provided instance images.", + ) + parser.add_argument( + "--with_prior_preservation", + default=False, + action="store_true", + help="Flag to add prior preservation loss.", + ) + parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") + parser.add_argument( + "--num_class_images", + type=int, + default=100, + help=( + "Minimal class images for prior preservation loss. If there are not enough images already present in" + " class_data_dir, additional images will be sampled with class_prompt." + ), + ) + parser.add_argument( + "--validation_prompt", + nargs="+", + help="A prompt that is used during validation to verify that the model is learning.", + ) + parser.add_argument( + "--num_validation_images", + type=int, + default=4, + help="Number of images that should be generated during validation with `validation_prompt`.", + ) + parser.add_argument( + "--validation_steps", + type=int, + default=500, + help=( + "Run dreambooth validation every X steps. Dreambooth validation consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`." 
+ ), + ) + parser.add_argument( + "--output_dir", + type=str, + default="text-inversion-model", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution" + ) + parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder") + + parser.add_argument( + "--set_grads_to_none", + action="store_true", + help=( + "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain" + " behaviors, so disable this argument if it causes any problems. More info:" + " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html" + ), + ) + + # boft args + parser.add_argument("--use_boft", action="store_true", help="Whether to use BOFT for parameter efficient tuning") + parser.add_argument("--boft_block_num", type=int, default=4, help="The number of BOFT blocks") + parser.add_argument("--boft_block_size", type=int, default=0, help="The size of BOFT blocks") + parser.add_argument("--boft_n_butterfly_factor", type=int, default=2, help="The number of butterfly factors") + parser.add_argument("--boft_dropout", type=float, default=0.1, help="BOFT dropout, only used if use_boft is True") + parser.add_argument( + "--boft_bias", + type=str, + default="none", + help="Bias type for BOFT. Can be 'none', 'all' or 'boft_only', only used if use_boft is True", + ) + parser.add_argument( + "--num_dataloader_workers", type=int, default=1, help="Num of workers for the training dataloader." 
+ ) + parser.add_argument( + "--no_tracemalloc", + default=False, + action="store_true", + help="Flag to stop memory allocation tracing during training. This could speed up training on Windows.", + ) + + parser.add_argument( + "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument( + "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." + ) + parser.add_argument("--num_train_epochs", type=int, default=1) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final" + " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" + " training using `--resume_from_checkpoint`." + ), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' 
+ ), + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-6, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--lr_num_cycles", + type=int, + default=1, + help="Number of hard resets of the lr in cosine_with_restarts scheduler.", + ) + parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." 
+ ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="wandb", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' 
+ ), + ) + parser.add_argument( + "--wandb_key", + type=str, + default=None, + help=("If report to option is set to wandb, api-key for wandb used for login to wandb "), + ) + parser.add_argument( + "--wandb_project_name", + type=str, + default=None, + help=("If report to option is set to wandb, project name in wandb for log tracking "), + ) + parser.add_argument( + "--wandb_run_name", + type=str, + default=None, + help=("If report to option is set to wandb, project name in wandb for log tracking "), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--prior_generation_precision", + type=str, + default=None, + choices=["no", "fp32", "fp16", "bf16"], + help=( + "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to fp16 if a GPU is available else fp32." + ), + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
+ ) + + if input_args is not None: + args = parser.parse_args(input_args) + else: + args = parser.parse_args() + + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + # Sanity checks + # if args.dataset_name is None and args.train_data_dir is None: + # raise ValueError("Need either a dataset name or a training folder.") + + if args.with_prior_preservation: + if args.class_data_dir is None: + raise ValueError("You must specify a data directory for class images.") + if args.class_prompt is None: + raise ValueError("You must specify prompt for class images.") + else: + # logger is not available yet + if args.class_data_dir is not None: + warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") + if args.class_prompt is not None: + warnings.warn("You need not use --class_prompt without --with_prior_preservation.") + + return args diff --git a/peft/examples/boft_dreambooth/utils/dataset.py b/peft/examples/boft_dreambooth/utils/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..7a968705cf495906ace19f73c257bfcb4255c9ad --- /dev/null +++ b/peft/examples/boft_dreambooth/utils/dataset.py @@ -0,0 +1,126 @@ +from pathlib import Path + +import torch +from PIL import Image +from torch.utils.data import Dataset +from torchvision import transforms + + +class DreamBoothDataset(Dataset): + """ + A dataset to prepare the instance and class images with the prompts for fine-tuning the model. + It pre-processes the images and the tokenizes prompts. 
+ """ + + def __init__( + self, + instance_data_root, + instance_prompt, + tokenizer, + class_data_root=None, + class_prompt=None, + size=512, + center_crop=False, + ): + self.size = size + self.center_crop = center_crop + self.tokenizer = tokenizer + + self.instance_data_root = Path(instance_data_root) + if not self.instance_data_root.exists(): + raise ValueError("Instance images root doesn't exists.") + + self.instance_images_path = list(Path(instance_data_root).iterdir()) + self.num_instance_images = len(self.instance_images_path) + self.instance_prompt = instance_prompt + self._length = self.num_instance_images + + if class_data_root is not None: + self.class_data_root = Path(class_data_root) + self.class_data_root.mkdir(parents=True, exist_ok=True) + self.class_images_path = list(self.class_data_root.iterdir()) + self.num_class_images = len(self.class_images_path) + self._length = max(self.num_class_images, self.num_instance_images) + self.class_prompt = class_prompt + else: + self.class_data_root = None + + self.image_transforms = transforms.Compose( + [ + transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def __len__(self): + return self._length + + def __getitem__(self, index): + example = {} + instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + example["instance_images"] = self.image_transforms(instance_image) + example["instance_prompt_ids"] = self.tokenizer( + self.instance_prompt, + truncation=True, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids + + if self.class_data_root: + class_image = Image.open(self.class_images_path[index % self.num_class_images]) + if not class_image.mode == "RGB": + 
def collate_fn(examples, with_prior_preservation=False):
    """Merge a list of dataset examples into one training batch.

    Instance entries always come first. With ``with_prior_preservation`` the
    class entries of every example are appended after them, so that instance
    and class samples go through the model in a single forward pass.

    Returns a dict with ``"input_ids"`` (prompt ids concatenated along dim 0)
    and ``"pixel_values"`` (images stacked into one contiguous float tensor).
    """

    def gather(key):
        # Pull one field out of every example, preserving example order.
        return [sample[key] for sample in examples]

    token_rows = gather("instance_prompt_ids")
    image_rows = gather("instance_images")

    if with_prior_preservation:
        token_rows.extend(gather("class_prompt_ids"))
        image_rows.extend(gather("class_images"))

    images = torch.stack(image_rows).to(memory_format=torch.contiguous_format).float()
    return {
        "input_ids": torch.cat(token_rows, dim=0),
        "pixel_values": images,
    }
# This context manager is used to track the peak memory usage of the process
class TorchTracemalloc:
    """Record accelerator and CPU memory usage of the code inside a ``with`` block.

    On entry the accelerator cache is emptied, its peak statistics are reset and
    a daemon thread starts sampling the process resident set size. On exit the
    following attributes are set:

    * ``begin`` / ``end`` / ``peak``: accelerator bytes as reported by the device module.
    * ``used`` / ``peaked``: ``end - begin`` and ``peak - begin`` converted with ``b2mb``.
    * ``cpu_begin`` / ``cpu_end`` / ``cpu_peak``: resident set size in bytes.
    * ``cpu_used`` / ``cpu_peaked``: the matching deltas converted with ``b2mb``.
    """

    @staticmethod
    def _resolve_device_type():
        """Return the accelerator type name, falling back to ``"cuda"``."""
        if hasattr(torch, "accelerator"):
            accelerator = torch.accelerator.current_accelerator()
            # current_accelerator() may return None (no accelerator); calling
            # `.type` on it unconditionally would raise AttributeError.
            if accelerator is not None:
                return accelerator.type
        return "cuda"

    def __enter__(self):
        self.device_type = self._resolve_device_type()
        self.device_module = getattr(torch, self.device_type, torch.cuda)
        gc.collect()
        self.device_module.empty_cache()
        self.device_module.reset_peak_memory_stats()  # reset the peak gauge to zero
        self.begin = self.device_module.memory_allocated()
        self.process = psutil.Process()

        self.cpu_begin = self.cpu_mem_used()
        # Initialise before the thread starts so __exit__ always finds the attribute.
        self.cpu_peak = -1
        self.peak_monitoring = True
        self._peak_monitor_thread = threading.Thread(target=self.peak_monitor_func)
        self._peak_monitor_thread.daemon = True
        self._peak_monitor_thread.start()
        return self

    def cpu_mem_used(self):
        """get resident set size memory for the current process"""
        return self.process.memory_info().rss

    def peak_monitor_func(self):
        """Sample the resident set size in a tight loop until monitoring is switched off."""
        self.cpu_peak = -1

        while True:
            self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak)

            # can't sleep or will not catch the peak right (this comment is here on purpose)
            # time.sleep(0.001) # 1msec

            if not self.peak_monitoring:
                break

    def __exit__(self, *exc):
        self.peak_monitoring = False
        # Wait for the sampler to see the flag so that cpu_peak is final when read below.
        monitor_thread = getattr(self, "_peak_monitor_thread", None)
        if monitor_thread is not None:
            monitor_thread.join()

        gc.collect()
        self.device_module.empty_cache()
        self.end = self.device_module.memory_allocated()
        self.peak = self.device_module.max_memory_allocated()
        self.used = b2mb(self.end - self.begin)
        self.peaked = b2mb(self.peak - self.begin)

        self.cpu_end = self.cpu_mem_used()
        self.cpu_used = b2mb(self.cpu_end - self.cpu_begin)
        self.cpu_peaked = b2mb(self.cpu_peak - self.cpu_begin)
        # print(f"delta used/peak {self.used:4d}/{self.peaked:4d}")
Further analysis demonstrates that BAT enhances model capabilities by leveraging its nonlinear design. + + +## Quick Start +```python +import torch +from peft import BoneConfig, get_peft_model +from transformers import AutoTokenizer, AutoModelForCausalLM +from trl import SFTConfig, SFTTrainer +from datasets import load_dataset + +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") +tokenizer.pad_token_id = tokenizer.eos_token_id +bone_config = BoneConfig( + r = 64 +) +#Bat performs better than Bone, but it uses more memory and is twice as slow. If you want to use the Bat method, you only need to add the parameter init_weights="bat". +# bone_config = BoneConfig( +# r = 64, +# init_weights="bat" +# ) +peft_model = get_peft_model(model, bone_config) + +peft_model.print_trainable_parameters() + +dataset = load_dataset("imdb", split="train[:1%]") + +training_args = SFTConfig(dataset_text_field="text", max_seq_length=128) +trainer = SFTTrainer( + model=peft_model, + args=training_args, + train_dataset=dataset, + processing_class=tokenizer, +) +trainer.train() +peft_model.save_pretrained("bone-llama-2-7b") +``` + + +To use the fine-tuned Bone modules, load them on top of the base model with the following code: +```python +import torch +from peft import PeftModel +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto" +) +peft_model = PeftModel.from_pretrained(model, "bone-llama-2-7b") +``` + +## Advanced Usage + +### Fine-tune +```shell +#Bat performs better than Bone, but it uses more memory and is twice as slow. If you want to use the Bat method, you only need to add the parameter init_weights="bat". 
+python bone_finetuning.py \ + --base_model_name_or_path meta-llama/Llama-2-7b-hf \ + --output_dir output/bone-llama-2-7b-metamath-10k \ + --bone_r 64 \ + --init_weights True \ + --bits bf16 \ + --data_path meta-math/MetaMathQA \ + --dataset_split train[:100000] \ + --dataset_field query response \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 8 \ + --save_strategy "steps" \ + --save_steps 1000 \ + --save_total_limit 1 \ + --logging_steps 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --tf32 True \ + --report_to none +``` + + + +# Citation +```bib +@misc{kang2025dishadimensionshardingadaptationlarge, + title={DiSHA: Dimension-Sharding Adaptation of Large Language Models with Fast Convergence and Fast Computation}, + author={Jiale Kang}, + year={2025}, + eprint={2409.15371}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://huggingface.co/papers/2409.15371}, +} \ No newline at end of file diff --git a/peft/examples/bone_finetuning/bone_finetuning.py b/peft/examples/bone_finetuning/bone_finetuning.py new file mode 100644 index 0000000000000000000000000000000000000000..25c8e4dcd653d526bb3c50285a222f2ca12cacd7 --- /dev/null +++ b/peft/examples/bone_finetuning/bone_finetuning.py @@ -0,0 +1,105 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Fine-tune a causal language model with a Bone (or Bat) adapter using TRL's ``SFTTrainer``.

The script:

1. parses ``ScriptArguments`` (all ``SFTConfig`` options plus the Bone-specific ones) from the command line,
2. loads the base model and tokenizer in the requested precision,
3. wraps the model with a ``BoneConfig`` adapter on the attention and MLP projection modules,
4. renders two dataset columns into a single ``"text"`` prompt/response field and trains on it,
5. saves the adapter to ``<output_dir>/bone_ft`` and, if requested, the merged model to
   ``<output_dir>/bone_merged``.
"""

import os
from dataclasses import dataclass, field
from typing import Literal, Optional

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
from trl import SFTConfig, SFTTrainer

from peft import BoneConfig, get_peft_model


@dataclass
class ScriptArguments(SFTConfig):
    """Command-line arguments: every ``SFTConfig`` option plus the model/dataset options below."""

    # model configs
    base_model_name_or_path: Optional[str] = field(
        default=None, metadata={"help": "The name or path of the fp32/16 base model."}
    )
    bits: str = field(default="bf16", metadata={"help": "(`['bf16', 'fp16', fp32]`)"})
    init_weights: Literal[True, "bat"] = field(
        default=True,
        metadata={
            "help": ("True -> Bone; `bat` -> Bat"),
        },
    )
    bone_r: int = field(default=16)
    merge_and_save: bool = field(default=False)
    # dataset configs
    data_path: str = field(default="imdb", metadata={"help": "Path to the training data."})
    dataset_split: str = field(default="train[:1%]", metadata={"help": "(`['train', 'test', 'eval']`):"})
    # Two column names are expected: the first is rendered as the user prompt, the second as the
    # assistant response (see the `dataset.map` call below).
    dataset_field: Optional[list[str]] = field(
        default=None, metadata={"help": "Fields of dataset input and output."}
    )


parser = HfArgumentParser(ScriptArguments)
script_args = parser.parse_args_into_dataclasses()[0]
print(script_args)

print(f"Load pre-processed residual model in {script_args.bits} bits.")

# Fail fast with an explicit error. Previously an unsupported `bits` value or a missing base model
# skipped model creation and the script crashed later with a NameError on `peft_model`; a missing
# `dataset_field` only surfaced as a TypeError inside `dataset.map`, after the model had been loaded.
if script_args.bits in ["nf4", "fp4", "int8"]:
    raise ValueError("Bone currently does not support quantization.")
if script_args.base_model_name_or_path is None:
    raise ValueError("`--base_model_name_or_path` is required to initialize a Bone model.")
if script_args.dataset_field is None or len(script_args.dataset_field) < 2:
    raise ValueError("`--dataset_field` must name two dataset columns: the input field and the output field.")
prompt_field, response_field = script_args.dataset_field[:2]

print(f"No available pre-processed model, manually initialize a Bone using {script_args.base_model_name_or_path}.")
# Any `bits` value other than "fp16"/"bf16" falls back to full precision.
torch_dtype = {"fp16": torch.float16, "bf16": torch.bfloat16}.get(script_args.bits, torch.float32)
model = AutoModelForCausalLM.from_pretrained(
    script_args.base_model_name_or_path,
    torch_dtype=torch_dtype,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name_or_path)
# Reuse the EOS token for padding.
tokenizer.pad_token_id = tokenizer.eos_token_id
bone_config = BoneConfig(
    r=script_args.bone_r,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM",
    init_weights=script_args.init_weights,
)
peft_model = get_peft_model(model, bone_config)

print(peft_model)
peft_model.print_trainable_parameters()

print(f"Training Bone with trl on the {script_args.data_path}[{script_args.dataset_split}] dataset.")
dataset = load_dataset(script_args.data_path, split=script_args.dataset_split)
# Render each example into the single "text" column used for training.
dataset = dataset.map(
    lambda example: {"text": f"### USER: {example[prompt_field]}\n### ASSISTANT: {example[response_field]}"}
)

trainer = SFTTrainer(
    model=peft_model,
    args=script_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)
trainer.train()
trainer.save_state()

# Save only the adapter weights.
peft_model.save_pretrained(
    os.path.join(script_args.output_dir, "bone_ft"),
)

if script_args.merge_and_save:
    # Fold the adapter into the base weights and save the merged model alongside its tokenizer.
    model = peft_model.merge_and_unload()
    model.save_pretrained(os.path.join(script_args.output_dir, "bone_merged"))
    tokenizer.save_pretrained(os.path.join(script_args.output_dir, "bone_merged"))
+mixed_precision: 'no' +num_machines: 1 +num_processes: 1 +rdzv_backend: static +same_network: true +use_cpu: false \ No newline at end of file diff --git a/peft/examples/causal_language_modeling/peft_ln_tuning_clm.ipynb b/peft/examples/causal_language_modeling/peft_ln_tuning_clm.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..f4f75e79c736a9b4304ab4b3b9565b178293fc51 --- /dev/null +++ b/peft/examples/causal_language_modeling/peft_ln_tuning_clm.ipynb @@ -0,0 +1,1391 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "71fbfca2", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoModelForCausalLM\n", + "from peft import get_peft_config, get_peft_model, LNTuningConfig, TaskType, PeftType\n", + "import torch\n", + "from datasets import load_dataset\n", + "import os\n", + "from transformers import AutoTokenizer\n", + "from torch.utils.data import DataLoader\n", + "from transformers import default_data_collator, get_linear_schedule_with_warmup\n", + "from tqdm import tqdm\n", + "\n", + "# Hyper-parameters\n", + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "model_name_or_path = \"bigscience/bloomz-560m\"\n", + "tokenizer_name_or_path = \"bigscience/bloomz-560m\"\n", + "peft_config = LNTuningConfig(\n", + " task_type=TaskType.CAUSAL_LM,\n", + ")\n", + "\n", + "dataset_name = \"twitter_complaints\"\n", + "checkpoint_name = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt\".replace(\n", + " \"/\", \"_\"\n", + ")\n", + "text_column = \"Tweet text\"\n", + "label_column = \"text_label\"\n", + "max_length = 64\n", + "lr = 5e-2\n", + "num_epochs = 50\n", + "batch_size = 8" + ] + }, + { + "cell_type": "markdown", + "id": "a617882d", + "metadata": {}, + "source": [ + "## Load and Process Dataset for LM Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"e1a3648b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Unlabeled', 'complaint', 'no complaint']\n", + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['Tweet text', 'ID', 'Label', 'text_label'],\n", + " num_rows: 50\n", + " })\n", + " test: Dataset({\n", + " features: ['Tweet text', 'ID', 'Label', 'text_label'],\n", + " num_rows: 3399\n", + " })\n", + "})\n" + ] + }, + { + "data": { + "text/plain": [ + "{'Tweet text': '@HMRCcustomers No this is my first job',\n", + " 'ID': 0,\n", + " 'Label': 2,\n", + " 'text_label': 'no complaint'}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = load_dataset(\n", + " \"parquet\",\n", + " data_files={\n", + " \"train\": f\"hf://datasets/ought/raft@refs/convert/parquet/{dataset_name}/train/0000.parquet\",\n", + " \"test\": f\"hf://datasets/ought/raft@refs/convert/parquet/{dataset_name}/test/0000.parquet\"\n", + " }\n", + ")\n", + "\n", + "classes = [k.replace(\"_\", \" \") for k in dataset[\"train\"].features[\"Label\"].names]\n", + "print(classes)\n", + "dataset = dataset.map(\n", + " lambda x: {\"text_label\": [classes[label] for label in x[\"Label\"]]},\n", + " batched=True,\n", + " num_proc=1,\n", + ")\n", + "print(dataset)\n", + "dataset[\"train\"][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fe12d4d3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Running tokenizer on dataset: 100%|██████████| 50/50 [00:00<00:00, 3551.43 examples/s]\n", + "Running tokenizer on dataset: 100%|██████████| 3399/3399 [00:00<00:00, 8558.01 examples/s]\n" + ] + } + ], + "source": [ + "# data preprocessing\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n", + "if tokenizer.pad_token_id is None:\n", + " tokenizer.pad_token_id = 
tokenizer.eos_token_id\n", + "target_max_length = max([len(tokenizer(class_label)[\"input_ids\"]) for class_label in classes])\n", + "print(target_max_length)\n", + "\n", + "\n", + "def preprocess_function(examples):\n", + " batch_size = len(examples[text_column])\n", + " inputs = [f\"{text_column} : {x} Label : \" for x in examples[text_column]]\n", + " targets = [str(x) for x in examples[label_column]]\n", + " model_inputs = tokenizer(inputs)\n", + " labels = tokenizer(targets, add_special_tokens=False) # don't add bos token because we concatenate with inputs\n", + " for i in range(batch_size):\n", + " sample_input_ids = model_inputs[\"input_ids\"][i]\n", + " label_input_ids = labels[\"input_ids\"][i] + [tokenizer.eos_token_id]\n", + " # print(i, sample_input_ids, label_input_ids)\n", + " model_inputs[\"input_ids\"][i] = sample_input_ids + label_input_ids\n", + " labels[\"input_ids\"][i] = [-100] * len(sample_input_ids) + label_input_ids\n", + " model_inputs[\"attention_mask\"][i] = [1] * len(model_inputs[\"input_ids\"][i])\n", + " # print(model_inputs)\n", + " for i in range(batch_size):\n", + " sample_input_ids = model_inputs[\"input_ids\"][i]\n", + " label_input_ids = labels[\"input_ids\"][i]\n", + " model_inputs[\"input_ids\"][i] = [tokenizer.pad_token_id] * (\n", + " max_length - len(sample_input_ids)\n", + " ) + sample_input_ids\n", + " model_inputs[\"attention_mask\"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[\n", + " \"attention_mask\"\n", + " ][i]\n", + " labels[\"input_ids\"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids\n", + " model_inputs[\"input_ids\"][i] = torch.tensor(model_inputs[\"input_ids\"][i][:max_length])\n", + " model_inputs[\"attention_mask\"][i] = torch.tensor(model_inputs[\"attention_mask\"][i][:max_length])\n", + " labels[\"input_ids\"][i] = torch.tensor(labels[\"input_ids\"][i][:max_length])\n", + " model_inputs[\"labels\"] = labels[\"input_ids\"]\n", + " return model_inputs\n", + "\n", + 
"\n", + "processed_datasets = dataset.map(\n", + " preprocess_function,\n", + " batched=True,\n", + " num_proc=1,\n", + " remove_columns=dataset[\"train\"].column_names,\n", + " load_from_cache_file=False,\n", + " desc=\"Running tokenizer on dataset\",\n", + ")\n", + "\n", + "train_dataset = processed_datasets[\"train\"]\n", + "eval_dataset = processed_datasets[\"train\"]\n", + "\n", + "\n", + "train_dataloader = DataLoader(\n", + " train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True\n", + ")\n", + "eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "641b21fe", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Running tokenizer on dataset: 100%|██████████| 3399/3399 [00:00<00:00, 17380.64 examples/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "{'input_ids': tensor([[ 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 227985, 5484, 915, 2566, 74757, 64626, 12384, 44639, 613,\n", + " 52282, 2670, 79920, 3344, 1002, 368, 17646, 14472, 8348,\n", + " 664, 718, 4, 19036, 17, 31849, 17, 6312, 76,\n", + " 44, 62470, 56, 91, 50, 14839, 21, 77658, 915,\n", + " 210],\n", + " [ 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 227985, 5484, 915, 405, 187059,\n", + " 2256, 664, 2550, 18833, 18607, 162467, 4, 1387, 6199,\n", + " 3291, 23405, 613, 4657, 17082, 566, 3432, 368, 78851,\n", + " 1185, 61273, 23181, 1553, 15596, 212, 116057, 77658, 915,\n", + " 210],\n", + " [ 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 227985, 5484,\n", + " 915, 39762, 2566, 22253, 6201, 75701, 15, 632, 718,\n", + " 5840, 10006, 6201, 18881, 427, 3804, 19528, 267, 
158974,\n", + " 1320, 368, 10029, 632, 49666, 92, 34, 77658, 915,\n", + " 210],\n", + " [ 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 227985, 5484, 915, 2566, 104565, 8695, 2089, 6140,\n", + " 109676, 99579, 1369, 512, 368, 4570, 54, 632, 368,\n", + " 1503, 241485, 132226, 15, 982, 727, 1152, 18100, 861,\n", + " 32596, 77597, 168154, 1306, 132226, 4346, 87843, 17, 130462,\n", + " 364, 32923, 89, 53, 8309, 20, 75, 77658, 915,\n", + " 210],\n", + " [ 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 227985, 5484, 915, 2566,\n", + " 14173, 2960, 29906, 387, 20706, 49337, 1369, 77658, 915,\n", + " 210],\n", + " [ 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 227985, 5484, 915, 2566, 219553, 45736,\n", + " 36876, 1713, 72, 707, 187205, 13002, 177324, 77658, 915,\n", + " 210],\n", + " [ 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 227985, 5484, 915, 2566, 233938, 28518, 13716,\n", + " 427, 28146, 1119, 17918, 17, 236706, 368, 214997, 7555,\n", + " 48659, 5276, 21600, 343, 17, 51416, 22403, 318, 1531,\n", + " 1306, 1130, 20934, 567, 101161, 184849, 87843, 17, 1594,\n", + " 15231, 2052, 16642, 20, 7180, 80, 26, 77658, 915,\n", + " 210],\n", + " [ 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 3, 3, 3, 3, 3, 3, 3, 3, 3,\n", + " 227985, 5484, 915, 2566, 80, 2068, 479, 2566, 80,\n", + " 1376, 878, 147587, 3904, 632, 368, 6084, 65673, 78851,\n", + " 11736, 15527, 19082, 33151, 461, 17, 45575, 17887, 632,\n", + " 5219, 14216, 68870, 5967, 1841, 4346, 87843, 17, 1594,\n", + " 14512, 27, 71, 8184, 19, 290, 63748, 77658, 915,\n", + " 210]]),\n", + " 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", + " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", + " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", + " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", + " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", + " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", + " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", + " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def test_preprocess_function(examples):\n", + " batch_size = len(examples[text_column])\n", + " inputs = [f\"{text_column} : {x} Label : \" for x in examples[text_column]]\n", + " model_inputs = tokenizer(inputs)\n", + " # 
print(model_inputs)\n", + " for i in range(batch_size):\n", + " sample_input_ids = model_inputs[\"input_ids\"][i]\n", + " model_inputs[\"input_ids\"][i] = [tokenizer.pad_token_id] * (\n", + " max_length - len(sample_input_ids)\n", + " ) + sample_input_ids\n", + " model_inputs[\"attention_mask\"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[\n", + " \"attention_mask\"\n", + " ][i]\n", + " model_inputs[\"input_ids\"][i] = torch.tensor(model_inputs[\"input_ids\"][i][:max_length])\n", + " model_inputs[\"attention_mask\"][i] = torch.tensor(model_inputs[\"attention_mask\"][i][:max_length])\n", + " return model_inputs\n", + "\n", + "\n", + "test_dataset = dataset[\"test\"].map(\n", + " test_preprocess_function,\n", + " batched=True,\n", + " num_proc=1,\n", + " remove_columns=dataset[\"train\"].column_names,\n", + " load_from_cache_file=False,\n", + " desc=\"Running tokenizer on dataset\",\n", + ")\n", + "\n", + "test_dataloader = DataLoader(test_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)\n", + "next(iter(test_dataloader))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "218df807", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "425" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# show dataset size\n", + "len(test_dataloader)" + ] + }, + { + "cell_type": "markdown", + "id": "aa55f803", + "metadata": {}, + "source": [ + "## Train the LM with LNTuning\n", + "1. Create the base LM.\n", + "2. Only activate the LayerNorm layers in the LM for training.\n", + "3. Train the LM on the training dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a773e092", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 100,352 || all params: 559,314,944 || trainable%: 0.017941948642087417\n" + ] + } + ], + "source": [ + "# 1. 
creating the base LM\n", + "model = AutoModelForCausalLM.from_pretrained(model_name_or_path)\n", + "# 2. Only activate the LayerNorm layers in the Attention blocks in the LM for training\n", + "model = get_peft_model(model, peft_config)\n", + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b2f91568", + "metadata": {}, + "outputs": [], + "source": [ + "# setup the optimizer and lr scheduler\n", + "optimizer = torch.optim.AdamW(model.parameters(), lr=lr)\n", + "lr_scheduler = get_linear_schedule_with_warmup(\n", + " optimizer=optimizer,\n", + " num_warmup_steps=0,\n", + " num_training_steps=(len(train_dataloader) * num_epochs),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e4fb69fc", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 7.09it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 23.05it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=0: train_ppl=tensor(8.1918, device='cuda:0') train_epoch_loss=tensor(2.1031, device='cuda:0') eval_ppl=tensor(2.1760, device='cuda:0') eval_epoch_loss=tensor(0.7775, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.88it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 23.11it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=1: train_ppl=tensor(1.8009, device='cuda:0') train_epoch_loss=tensor(0.5883, device='cuda:0') eval_ppl=tensor(2.1198, device='cuda:0') eval_epoch_loss=tensor(0.7513, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.87it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 23.08it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=2: train_ppl=tensor(2.0387, device='cuda:0') 
train_epoch_loss=tensor(0.7123, device='cuda:0') eval_ppl=tensor(1.6793, device='cuda:0') eval_epoch_loss=tensor(0.5184, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.92it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 23.03it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=3: train_ppl=tensor(1.4885, device='cuda:0') train_epoch_loss=tensor(0.3978, device='cuda:0') eval_ppl=tensor(1.2918, device='cuda:0') eval_epoch_loss=tensor(0.2561, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.89it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 23.00it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=4: train_ppl=tensor(1.3062, device='cuda:0') train_epoch_loss=tensor(0.2671, device='cuda:0') eval_ppl=tensor(1.3259, device='cuda:0') eval_epoch_loss=tensor(0.2821, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.79it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.92it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=5: train_ppl=tensor(1.3129, device='cuda:0') train_epoch_loss=tensor(0.2722, device='cuda:0') eval_ppl=tensor(1.2315, device='cuda:0') eval_epoch_loss=tensor(0.2082, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.83it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.93it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=6: train_ppl=tensor(1.2605, device='cuda:0') train_epoch_loss=tensor(0.2315, device='cuda:0') eval_ppl=tensor(1.2705, device='cuda:0') eval_epoch_loss=tensor(0.2394, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"100%|██████████| 7/7 [00:00<00:00, 10.87it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.79it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=7: train_ppl=tensor(1.2452, device='cuda:0') train_epoch_loss=tensor(0.2193, device='cuda:0') eval_ppl=tensor(1.2103, device='cuda:0') eval_epoch_loss=tensor(0.1909, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.88it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.87it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=8: train_ppl=tensor(1.2185, device='cuda:0') train_epoch_loss=tensor(0.1976, device='cuda:0') eval_ppl=tensor(1.2127, device='cuda:0') eval_epoch_loss=tensor(0.1929, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.83it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.80it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=9: train_ppl=tensor(1.1868, device='cuda:0') train_epoch_loss=tensor(0.1713, device='cuda:0') eval_ppl=tensor(1.1765, device='cuda:0') eval_epoch_loss=tensor(0.1625, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.83it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.99it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=10: train_ppl=tensor(1.1905, device='cuda:0') train_epoch_loss=tensor(0.1744, device='cuda:0') eval_ppl=tensor(1.1539, device='cuda:0') eval_epoch_loss=tensor(0.1431, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.80it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.94it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=11: train_ppl=tensor(1.1475, 
device='cuda:0') train_epoch_loss=tensor(0.1376, device='cuda:0') eval_ppl=tensor(1.1238, device='cuda:0') eval_epoch_loss=tensor(0.1167, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.81it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.75it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=12: train_ppl=tensor(1.1099, device='cuda:0') train_epoch_loss=tensor(0.1043, device='cuda:0') eval_ppl=tensor(1.0859, device='cuda:0') eval_epoch_loss=tensor(0.0824, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.77it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.95it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=13: train_ppl=tensor(1.0798, device='cuda:0') train_epoch_loss=tensor(0.0768, device='cuda:0') eval_ppl=tensor(1.1151, device='cuda:0') eval_epoch_loss=tensor(0.1089, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.87it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.91it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=14: train_ppl=tensor(1.0622, device='cuda:0') train_epoch_loss=tensor(0.0604, device='cuda:0') eval_ppl=tensor(1.0347, device='cuda:0') eval_epoch_loss=tensor(0.0341, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.86it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.88it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=15: train_ppl=tensor(1.0188, device='cuda:0') train_epoch_loss=tensor(0.0186, device='cuda:0') eval_ppl=tensor(1.0169, device='cuda:0') eval_epoch_loss=tensor(0.0168, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + 
"text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.87it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.86it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=16: train_ppl=tensor(1.0085, device='cuda:0') train_epoch_loss=tensor(0.0085, device='cuda:0') eval_ppl=tensor(1.0047, device='cuda:0') eval_epoch_loss=tensor(0.0047, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.80it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.71it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=17: train_ppl=tensor(1.0041, device='cuda:0') train_epoch_loss=tensor(0.0041, device='cuda:0') eval_ppl=tensor(1.0013, device='cuda:0') eval_epoch_loss=tensor(0.0013, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.82it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.86it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=18: train_ppl=tensor(1.0010, device='cuda:0') train_epoch_loss=tensor(0.0010, device='cuda:0') eval_ppl=tensor(1.0010, device='cuda:0') eval_epoch_loss=tensor(0.0010, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.77it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.85it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=19: train_ppl=tensor(1.0007, device='cuda:0') train_epoch_loss=tensor(0.0007, device='cuda:0') eval_ppl=tensor(1.0005, device='cuda:0') eval_epoch_loss=tensor(0.0005, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.77it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.80it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=20: 
train_ppl=tensor(1.0004, device='cuda:0') train_epoch_loss=tensor(0.0004, device='cuda:0') eval_ppl=tensor(1.0004, device='cuda:0') eval_epoch_loss=tensor(0.0004, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.78it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.78it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=21: train_ppl=tensor(1.0003, device='cuda:0') train_epoch_loss=tensor(0.0003, device='cuda:0') eval_ppl=tensor(1.0003, device='cuda:0') eval_epoch_loss=tensor(0.0003, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.73it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.80it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=22: train_ppl=tensor(1.0003, device='cuda:0') train_epoch_loss=tensor(0.0003, device='cuda:0') eval_ppl=tensor(1.0003, device='cuda:0') eval_epoch_loss=tensor(0.0003, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.68it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.77it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=23: train_ppl=tensor(1.0003, device='cuda:0') train_epoch_loss=tensor(0.0003, device='cuda:0') eval_ppl=tensor(1.0003, device='cuda:0') eval_epoch_loss=tensor(0.0003, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.79it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.55it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=24: train_ppl=tensor(1.0003, device='cuda:0') train_epoch_loss=tensor(0.0003, device='cuda:0') eval_ppl=tensor(1.0002, device='cuda:0') eval_epoch_loss=tensor(0.0002, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.69it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.64it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=25: train_ppl=tensor(1.0002, device='cuda:0') train_epoch_loss=tensor(0.0002, device='cuda:0') eval_ppl=tensor(1.0002, device='cuda:0') eval_epoch_loss=tensor(0.0002, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.66it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.64it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=26: train_ppl=tensor(1.0002, device='cuda:0') train_epoch_loss=tensor(0.0002, device='cuda:0') eval_ppl=tensor(1.0002, device='cuda:0') eval_epoch_loss=tensor(0.0002, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.75it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.75it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=27: train_ppl=tensor(1.0002, device='cuda:0') train_epoch_loss=tensor(0.0002, device='cuda:0') eval_ppl=tensor(1.0002, device='cuda:0') eval_epoch_loss=tensor(0.0002, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.76it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.61it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=28: train_ppl=tensor(1.0002, device='cuda:0') train_epoch_loss=tensor(0.0002, device='cuda:0') eval_ppl=tensor(1.0002, device='cuda:0') eval_epoch_loss=tensor(0.0002, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.73it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.67it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"epoch=29: train_ppl=tensor(1.0002, device='cuda:0') train_epoch_loss=tensor(0.0002, device='cuda:0') eval_ppl=tensor(1.0002, device='cuda:0') eval_epoch_loss=tensor(0.0002, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.73it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.56it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=30: train_ppl=tensor(1.0002, device='cuda:0') train_epoch_loss=tensor(0.0002, device='cuda:0') eval_ppl=tensor(1.0002, device='cuda:0') eval_epoch_loss=tensor(0.0002, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.72it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.62it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=31: train_ppl=tensor(1.0002, device='cuda:0') train_epoch_loss=tensor(0.0002, device='cuda:0') eval_ppl=tensor(1.0002, device='cuda:0') eval_epoch_loss=tensor(0.0002, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.75it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.71it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=32: train_ppl=tensor(1.0002, device='cuda:0') train_epoch_loss=tensor(0.0002, device='cuda:0') eval_ppl=tensor(1.0002, device='cuda:0') eval_epoch_loss=tensor(0.0002, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.72it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.64it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=33: train_ppl=tensor(1.0002, device='cuda:0') train_epoch_loss=tensor(0.0002, device='cuda:0') eval_ppl=tensor(1.0002, device='cuda:0') eval_epoch_loss=tensor(0.0002, device='cuda:0')\n" + ] + }, + { + "name": 
"stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.72it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.64it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=34: train_ppl=tensor(1.0002, device='cuda:0') train_epoch_loss=tensor(0.0002, device='cuda:0') eval_ppl=tensor(1.0001, device='cuda:0') eval_epoch_loss=tensor(0.0001, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.70it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.58it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=35: train_ppl=tensor(1.0001, device='cuda:0') train_epoch_loss=tensor(0.0001, device='cuda:0') eval_ppl=tensor(1.0001, device='cuda:0') eval_epoch_loss=tensor(0.0001, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.68it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.41it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=36: train_ppl=tensor(1.0001, device='cuda:0') train_epoch_loss=tensor(0.0001, device='cuda:0') eval_ppl=tensor(1.0001, device='cuda:0') eval_epoch_loss=tensor(0.0001, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.72it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.62it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=37: train_ppl=tensor(1.0001, device='cuda:0') train_epoch_loss=tensor(0.0001, device='cuda:0') eval_ppl=tensor(1.0001, device='cuda:0') eval_epoch_loss=tensor(0.0001, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.64it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.58it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "epoch=38: train_ppl=tensor(1.0002, device='cuda:0') train_epoch_loss=tensor(0.0002, device='cuda:0') eval_ppl=tensor(1.0001, device='cuda:0') eval_epoch_loss=tensor(0.0001, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.66it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.65it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=39: train_ppl=tensor(1.0001, device='cuda:0') train_epoch_loss=tensor(0.0001, device='cuda:0') eval_ppl=tensor(1.0001, device='cuda:0') eval_epoch_loss=tensor(0.0001, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.71it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.57it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=40: train_ppl=tensor(1.0001, device='cuda:0') train_epoch_loss=tensor(0.0001, device='cuda:0') eval_ppl=tensor(1.0001, device='cuda:0') eval_epoch_loss=tensor(0.0001, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.64it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.55it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=41: train_ppl=tensor(1.0001, device='cuda:0') train_epoch_loss=tensor(0.0001, device='cuda:0') eval_ppl=tensor(1.0001, device='cuda:0') eval_epoch_loss=tensor(0.0001, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.64it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.59it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=42: train_ppl=tensor(1.0001, device='cuda:0') train_epoch_loss=tensor(0.0001, device='cuda:0') eval_ppl=tensor(1.0001, device='cuda:0') eval_epoch_loss=tensor(0.0001, device='cuda:0')\n" + ] + }, + { 
+ "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.68it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.46it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=43: train_ppl=tensor(1.0001, device='cuda:0') train_epoch_loss=tensor(0.0001, device='cuda:0') eval_ppl=tensor(1.0001, device='cuda:0') eval_epoch_loss=tensor(0.0001, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.70it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.54it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=44: train_ppl=tensor(1.0001, device='cuda:0') train_epoch_loss=tensor(0.0001, device='cuda:0') eval_ppl=tensor(1.0001, device='cuda:0') eval_epoch_loss=tensor(0.0001, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.67it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.61it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=45: train_ppl=tensor(1.0001, device='cuda:0') train_epoch_loss=tensor(0.0001, device='cuda:0') eval_ppl=tensor(1.0001, device='cuda:0') eval_epoch_loss=tensor(0.0001, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.69it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.49it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=46: train_ppl=tensor(1.0001, device='cuda:0') train_epoch_loss=tensor(0.0001, device='cuda:0') eval_ppl=tensor(1.0001, device='cuda:0') eval_epoch_loss=tensor(0.0001, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.66it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.39it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "epoch=47: train_ppl=tensor(1.0001, device='cuda:0') train_epoch_loss=tensor(0.0001, device='cuda:0') eval_ppl=tensor(1.0001, device='cuda:0') eval_epoch_loss=tensor(0.0001, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.60it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.50it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=48: train_ppl=tensor(1.0001, device='cuda:0') train_epoch_loss=tensor(0.0001, device='cuda:0') eval_ppl=tensor(1.0001, device='cuda:0') eval_epoch_loss=tensor(0.0001, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 7/7 [00:00<00:00, 10.62it/s]\n", + "100%|██████████| 7/7 [00:00<00:00, 22.52it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=49: train_ppl=tensor(1.0001, device='cuda:0') train_epoch_loss=tensor(0.0001, device='cuda:0') eval_ppl=tensor(1.0001, device='cuda:0') eval_epoch_loss=tensor(0.0001, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# 3. 
train the LM on the training dataset\n", + "model = model.to(device)\n", + "\n", + "for epoch in range(num_epochs):\n", + " model.train()\n", + " total_loss = 0\n", + " for step, batch in enumerate(tqdm(train_dataloader)):\n", + " batch = {k: v.to(device) for k, v in batch.items()}\n", + " # print(batch)\n", + " # print(batch[\"input_ids\"].shape)\n", + " outputs = model(**batch)\n", + " loss = outputs.loss\n", + " total_loss += loss.detach().float()\n", + " loss.backward()\n", + " optimizer.step()\n", + " lr_scheduler.step()\n", + " optimizer.zero_grad()\n", + "\n", + " model.eval()\n", + " eval_loss = 0\n", + " eval_preds = []\n", + " for step, batch in enumerate(tqdm(eval_dataloader)):\n", + " batch = {k: v.to(device) for k, v in batch.items()}\n", + " with torch.no_grad():\n", + " outputs = model(**batch)\n", + " loss = outputs.loss\n", + " eval_loss += loss.detach().float()\n", + " eval_preds.extend(\n", + " tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)\n", + " )\n", + "\n", + " eval_epoch_loss = eval_loss / len(eval_dataloader)\n", + " eval_ppl = torch.exp(eval_epoch_loss)\n", + " train_epoch_loss = total_loss / len(train_dataloader)\n", + " train_ppl = torch.exp(train_epoch_loss)\n", + " print(f\"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}\")" + ] + }, + { + "cell_type": "markdown", + "id": "fbf339a2", + "metadata": {}, + "source": [ + "## Test the LM" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "53752a7b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "@TommyHilfiger Dramatic shopping exp. 
ordered 6 jeans same size (30/32) 2 fits / 2 too large / 2 too slim : same brand > different sizing\n", + "{'input_ids': tensor([[227985, 5484, 915, 2566, 226154, 126015, 5385, 259, 239364,\n", + " 3396, 70823, 5853, 17, 57247, 1231, 191040, 5025, 7869,\n", + " 375, 2324, 149349, 12, 415, 122321, 897, 415, 10136,\n", + " 10021, 897, 415, 10136, 6497, 381, 915, 5025, 51950,\n", + " 66869, 5955, 272, 20311, 77658, 915, 210]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n", + "tensor([[227985, 5484, 915, 2566, 226154, 126015, 5385, 259, 239364,\n", + " 3396, 70823, 5853, 17, 57247, 1231, 191040, 5025, 7869,\n", + " 375, 2324, 149349, 12, 415, 122321, 897, 415, 10136,\n", + " 10021, 897, 415, 10136, 6497, 381, 915, 5025, 51950,\n", + " 66869, 5955, 272, 20311, 77658, 915, 210, 1936, 106863,\n", + " 2, 1936, 106863, 2, 1936, 106863, 2, 1936]],\n", + " device='cuda:0')\n", + "['Tweet text : @TommyHilfiger Dramatic shopping exp. 
ordered 6 jeans same size (30/32) 2 fits / 2 too large / 2 too slim : same brand > different sizing Label : no complaintno complaintno complaintno']\n" + ] + } + ], + "source": [ + "model.eval()\n", + "i = 33\n", + "inputs = tokenizer(f'{text_column} : {dataset[\"test\"][i][\"Tweet text\"]} Label : ', return_tensors=\"pt\")\n", + "print(dataset[\"test\"][i][\"Tweet text\"])\n", + "print(inputs)\n", + "\n", + "with torch.no_grad():\n", + " inputs = {k: v.to(device) for k, v in inputs.items()}\n", + " outputs = model.generate(\n", + " input_ids=inputs[\"input_ids\"], attention_mask=inputs[\"attention_mask\"], max_new_tokens=10, eos_token_id=3\n", + " )\n", + " print(outputs)\n", + " print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))" + ] + }, + { + "cell_type": "markdown", + "id": "c8f35152", + "metadata": {}, + "source": [ + "## Save the trainable LM weights (LayerNorm layers)\n", + "You can push model to hub or save model locally. \n", + "\n", + "- Option1: Push the model to Hugging Face Hub:\n", + "\n", + " ```python\n", + " model.push_to_hub(\n", + " f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\"/\", \"_\"),\n", + " token = \"hf_...\"\n", + " )\n", + " ```\n", + " token (`bool` or `str`, *optional*):\n", + " `token` is to be used for HTTP Bearer authorization when accessing remote files. If `True`, will use the token generated\n", + " when running `huggingface-cli login` (stored in `~/.huggingface`). 
Will default to `True` if `repo_url`\n", + " is not specified.\n", + " Or you can get your token from https://huggingface.co/settings/token\n", + " ```\n", + "- Option2: Save model locally:\n", + "\n", + " ```python\n", + " peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\"/\", \"_\")\n", + " model.save_pretrained(peft_model_id)\n", + " ```" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d8ba1f8c", + "metadata": {}, + "outputs": [], + "source": [ + "# saving model\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\n", + " \"/\", \"_\"\n", + ")\n", + "model.save_pretrained(peft_model_id)" + ] + }, + { + "cell_type": "markdown", + "id": "4dd7ab9c", + "metadata": {}, + "source": [ + "## Test the LM using LNTuning loaded from saved weights\n", + "1. load the LNTuning configuration\n", + "2. load the base LM\n", + "3. merge the LNTuning weights into the base LM using the PEFT config" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "4d9476e1", + "metadata": {}, + "outputs": [], + "source": [ + "from peft import PeftModel, PeftConfig\n", + "\n", + "# load the LNTuning config\n", + "config = PeftConfig.from_pretrained(peft_model_id)\n", + "# load the base LM\n", + "model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)\n", + "# merge LNTuning weights into the base LM\n", + "model = PeftModel.from_pretrained(model, peft_model_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "ebe174a6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "@greateranglia Ok thanks...\n", + "{'input_ids': tensor([[227985, 5484, 915, 2566, 14173, 2960, 29906, 387, 20706,\n", + " 49337, 1369, 77658, 915, 210]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n", + "tensor([[227985, 5484, 915, 2566, 
14173, 2960, 29906, 387, 20706,\n", + " 49337, 1369, 77658, 915, 210, 1936, 106863, 2, 1936,\n", + " 106863, 2, 1936, 106863, 2, 1936]], device='cuda:0')\n", + "['Tweet text : @greateranglia Ok thanks... Label : no complaintno complaintno complaintno']\n" + ] + } + ], + "source": [ + "model.to(device)\n", + "model.eval()\n", + "i = 4\n", + "inputs = tokenizer(f'{text_column} : {dataset[\"test\"][i][\"Tweet text\"]} Label : ', return_tensors=\"pt\")\n", + "print(dataset[\"test\"][i][\"Tweet text\"])\n", + "print(inputs)\n", + "\n", + "with torch.no_grad():\n", + " inputs = {k: v.to(device) for k, v in inputs.items()}\n", + " outputs = model.generate(\n", + " input_ids=inputs[\"input_ids\"], attention_mask=inputs[\"attention_mask\"], max_new_tokens=10, eos_token_id=3\n", + " )\n", + " print(outputs)\n", + " print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/causal_language_modeling/peft_lora_clm_accelerate_ds_zero3_offload.py b/peft/examples/causal_language_modeling/peft_lora_clm_accelerate_ds_zero3_offload.py new file mode 100644 index 0000000000000000000000000000000000000000..d908530f97f222df200e37ce60fa70c0fca9bfaf --- /dev/null +++ b/peft/examples/causal_language_modeling/peft_lora_clm_accelerate_ds_zero3_offload.py @@ -0,0 +1,381 @@ +import gc +import os +import sys +import threading + +import psutil +import torch +from accelerate import Accelerator 
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    default_data_collator,
    get_linear_schedule_with_warmup,
    set_seed,
)

from peft import LoraConfig, TaskType, get_peft_model


def levenshtein_distance(str1, str2):
    """Return the Levenshtein (edit) distance between ``str1`` and ``str2``.

    Only one row of the DP table is kept, so the cost is
    O(len(str1) * len(str2)) time and O(len(str2)) extra space.
    """
    if str1 == str2:
        return 0
    num_rows = len(str1) + 1
    num_cols = len(str2) + 1
    # dp_matrix[j] holds the distance between the processed prefix of str1 and str2[:j].
    dp_matrix = list(range(num_cols))
    for i in range(1, num_rows):
        # `prev` carries the diagonal (previous row, previous column) value.
        prev = dp_matrix[0]
        dp_matrix[0] = i
        for j in range(1, num_cols):
            temp = dp_matrix[j]
            if str1[i - 1] == str2[j - 1]:
                dp_matrix[j] = prev
            else:
                dp_matrix[j] = min(prev, dp_matrix[j], dp_matrix[j - 1]) + 1
            prev = temp
    return dp_matrix[num_cols - 1]


def get_closest_label(eval_pred, classes):
    """Return the entry of ``classes`` closest to ``eval_pred`` by edit distance.

    ``eval_pred`` is stripped of surrounding whitespace before comparison.
    Ties are resolved in favour of the earliest entry in ``classes``.

    Raises:
        ValueError: if ``classes`` is empty.
    """
    # Strip once instead of once per candidate label.
    prediction = eval_pred.strip()
    return min(classes, key=lambda class_label: levenshtein_distance(prediction, class_label))


# Converting Bytes to Megabytes
def b2mb(x):
    """Convert a byte count to whole megabytes (2**20 bytes), truncating toward zero."""
    return int(x / 2**20)


# This context manager is used to track the peak memory usage of the process
class TorchTracemalloc:
    """Record accelerator and CPU memory usage of the code run inside the ``with`` block.

    Available after the block exits:
        begin, end, peak: accelerator memory in bytes (allocated at entry/exit, max allocated).
        used, peaked: ``end - begin`` and ``peak - begin`` in MB.
        cpu_begin, cpu_end, cpu_peak: process resident set size in bytes.
        cpu_used, cpu_peaked: ``cpu_end - cpu_begin`` and ``cpu_peak - cpu_begin`` in MB.
    """

    def __enter__(self):
        gc.collect()
        # `torch.accelerator` is not present on every PyTorch version, and
        # `current_accelerator()` can return None; fall back to CUDA in both cases.
        current = torch.accelerator.current_accelerator() if hasattr(torch, "accelerator") else None
        self.device_type = current.type if current is not None else "cuda"
        self.device_module = getattr(torch, self.device_type, torch.cuda)
        self.device_module.empty_cache()
        self.device_module.reset_peak_memory_stats()  # reset the peak gauge to zero
        self.begin = self.device_module.memory_allocated()
        self.process = psutil.Process()

        self.cpu_begin = self.cpu_mem_used()
        # Initialised here, before the thread starts, so __exit__ can always read it
        # even if the monitor thread has not been scheduled yet.
        self.cpu_peak = -1
        self.peak_monitoring = True
        peak_monitor_thread = threading.Thread(target=self.peak_monitor_func)
        peak_monitor_thread.daemon = True
        peak_monitor_thread.start()
        return self

    def cpu_mem_used(self):
        """Return the resident set size (bytes) of the current process."""
        return self.process.memory_info().rss

    def peak_monitor_func(self):
        """Busy-poll the process RSS and keep the maximum until monitoring is switched off."""
        while True:
            self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak)

            # can't sleep or will not catch the peak right (this comment is here on purpose)
            # time.sleep(0.001) # 1msec

            if not self.peak_monitoring:
                break

    def __exit__(self, *exc):
        # Signals the monitor thread to stop after its current iteration.
        self.peak_monitoring = False

        gc.collect()
        self.device_module.empty_cache()
        self.end = self.device_module.memory_allocated()
        self.peak = self.device_module.max_memory_allocated()
        self.used = b2mb(self.end - self.begin)
        self.peaked = b2mb(self.peak - self.begin)

        self.cpu_end = self.cpu_mem_used()
        self.cpu_used = b2mb(self.cpu_end - self.cpu_begin)
        self.cpu_peaked = b2mb(self.cpu_peak - self.cpu_begin)
        # print(f"delta used/peak {self.used:4d}/{self.peaked:4d}")


def _print_memory_report(accelerator, tracker, phase):
    """Print the memory figures (in MB) that ``tracker`` recorded for ``phase`` ("train" or "eval")."""
    device = accelerator.device.type.upper()
    accelerator.print(f"{device} Memory before entering the {phase} : {b2mb(tracker.begin)}")
    accelerator.print(f"{device} Memory consumed at the end of the {phase} (end-begin): {tracker.used}")
    accelerator.print(f"{device} Peak Memory consumed during the {phase} (max-begin): {tracker.peaked}")
    accelerator.print(
        f"{device} Total Peak Memory consumed during the {phase} (max): {tracker.peaked + b2mb(tracker.begin)}"
    )

    accelerator.print(f"CPU Memory before entering the {phase} : {b2mb(tracker.cpu_begin)}")
    accelerator.print(f"CPU Memory consumed at the end of the {phase} (end-begin): {tracker.cpu_used}")
    accelerator.print(f"CPU Peak Memory consumed during the {phase} (max-begin): {tracker.cpu_peaked}")
    accelerator.print(
        f"CPU Total Peak Memory consumed during the {phase} (max): {tracker.cpu_peaked + b2mb(tracker.cpu_begin)}"
    )


def main():
    """Fine-tune ``bigscience/bloomz-7b1`` with LoRA on the ``twitter_complaints`` data via Accelerate.

    Each epoch trains on the tokenized train split, then generates labels for the
    same split and reports exact-match accuracy, printing accelerator/CPU memory
    usage for both phases. When ``do_test`` is enabled, test-split predictions are
    snapped to the closest class name and written to ``data/<dataset>/predictions.csv``.
    The adapter is saved locally at the end.
    """
    accelerator = Accelerator()
    model_name_or_path = "bigscience/bloomz-7b1"
    dataset_name = "twitter_complaints"
    peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
    text_column = "Tweet text"
    label_column = "text_label"
    lr = 3e-3
    num_epochs = 20
    batch_size = 8
    seed = 42
    max_length = 64
    do_test = False
    set_seed(seed)

    dataset = load_dataset(
        "parquet",
        data_files={
            "train": f"hf://datasets/ought/raft@refs/convert/parquet/{dataset_name}/train/0000.parquet",
            "test": f"hf://datasets/ought/raft@refs/convert/parquet/{dataset_name}/test/0000.parquet",
        },
    )
    classes = [k.replace("_", " ") for k in dataset["train"].features["Label"].names]
    dataset = dataset.map(
        lambda x: {"text_label": [classes[label] for label in x["Label"]]},
        batched=True,
        num_proc=1,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    def preprocess_function(examples):
        """Tokenize prompt + label, mask the prompt in the labels, left-pad and cut to ``max_length``."""
        num_examples = len(examples[text_column])
        inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
        targets = [str(x) for x in examples[label_column]]
        model_inputs = tokenizer(inputs)
        labels = tokenizer(targets, add_special_tokens=False)  # don't add bos token because we concatenate with inputs
        for i in range(num_examples):
            sample_input_ids = model_inputs["input_ids"][i]
            label_input_ids = labels["input_ids"][i] + [tokenizer.eos_token_id]
            model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
            # Prompt positions get -100 (the conventional "ignore" label value) so only
            # the label tokens contribute to the loss.
            labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
            model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])
        for i in range(num_examples):
            sample_input_ids = model_inputs["input_ids"][i]
            label_input_ids = labels["input_ids"][i]
            # A non-positive pad length yields an empty padding list; the slice below
            # then truncates over-long samples from the right.
            pad_length = max_length - len(sample_input_ids)
            model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * pad_length + sample_input_ids
            model_inputs["attention_mask"][i] = [0] * pad_length + model_inputs["attention_mask"][i]
            labels["input_ids"][i] = [-100] * pad_length + label_input_ids
            model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
            model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
            labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def test_preprocess_function(examples):
        """Tokenize the prompt only (no label), left-pad and cut to ``max_length``."""
        num_examples = len(examples[text_column])
        inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
        model_inputs = tokenizer(inputs)
        for i in range(num_examples):
            sample_input_ids = model_inputs["input_ids"][i]
            pad_length = max_length - len(sample_input_ids)
            model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * pad_length + sample_input_ids
            model_inputs["attention_mask"][i] = [0] * pad_length + model_inputs["attention_mask"][i]
            model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
            model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
        return model_inputs

    with accelerator.main_process_first():
        processed_datasets = dataset.map(
            preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=dataset["train"].column_names,
            load_from_cache_file=True,
            desc="Running tokenizer on dataset",
        )
    accelerator.wait_for_everyone()

    train_dataset = processed_datasets["train"]

    with accelerator.main_process_first():
        processed_datasets = dataset.map(
            test_preprocess_function,
            batched=True,
            num_proc=1,
            remove_columns=dataset["train"].column_names,
            load_from_cache_file=False,
            desc="Running tokenizer on dataset",
        )
    # Evaluation generates from the prompt-only version of the *train* split.
    eval_dataset = processed_datasets["train"]
    test_dataset = processed_datasets["test"]

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
    )
    eval_dataloader = DataLoader(
        eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
    )
    test_dataloader = DataLoader(
        test_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
    )

    print(next(iter(train_dataloader)))

    # creating model
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    # lr scheduler
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=(len(train_dataloader) * num_epochs),
    )

    model, train_dataloader, eval_dataloader, test_dataloader, optimizer, lr_scheduler = accelerator.prepare(
        model, train_dataloader, eval_dataloader, test_dataloader, optimizer, lr_scheduler
    )
    accelerator.print(model)

    is_ds_zero_3 = False
    if getattr(accelerator.state, "deepspeed_plugin", None):
        is_ds_zero_3 = accelerator.state.deepspeed_plugin.zero_stage == 3

    for epoch in range(num_epochs):
        with TorchTracemalloc() as tracemalloc:
            model.train()
            total_loss = 0
            for batch in tqdm(train_dataloader):
                outputs = model(**batch)
                loss = outputs.loss
                total_loss += loss.detach().float()
                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
        # Printing the memory usage details such as allocated memory, peak memory, and total memory usage
        # (the figures are only populated once the context manager has exited).
        _print_memory_report(accelerator, tracemalloc, "train")
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        accelerator.print(f"{epoch=}: {train_ppl=} {train_epoch_loss=}")

        model.eval()
        eval_preds = []
        with TorchTracemalloc() as tracemalloc:
            for batch in tqdm(eval_dataloader):
                batch = {k: v for k, v in batch.items() if k != "labels"}
                with torch.no_grad():
                    outputs = accelerator.unwrap_model(model).generate(
                        **batch, synced_gpus=is_ds_zero_3, max_new_tokens=10
                    )  # synced_gpus=True for DS-stage 3
                outputs = accelerator.pad_across_processes(outputs, dim=1, pad_index=tokenizer.pad_token_id)
                preds = accelerator.gather_for_metrics(outputs)
                # Inputs are padded/truncated to exactly max_length, so everything
                # after that offset is newly generated text.
                preds = preds[:, max_length:].detach().cpu().numpy()
                eval_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))

        # Printing the memory usage details such as allocated memory, peak memory, and total memory usage
        _print_memory_report(accelerator, tracemalloc, "eval")

        true_labels = dataset["train"][label_column]
        assert len(eval_preds) == len(true_labels), f"{len(eval_preds)} != {len(true_labels)}"
        matches = [pred.strip() == true.strip() for pred, true in zip(eval_preds, true_labels)]
        accuracy = sum(matches) / len(matches) * 100
        accelerator.print(f"{accuracy=}")
        accelerator.print(f"{eval_preds[:10]=}")
        accelerator.print(f"{dataset['train'][label_column][:10]=}")

    if do_test:
        model.eval()
        test_preds = []
        for batch in tqdm(test_dataloader):
            batch = {k: v for k, v in batch.items() if k != "labels"}
            with torch.no_grad():
                outputs = accelerator.unwrap_model(model).generate(
                    **batch, synced_gpus=is_ds_zero_3, max_new_tokens=10
                )  # synced_gpus=True for DS-stage 3
            outputs = accelerator.pad_across_processes(outputs, dim=1, pad_index=tokenizer.pad_token_id)
            # gather_for_metrics (as in the eval loop) drops the samples that are
            # duplicated to fill the last batch across processes; a plain gather keeps
            # them and makes the length check against the test set fail.
            preds = accelerator.gather_for_metrics(outputs)
            preds = preds[:, max_length:].detach().cpu().numpy()
            test_preds.extend(tokenizer.batch_decode(preds, skip_special_tokens=True))

        # Snap each free-form generation to the nearest known class name.
        test_preds_cleaned = [get_closest_label(pred, classes) for pred in test_preds]

        test_df = dataset["test"].to_pandas()
        assert len(test_preds_cleaned) == len(test_df), f"{len(test_preds_cleaned)} != {len(test_df)}"
        test_df[label_column] = test_preds_cleaned
        test_df["text_labels_orig"] = test_preds
        accelerator.print(test_df[[text_column, label_column]].sample(20))

        pred_df = test_df[["ID", label_column]]
        pred_df.columns = ["ID", "Label"]

        os.makedirs(f"data/{dataset_name}", exist_ok=True)
        pred_df.to_csv(f"data/{dataset_name}/predictions.csv", index=False)

    accelerator.wait_for_everyone()
    # Option1: Pushing the model to Hugging Face Hub
    # model.push_to_hub(
    #     f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace("/", "_"),
    #     token = "hf_..."
    # )
    # token (`bool` or `str`, *optional*):
    #     `token` is to be used for HTTP Bearer authorization when accessing remote files. If `True`, will use the token generated
    #     when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url`
    #     is not specified.
    #     Or you can get your token from https://huggingface.co/settings/token
    # Option2: Saving the model locally
    peft_model_id = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}".replace(
        "/", "_"
    )
    model.save_pretrained(peft_model_id)
    accelerator.wait_for_everyone()


if __name__ == "__main__":
    main()
PEFT will take care of saving the embedding layers with the newly added tokens along with the adapter weights that were trained on the specific initialization of the embedding weights of the added tokens." + ] + }, + { + "cell_type": "markdown", + "id": "b27c55e8-edaa-4059-90bc-d6096d596902", + "metadata": {}, + "source": [ + "Let's import the necessary libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f864c90", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"WANDB_PROJECT\"] = \"PeftExamples\"\n", + "import transformers\n", + "from peft import (\n", + " LoraConfig,\n", + " PeftConfig,\n", + " PeftModel,\n", + " get_peft_model,\n", + " prepare_model_for_kbit_training,\n", + ")\n", + "from transformers import (\n", + " AutoModelForCausalLM,\n", + " AutoTokenizer,\n", + " HfArgumentParser,\n", + " TrainingArguments,\n", + " Trainer,\n", + " default_data_collator,\n", + ")\n", + "import torch\n", + "from dataclasses import dataclass, field\n", + "from typing import Optional\n", + "from dataclass_csv import DataclassReader\n", + "from torch.utils.data import Dataset, DataLoader\n", + "\n", + "from enum import Enum" + ] + }, + { + "cell_type": "markdown", + "id": "74950a3f-bb63-4ce5-9e2b-1b83f92b13a2", + "metadata": {}, + "source": [ + "## Prepare Model and Tokenizer" + ] + }, + { + "cell_type": "markdown", + "id": "76763f5e-64b2-409b-8845-ae5589f8a4e0", + "metadata": {}, + "source": [ + "Now, we will be adding 27 new tokens as well as replacing the existing pad, bos and eos tokens of the model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fd0498ea-547e-418d-bf13-c9abafdd5476", + "metadata": {}, + "outputs": [], + "source": [ + "class SpecialTokens(str, Enum):\n", + " begin_target = \"<|begintarget|>\"\n", + " end_target = \"<|endtarget|>\"\n", + " begin_context = \"<|begincontext|>\"\n", + " end_context = \"<|endcontext|>\"\n", + " system = \"<|system|>\"\n", + " user = \"<|user|>\"\n", + " begin_last_user_utterance = \"<|beginlastuserutterance|>\"\n", + " end_last_user_utterance = \"<|endlastuserutterance|>\"\n", + " begin_dsts = \"<|begindsts|>\"\n", + " end_dsts = \"<|enddsts|>\"\n", + " begin_dst = \"<|begindst|>\"\n", + " end_dst = \"<|enddst|>\"\n", + " begin_belief = \"<|beginbelief|>\"\n", + " end_belief = \"<|endbelief|>\"\n", + " begin_response = \"<|beginresponse|>\"\n", + " end_response = \"<|endresponse|>\"\n", + " begin_action = \"<|beginaction|>\"\n", + " end_action = \"<|endaction|>\"\n", + " begin_user_action = \"<|beginuseraction|>\"\n", + " end_user_action = \"<|enduseraction|>\"\n", + " sys_actions = \"<|sysactions|>\"\n", + " begin_intent = \"<|beginintent|>\"\n", + " end_intent = \"<|endintent|>\"\n", + " begin_requested_slots = \"<|beginrequestedslots|>\"\n", + " end_requested_slots = \"<|endrequestedslots|>\"\n", + " pad_token = \"<|pad|>\"\n", + " bos_token = \"<|startoftext|>\"\n", + "\n", + " @classmethod\n", + " def list(cls):\n", + " return [c.value for c in cls]" + ] + }, + { + "cell_type": "markdown", + "id": "ae4a4255-5f13-4eef-a024-4f1de0f2173b", + "metadata": {}, + "source": [ + "We will be finetuning the Mistral-7B model. Let's load the tokenizer and add the special tokens followed by loading the base model and resizing the embedding layers to accommodate the newly added tokens." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f0eedef9", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "91c67b6377fc4dd7977bf544de784d51", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/2 [00:00<|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|
pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|startoftext|><|begincontext|><|user|> Can you find me place to eat?<|system|> What kind of food would you like to have and where would you like me to search in?<|user|> Food kind of California will be perfect in SF.<|system|> There are 10 restaurants, Al's Place is one of the good restaurant in San Francisco.<|user|> Can you look for any other restaurant?<|system|> Alta Msp is one of the good restaurant in San Francisco.<|beginlastuserutterance|> Can you find me the address?<|endlastuserutterance|><|endcontext|><|begintarget|><|begindsts|><|begindst|><|beginintent|> FindRestaurants<|endintent|><|beginrequestedslots|> Restaurants^street_address<|endrequestedslots|><|beginbelief|> Restaurants^city->SF~San Francisco|Restaurants^cuisine->California<|endbelief|><|enddst|><|enddsts|><|beginuseraction|> REQUEST->Restaurants^street_address~<|enduseraction|><|beginaction|> INFORM->Restaurants^street_address~1275 Minnesota Street<|endaction|><|beginresponse|> The street address of the restaurant is 1275 Minnesota Street.<|endresponse|><|endtarget|><|endtarget|>\"" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.decode(train_dataset[0][\"input_ids\"])" + ] + }, + { + "cell_type": "markdown", + "id": "239d1c83-196d-471e-9bf7-5f36dafa9894", + "metadata": {}, + "source": [ + "# Train the model" + ] + }, + { + "cell_type": "code", + "execution_count": 
10, + "id": "ec80d6ee", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n", + "Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33msmangrul\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n" + ] + }, + { + "data": { + "text/html": [ + "Tracking run with wandb version 0.16.0" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Run data is saved locally in /raid/sourab/temp/wandb/run-20231128_230934-edod21gq" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Syncing run ethereal-eon-1 to Weights & Biases (docs)
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View project at https://wandb.ai/smangrul/PeftExamples" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + " View run at https://wandb.ai/smangrul/PeftExamples/runs/edod21gq" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [246/246 05:51, Epoch 2/2]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
105.189800
203.745500
302.371500
401.630200
501.302600
600.999400
700.704100
800.527800
900.509700
1000.382300
1100.318200
1200.323500
1300.263400
1400.290900
1500.277400
1600.232800
1700.223600
1800.229600
1900.233100
2000.210200
2100.245800
2200.197300
2300.210100
2400.209800

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=246, training_loss=0.8516577879587809, metrics={'train_runtime': 354.9013, 'train_samples_per_second': 5.556, 'train_steps_per_second': 0.693, 'total_flos': 4.318233532091597e+16, 'train_loss': 0.8516577879587809, 'epoch': 2.0})" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_args = TrainingArguments(\n", + " output_dir=\"mistral_lora_clm_with_added_tokens\",\n", + " num_train_epochs=2,\n", + " save_total_limit=5,\n", + " per_device_train_batch_size=8,\n", + " warmup_steps=10,\n", + " weight_decay=0.0001,\n", + " dataloader_drop_last=True,\n", + " bf16=True,\n", + " logging_steps=10,\n", + " learning_rate=1e-5,\n", + " gradient_checkpointing=True,\n", + " gradient_checkpointing_kwargs={\"use_reentrant\": False},\n", + " remove_unused_columns=False,\n", + " hub_model_id=\"smangrul/mistral_lora_clm_with_added_tokens\",\n", + " push_to_hub=True,\n", + " hub_private_repo=True,\n", + ")\n", + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " data_collator=default_data_collator,\n", + ")\n", + "# model.config.use_cache = False\n", + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "id": "7bc1cbed-4eb9-4aaa-ab5f-5b91bf432307", + "metadata": {}, + "source": [ + "# Check the model output on a sample from evaluation dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71851793", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "context=\"<|begincontext|><|user|>Can you find me a place to eat please?<|system|>Where at? 
And what kind of cuisine are you craving?<|user|>Somewhere in SF, and I am really craving Thai food at the moment!<|system|>I found a bunch of restaurants, there's actually 10 that you might like in San Francisco, one of them being Baan Thai House & Wine Bar<|user|>How can I reach them? And what's their address?<|system|>You can reach them by phone at 415-379-4505 and visit them at 534 Irving Street<|beginlastuserutterance|>Great, that restaurant sounds good<|endlastuserutterance|><|endcontext|>\" \n", + "\n", + " target_predicted='<|begintarget|><|begindsts|><|begindst|><|beginintent|> FindRestaurants<|endintent|><|beginbelief|> Restaurants^city->SF~San Francisco|Restaurants^cuisine->Thai|Restaurants^restaurant_name->Baan Thai House & Wine Bar<|endbelief|><|enddst|><|enddsts|><|beginuseraction|> REQUEST->Restaurants^phone_number~|REQUEST->Restaurants^street_address~<|enduseraction|><|beginaction|> INFORM->Restaurants^phone_number~415-379-4505|INFORM->Restaurants^street_address~534 Irving Street<|endaction|><|beginresponse|> Great, the phone number is 415-379-4505 and the address is 534 Irving Street<|endresponse|><|endtarget|>' \n", + "\n", + " target='<|begintarget|><|begindsts|><|begindst|><|beginintent|>FindRestaurants<|endintent|><|beginbelief|>Restaurants^city->SF~San Francisco|Restaurants^cuisine->Thai|Restaurants^restaurant_name->Baan Thai House & Wine Bar<|endbelief|><|enddst|><|enddsts|><|beginuseraction|>SELECT->Restaurants^~<|enduseraction|><|beginaction|>OFFER_INTENT->Restaurants^intent~ReserveRestaurant<|endaction|><|beginresponse|>Want me to book a table?<|endresponse|><|endtarget|>'\n" + ] + } + ], + "source": [ + "import random\n", + "\n", + "i = random.randint(0, len(dataset[\"test\"]))\n", + "context = dataset[\"test\"][i][\"context\"]\n", + "\n", + "batch = tokenizer(context, return_tensors=\"pt\")\n", + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "batch = {k: v.to(device) for k, 
v in batch.items()}\n", + "model.eval()\n", + "output_tokens = model.generate(\n", + " **batch,\n", + " max_new_tokens=256,\n", + " do_sample=True,\n", + " temperature=0.2,\n", + " top_p=0.95,\n", + " top_k=50,\n", + " eos_token_id=tokenizer.eos_token_id,\n", + " pad_token_id=tokenizer.pad_token_id,\n", + ")\n", + "target_predicted = tokenizer.decode(output_tokens[0], skip_special_tokens=False).split(\"<|endcontext|>\")[1]\n", + "target = dataset[\"test\"][i][\"target\"]\n", + "print(f\"{context=} \\n\\n {target_predicted=} \\n\\n {target=}\")" + ] + }, + { + "cell_type": "markdown", + "id": "f940a660-2f7c-4a3a-b412-3f037aedb890", + "metadata": {}, + "source": [ + "# Save the Adapter model " + ] + }, + { + "cell_type": "markdown", + "id": "7ebe05e9-9b93-42f6-bba8-46b8cc3d100f", + "metadata": {}, + "source": [ + "When the lora layers are applied to embedding layers, the corresponding base model embedding layers are also saved. " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3d7459ba-caa8-4f10-aa70-89be4541cbdf", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/raid/sourab/peft/src/peft/utils/save_and_load.py:128: UserWarning: Setting `is_embedding_layer_resized` to `True` as embedding layers found in `target_modules`\n", + " warnings.warn(\"Setting `is_embedding_layer_resized` to `True` as embedding layers found in `target_modules`\")\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8d23186832014f209939ab83e79da011", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Upload 3 LFS files: 0%| | 0/3 [00:00<|user|>Can you find me a place to eat please?<|system|>Where at? 
And what kind of cuisine are you craving?<|user|>Somewhere in SF, and I am really craving Thai food at the moment!<|system|>I found a bunch of restaurants, there's actually 10 that you might like in San Francisco, one of them being Baan Thai House & Wine Bar<|user|>How can I reach them? And what's their address?<|system|>You can reach them by phone at 415-379-4505 and visit them at 534 Irving Street<|beginlastuserutterance|>Great, that restaurant sounds good<|endlastuserutterance|><|endcontext|>\" \n", + "\n", + " target_predicted='<|begintarget|><|begindsts|><|begindst|><|beginintent|> FindRestaurant<|endintent|><|beginbelief|> Restaurants^city->SF~San Francisco|Restaurants^cuisine->Thai|Restaurants^restaurant_name->Baan Thai House & Wine Bar<|endbelief|><|enddst|><|enddsts|><|beginuseraction|> REQUEST->Restaurants^phone_number~|REQUEST->Restaurants^street_address~<|enduseraction|><|beginaction|> INFORM->Restaurants^phone_number~415-379-4505|INFORM->Restaurants^street_address~534 Irving Street<|endaction|><|beginresponse|> The phone number is 415-379-4505 and the address is 534 Irving Street<|endresponse|><|endtarget|>' \n", + "\n", + " target='<|begintarget|><|begindsts|><|begindst|><|beginintent|>FindRestaurants<|endintent|><|beginbelief|>Restaurants^city->SF~San Francisco|Restaurants^cuisine->Thai|Restaurants^restaurant_name->Baan Thai House & Wine Bar<|endbelief|><|enddst|><|enddsts|><|beginuseraction|>SELECT->Restaurants^~<|enduseraction|><|beginaction|>OFFER_INTENT->Restaurants^intent~ReserveRestaurant<|endaction|><|beginresponse|>Want me to book a table?<|endresponse|><|endtarget|>'\n" + ] + } + ], + "source": [ + "from peft import PeftModel\n", + "\n", + "inference_model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " low_cpu_mem_usage=True,\n", + " # attn_implementation =\"flash_attention_2\",\n", + ")\n", + "inference_model.resize_token_embeddings(len(tokenizer))\n", + "\n", + "inference_model = 
PeftModel.from_pretrained(inference_model, \"smangrul/mistral_lora_clm_with_added_tokens\")\n", + "inference_model.to(device)\n", + "inference_model.eval()\n", + "\n", + "output_tokens = inference_model.generate(\n", + " **batch,\n", + " max_new_tokens=256,\n", + " do_sample=True,\n", + " temperature=0.2,\n", + " top_p=0.95,\n", + " top_k=50,\n", + " eos_token_id=tokenizer.eos_token_id,\n", + " pad_token_id=tokenizer.pad_token_id,\n", + ")\n", + "\n", + "target_predicted = tokenizer.decode(output_tokens[0], skip_special_tokens=False).split(\"<|endcontext|>\")[1]\n", + "print(f\"{context=} \\n\\n {target_predicted=} \\n\\n {target=}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd57f6e8-761f-4e0b-941c-f6973e13b186", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/causal_language_modeling/peft_prefix_tuning_clm.ipynb b/peft/examples/causal_language_modeling/peft_prefix_tuning_clm.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..56e2e9fdfeb068998479370be27af66bb5cca070 --- /dev/null +++ b/peft/examples/causal_language_modeling/peft_prefix_tuning_clm.ipynb @@ -0,0 +1,1375 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "71fbfca2", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoModelForCausalLM\n", + "from peft import get_peft_config, get_peft_model, PrefixTuningConfig, TaskType, PeftType\n", + "import torch\n", + "from datasets import load_dataset\n", + "import os\n", + "from transformers 
import AutoTokenizer\n", + "from torch.utils.data import DataLoader\n", + "from transformers import default_data_collator, get_linear_schedule_with_warmup\n", + "from tqdm import tqdm\n", + "\n", + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "model_name_or_path = \"bigscience/bloomz-560m\"\n", + "tokenizer_name_or_path = \"bigscience/bloomz-560m\"\n", + "peft_config = PrefixTuningConfig(task_type=TaskType.CAUSAL_LM, num_virtual_tokens=30)\n", + "\n", + "dataset_name = \"twitter_complaints\"\n", + "checkpoint_name = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt\".replace(\n", + " \"/\", \"_\"\n", + ")\n", + "text_column = \"Tweet text\"\n", + "label_column = \"text_label\"\n", + "max_length = 64\n", + "lr = 3e-2\n", + "num_epochs = 50\n", + "batch_size = 8" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1a3648b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset raft (/home/sourab/.cache/huggingface/datasets/ought___raft/twitter_complaints/1.1.0/79c4de1312c1e3730043f7db07179c914f48403101f7124e2fe336f6f54d9f84)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "56d9908a2c8944b484348cc46b16a261", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00, base_model_name_or_path='bigscience/bloomz-560m', task_type=, inference_mode=False, num_virtual_tokens=30, token_dim=1024, num_transformer_submodules=1, num_attention_heads=16, num_layers=24, encoder_hidden_size=1024, prefix_projection=False)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.peft_config" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b2f91568", + "metadata": {}, + "outputs": [], + "source": [ + "# model\n", + "# optimizer and lr 
scheduler\n", + "optimizer = torch.optim.AdamW(model.parameters(), lr=lr)\n", + "lr_scheduler = get_linear_schedule_with_warmup(\n", + " optimizer=optimizer,\n", + " num_warmup_steps=0,\n", + " num_training_steps=(len(train_dataloader) * num_epochs),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e4fb69fc", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:01<00:00, 5.79it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.51it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=0: train_ppl=tensor(1.8325e+09, device='cuda:0') train_epoch_loss=tensor(21.3289, device='cuda:0') eval_ppl=tensor(2713.4180, device='cuda:0') eval_epoch_loss=tensor(7.9060, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.44it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.53it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=1: train_ppl=tensor(341.0600, device='cuda:0') train_epoch_loss=tensor(5.8321, device='cuda:0') eval_ppl=tensor(80.8206, device='cuda:0') eval_epoch_loss=tensor(4.3922, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.44it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.55it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "epoch=2: train_ppl=tensor(59.8778, device='cuda:0') train_epoch_loss=tensor(4.0923, device='cuda:0') eval_ppl=tensor(34.4593, device='cuda:0') eval_epoch_loss=tensor(3.5398, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.45it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.55it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=3: train_ppl=tensor(22.3307, device='cuda:0') train_epoch_loss=tensor(3.1060, device='cuda:0') eval_ppl=tensor(12.5947, device='cuda:0') eval_epoch_loss=tensor(2.5333, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.45it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.56it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=4: train_ppl=tensor(9.1697, device='cuda:0') train_epoch_loss=tensor(2.2159, device='cuda:0') eval_ppl=tensor(4.5289, device='cuda:0') eval_epoch_loss=tensor(1.5105, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.45it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.52it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=5: train_ppl=tensor(3.0172, device='cuda:0') train_epoch_loss=tensor(1.1043, device='cuda:0') eval_ppl=tensor(1.8092, device='cuda:0') 
eval_epoch_loss=tensor(0.5929, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.44it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.45it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=6: train_ppl=tensor(1.4885, device='cuda:0') train_epoch_loss=tensor(0.3978, device='cuda:0') eval_ppl=tensor(1.4449, device='cuda:0') eval_epoch_loss=tensor(0.3680, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.43it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.48it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=7: train_ppl=tensor(1.2967, device='cuda:0') train_epoch_loss=tensor(0.2598, device='cuda:0') eval_ppl=tensor(1.1587, device='cuda:0') eval_epoch_loss=tensor(0.1473, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.43it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.47it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=8: train_ppl=tensor(1.1305, device='cuda:0') train_epoch_loss=tensor(0.1227, device='cuda:0') eval_ppl=tensor(1.0874, device='cuda:0') eval_epoch_loss=tensor(0.0838, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.45it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.46it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=9: train_ppl=tensor(1.1608, device='cuda:0') train_epoch_loss=tensor(0.1491, device='cuda:0') eval_ppl=tensor(1.1461, device='cuda:0') eval_epoch_loss=tensor(0.1364, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.45it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.45it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=10: train_ppl=tensor(1.3172, device='cuda:0') train_epoch_loss=tensor(0.2755, device='cuda:0') eval_ppl=tensor(1.1320, device='cuda:0') eval_epoch_loss=tensor(0.1240, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.44it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.46it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=11: train_ppl=tensor(1.1437, device='cuda:0') train_epoch_loss=tensor(0.1343, device='cuda:0') eval_ppl=tensor(1.0676, device='cuda:0') eval_epoch_loss=tensor(0.0654, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.43it/s]\n", + 
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.43it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=12: train_ppl=tensor(1.0651, device='cuda:0') train_epoch_loss=tensor(0.0630, device='cuda:0') eval_ppl=tensor(1.0735, device='cuda:0') eval_epoch_loss=tensor(0.0710, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.46it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.47it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=13: train_ppl=tensor(1.0607, device='cuda:0') train_epoch_loss=tensor(0.0589, device='cuda:0') eval_ppl=tensor(1.0399, device='cuda:0') eval_epoch_loss=tensor(0.0391, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.43it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.44it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=14: train_ppl=tensor(1.0351, device='cuda:0') train_epoch_loss=tensor(0.0345, device='cuda:0') eval_ppl=tensor(1.0260, device='cuda:0') eval_epoch_loss=tensor(0.0257, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.44it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.43it/s]\n" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "epoch=15: train_ppl=tensor(1.0217, device='cuda:0') train_epoch_loss=tensor(0.0215, device='cuda:0') eval_ppl=tensor(1.0168, device='cuda:0') eval_epoch_loss=tensor(0.0167, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.43it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.28it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=16: train_ppl=tensor(1.0152, device='cuda:0') train_epoch_loss=tensor(0.0151, device='cuda:0') eval_ppl=tensor(1.0117, device='cuda:0') eval_epoch_loss=tensor(0.0116, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.43it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.41it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=17: train_ppl=tensor(1.0102, device='cuda:0') train_epoch_loss=tensor(0.0101, device='cuda:0') eval_ppl=tensor(1.0088, device='cuda:0') eval_epoch_loss=tensor(0.0088, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.29it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.25it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=18: train_ppl=tensor(1.0083, device='cuda:0') train_epoch_loss=tensor(0.0083, device='cuda:0') eval_ppl=tensor(1.0073, 
device='cuda:0') eval_epoch_loss=tensor(0.0073, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.43it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.46it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=19: train_ppl=tensor(1.0070, device='cuda:0') train_epoch_loss=tensor(0.0070, device='cuda:0') eval_ppl=tensor(1.0064, device='cuda:0') eval_epoch_loss=tensor(0.0063, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.43it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.51it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=20: train_ppl=tensor(1.0059, device='cuda:0') train_epoch_loss=tensor(0.0059, device='cuda:0') eval_ppl=tensor(1.0057, device='cuda:0') eval_epoch_loss=tensor(0.0057, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.43it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.47it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=21: train_ppl=tensor(1.0056, device='cuda:0') train_epoch_loss=tensor(0.0056, device='cuda:0') eval_ppl=tensor(1.0052, device='cuda:0') eval_epoch_loss=tensor(0.0052, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.41it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.33it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=22: train_ppl=tensor(1.0050, device='cuda:0') train_epoch_loss=tensor(0.0050, device='cuda:0') eval_ppl=tensor(1.0049, device='cuda:0') eval_epoch_loss=tensor(0.0049, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.39it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.44it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=23: train_ppl=tensor(1.0049, device='cuda:0') train_epoch_loss=tensor(0.0049, device='cuda:0') eval_ppl=tensor(1.0045, device='cuda:0') eval_epoch_loss=tensor(0.0045, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.42it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.49it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=24: train_ppl=tensor(1.0043, device='cuda:0') train_epoch_loss=tensor(0.0043, device='cuda:0') eval_ppl=tensor(1.0043, device='cuda:0') eval_epoch_loss=tensor(0.0043, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.46it/s]\n", + 
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.47it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=25: train_ppl=tensor(1.0042, device='cuda:0') train_epoch_loss=tensor(0.0042, device='cuda:0') eval_ppl=tensor(1.0040, device='cuda:0') eval_epoch_loss=tensor(0.0040, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.44it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.52it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=26: train_ppl=tensor(1.0039, device='cuda:0') train_epoch_loss=tensor(0.0039, device='cuda:0') eval_ppl=tensor(1.0039, device='cuda:0') eval_epoch_loss=tensor(0.0039, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.44it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.48it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=27: train_ppl=tensor(1.0038, device='cuda:0') train_epoch_loss=tensor(0.0038, device='cuda:0') eval_ppl=tensor(1.0037, device='cuda:0') eval_epoch_loss=tensor(0.0037, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.46it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.54it/s]\n" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "epoch=28: train_ppl=tensor(1.0036, device='cuda:0') train_epoch_loss=tensor(0.0036, device='cuda:0') eval_ppl=tensor(1.0035, device='cuda:0') eval_epoch_loss=tensor(0.0035, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.45it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.53it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=29: train_ppl=tensor(1.0034, device='cuda:0') train_epoch_loss=tensor(0.0034, device='cuda:0') eval_ppl=tensor(1.0034, device='cuda:0') eval_epoch_loss=tensor(0.0034, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.43it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.47it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=30: train_ppl=tensor(1.0034, device='cuda:0') train_epoch_loss=tensor(0.0034, device='cuda:0') eval_ppl=tensor(1.0033, device='cuda:0') eval_epoch_loss=tensor(0.0033, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.43it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.47it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=31: train_ppl=tensor(1.0033, device='cuda:0') train_epoch_loss=tensor(0.0033, device='cuda:0') eval_ppl=tensor(1.0032, 
device='cuda:0') eval_epoch_loss=tensor(0.0032, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.46it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.51it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=32: train_ppl=tensor(1.0031, device='cuda:0') train_epoch_loss=tensor(0.0031, device='cuda:0') eval_ppl=tensor(1.0031, device='cuda:0') eval_epoch_loss=tensor(0.0031, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.44it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.43it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=33: train_ppl=tensor(1.0030, device='cuda:0') train_epoch_loss=tensor(0.0030, device='cuda:0') eval_ppl=tensor(1.0030, device='cuda:0') eval_epoch_loss=tensor(0.0030, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.45it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.46it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=34: train_ppl=tensor(1.0029, device='cuda:0') train_epoch_loss=tensor(0.0029, device='cuda:0') eval_ppl=tensor(1.0029, device='cuda:0') eval_epoch_loss=tensor(0.0029, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.45it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.47it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=35: train_ppl=tensor(1.0028, device='cuda:0') train_epoch_loss=tensor(0.0028, device='cuda:0') eval_ppl=tensor(1.0029, device='cuda:0') eval_epoch_loss=tensor(0.0029, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.45it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.45it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=36: train_ppl=tensor(1.0027, device='cuda:0') train_epoch_loss=tensor(0.0027, device='cuda:0') eval_ppl=tensor(1.0028, device='cuda:0') eval_epoch_loss=tensor(0.0028, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.45it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.47it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=37: train_ppl=tensor(1.0027, device='cuda:0') train_epoch_loss=tensor(0.0027, device='cuda:0') eval_ppl=tensor(1.0027, device='cuda:0') eval_epoch_loss=tensor(0.0027, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.45it/s]\n", + 
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.46it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=38: train_ppl=tensor(1.0027, device='cuda:0') train_epoch_loss=tensor(0.0027, device='cuda:0') eval_ppl=tensor(1.0027, device='cuda:0') eval_epoch_loss=tensor(0.0027, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.44it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.43it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=39: train_ppl=tensor(1.0025, device='cuda:0') train_epoch_loss=tensor(0.0025, device='cuda:0') eval_ppl=tensor(1.0026, device='cuda:0') eval_epoch_loss=tensor(0.0026, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.44it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.47it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=40: train_ppl=tensor(1.0026, device='cuda:0') train_epoch_loss=tensor(0.0026, device='cuda:0') eval_ppl=tensor(1.0026, device='cuda:0') eval_epoch_loss=tensor(0.0026, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.44it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.33it/s]\n" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "epoch=41: train_ppl=tensor(1.0025, device='cuda:0') train_epoch_loss=tensor(0.0025, device='cuda:0') eval_ppl=tensor(1.0025, device='cuda:0') eval_epoch_loss=tensor(0.0025, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.42it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.49it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=42: train_ppl=tensor(1.0024, device='cuda:0') train_epoch_loss=tensor(0.0024, device='cuda:0') eval_ppl=tensor(1.0025, device='cuda:0') eval_epoch_loss=tensor(0.0025, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.44it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.47it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=43: train_ppl=tensor(1.0024, device='cuda:0') train_epoch_loss=tensor(0.0024, device='cuda:0') eval_ppl=tensor(1.0025, device='cuda:0') eval_epoch_loss=tensor(0.0025, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.44it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.43it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=44: train_ppl=tensor(1.0025, device='cuda:0') train_epoch_loss=tensor(0.0024, device='cuda:0') eval_ppl=tensor(1.0024, 
device='cuda:0') eval_epoch_loss=tensor(0.0024, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.43it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.50it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=45: train_ppl=tensor(1.0024, device='cuda:0') train_epoch_loss=tensor(0.0024, device='cuda:0') eval_ppl=tensor(1.0024, device='cuda:0') eval_epoch_loss=tensor(0.0024, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.43it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.49it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=46: train_ppl=tensor(1.0024, device='cuda:0') train_epoch_loss=tensor(0.0024, device='cuda:0') eval_ppl=tensor(1.0024, device='cuda:0') eval_epoch_loss=tensor(0.0024, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.42it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.39it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=47: train_ppl=tensor(1.0023, device='cuda:0') train_epoch_loss=tensor(0.0023, device='cuda:0') eval_ppl=tensor(1.0024, device='cuda:0') eval_epoch_loss=tensor(0.0024, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.40it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 22.40it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=48: train_ppl=tensor(1.0023, device='cuda:0') train_epoch_loss=tensor(0.0023, device='cuda:0') eval_ppl=tensor(1.0024, device='cuda:0') eval_epoch_loss=tensor(0.0024, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11.41it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.87it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=49: train_ppl=tensor(1.0023, device='cuda:0') train_epoch_loss=tensor(0.0023, device='cuda:0') eval_ppl=tensor(1.0024, device='cuda:0') eval_epoch_loss=tensor(0.0024, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# training and evaluation\n", + "model = model.to(device)\n", + "\n", + "for epoch in range(num_epochs):\n", + " model.train()\n", + " total_loss = 0\n", + " for step, batch in enumerate(tqdm(train_dataloader)):\n", + " batch = {k: v.to(device) for k, v in batch.items()}\n", + " # print(batch)\n", + " # print(batch[\"input_ids\"].shape)\n", + " outputs = model(**batch)\n", + " loss = outputs.loss\n", + " total_loss += loss.detach().float()\n", + " loss.backward()\n", + " optimizer.step()\n", + " lr_scheduler.step()\n", + " optimizer.zero_grad()\n", + "\n", + " model.eval()\n", + " eval_loss = 0\n", + " eval_preds = []\n", + " for step, batch in enumerate(tqdm(eval_dataloader)):\n", + " batch = {k: v.to(device) for k, v in 
batch.items()}\n", + " with torch.no_grad():\n", + " outputs = model(**batch)\n", + " loss = outputs.loss\n", + " eval_loss += loss.detach().float()\n", + " eval_preds.extend(\n", + " tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)\n", + " )\n", + "\n", + " eval_epoch_loss = eval_loss / len(eval_dataloader)\n", + " eval_ppl = torch.exp(eval_epoch_loss)\n", + " train_epoch_loss = total_loss / len(train_dataloader)\n", + " train_ppl = torch.exp(train_epoch_loss)\n", + " print(f\"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "53752a7b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hey @nytimes your link to cancel my subscription isn't working and nobody is answering the chat. Please don't play that kind of stupid game.\n", + "{'input_ids': tensor([[227985, 5484, 915, 54078, 2566, 7782, 24502, 2632, 8989,\n", + " 427, 36992, 2670, 140711, 21994, 10789, 530, 88399, 632,\n", + " 183542, 368, 44799, 17, 29901, 5926, 7229, 861, 11596,\n", + " 461, 78851, 14775, 17, 77658, 915, 210]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n", + "tensor([[227985, 5484, 915, 54078, 2566, 7782, 24502, 2632, 8989,\n", + " 427, 36992, 2670, 140711, 21994, 10789, 530, 88399, 632,\n", + " 183542, 368, 44799, 17, 29901, 5926, 7229, 861, 11596,\n", + " 461, 78851, 14775, 17, 77658, 915, 210, 16449, 5952,\n", + " 3]], device='cuda:0')\n", + "[\"Tweet text : Hey @nytimes your link to cancel my subscription isn't working and nobody is answering the chat. Please don't play that kind of stupid game. 
Label : complaint\"]\n" + ] + } + ], + "source": [ + "model.eval()\n", + "i = 16\n", + "inputs = tokenizer(f'{text_column} : {dataset[\"test\"][i][\"Tweet text\"]} Label : ', return_tensors=\"pt\")\n", + "print(dataset[\"test\"][i][\"Tweet text\"])\n", + "print(inputs)\n", + "\n", + "with torch.no_grad():\n", + " inputs = {k: v.to(device) for k, v in inputs.items()}\n", + " outputs = model.generate(\n", + " input_ids=inputs[\"input_ids\"], attention_mask=inputs[\"attention_mask\"], max_new_tokens=10, eos_token_id=3\n", + " )\n", + " print(outputs)\n", + " print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))" + ] + }, + { + "cell_type": "markdown", + "id": "0e21c49b", + "metadata": {}, + "source": [ + "You can push model to hub or save model locally. \n", + "\n", + "- Option1: Pushing the model to Hugging Face Hub\n", + "```python\n", + "model.push_to_hub(\n", + " f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\"/\", \"_\"),\n", + " token = \"hf_...\"\n", + ")\n", + "```\n", + "token (`bool` or `str`, *optional*):\n", + " `token` is to be used for HTTP Bearer authorization when accessing remote files. If `True`, will use the token generated\n", + " when running `huggingface-cli login` (stored in `~/.huggingface`). 
Will default to `True` if `repo_url`\n", + " is not specified.\n", + " Or you can get your token from https://huggingface.co/settings/token\n", + "\n", + "- Or save model locally\n", + "```python\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\"/\", \"_\")\n", + "model.save_pretrained(peft_model_id)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "24041ee1", + "metadata": {}, + "outputs": [], + "source": [ + "# saving model\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\n", + " \"/\", \"_\"\n", + ")\n", + "model.save_pretrained(peft_model_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "527eeaa4", + "metadata": {}, + "outputs": [], + "source": [ + "ckpt = f\"{peft_model_id}/adapter_model.safetensors\"\n", + "!du -h $ckpt" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b19f5a90", + "metadata": {}, + "outputs": [], + "source": [ + "from peft import PeftModel, PeftConfig\n", + "\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\n", + " \"/\", \"_\"\n", + ")\n", + "\n", + "config = PeftConfig.from_pretrained(peft_model_id)\n", + "model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)\n", + "model = PeftModel.from_pretrained(model, peft_model_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "a11a3768", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "@greateranglia Ok thanks...\n", + "{'input_ids': tensor([[227985, 5484, 915, 2566, 14173, 2960, 29906, 387, 20706,\n", + " 49337, 1369, 77658, 915, 210]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n", + "tensor([[227985, 5484, 915, 2566, 14173, 2960, 29906, 387, 20706,\n", + " 49337, 1369, 77658, 
915, 210, 1936, 106863, 3]],\n", + " device='cuda:0')\n", + "['Tweet text : @greateranglia Ok thanks... Label : no complaint']\n" + ] + } + ], + "source": [ + "model.to(device)\n", + "model.eval()\n", + "i = 4\n", + "inputs = tokenizer(f'{text_column} : {dataset[\"test\"][i][\"Tweet text\"]} Label : ', return_tensors=\"pt\")\n", + "print(dataset[\"test\"][i][\"Tweet text\"])\n", + "print(inputs)\n", + "\n", + "with torch.no_grad():\n", + " inputs = {k: v.to(device) for k, v in inputs.items()}\n", + " outputs = model.generate(\n", + " input_ids=inputs[\"input_ids\"], attention_mask=inputs[\"attention_mask\"], max_new_tokens=10, eos_token_id=3\n", + " )\n", + " print(outputs)\n", + " print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f890c951", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "463a41a2", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c60c7a9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + }, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/causal_language_modeling/peft_prompt_tuning_clm.ipynb b/peft/examples/causal_language_modeling/peft_prompt_tuning_clm.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..47448713f8ba22e6225377fc03e77ad10c29871c --- 
/dev/null +++ b/peft/examples/causal_language_modeling/peft_prompt_tuning_clm.ipynb @@ -0,0 +1,1220 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "71fbfca2", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoModelForCausalLM\n", + "from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType\n", + "import torch\n", + "from datasets import load_dataset\n", + "import os\n", + "from transformers import AutoTokenizer\n", + "from torch.utils.data import DataLoader\n", + "from transformers import default_data_collator, get_linear_schedule_with_warmup\n", + "from tqdm import tqdm\n", + "\n", + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "model_name_or_path = \"bigscience/bloomz-560m\"\n", + "tokenizer_name_or_path = \"bigscience/bloomz-560m\"\n", + "peft_config = PromptTuningConfig(\n", + " task_type=TaskType.CAUSAL_LM,\n", + " prompt_tuning_init=PromptTuningInit.TEXT,\n", + " num_virtual_tokens=8,\n", + " prompt_tuning_init_text=\"Classify if the tweet is a complaint or not:\",\n", + " tokenizer_name_or_path=model_name_or_path,\n", + ")\n", + "\n", + "dataset_name = \"twitter_complaints\"\n", + "checkpoint_name = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt\".replace(\n", + " \"/\", \"_\"\n", + ")\n", + "text_column = \"Tweet text\"\n", + "label_column = \"text_label\"\n", + "max_length = 64\n", + "lr = 3e-2\n", + "num_epochs = 50\n", + "batch_size = 8" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1a3648b", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = load_dataset(\n", + " \"parquet\",\n", + " data_files={\n", + " \"train\": f\"hf://datasets/ought/raft@refs/convert/parquet/{dataset_name}/train/0000.parquet\",\n", + " \"test\": 
f\"hf://datasets/ought/raft@refs/convert/parquet/{dataset_name}/test/0000.parquet\"\n", + " }\n", + ")\n", + "\n", + "classes = [k.replace(\"_\", \" \") for k in dataset[\"train\"].features[\"Label\"].names]\n", + "print(classes)\n", + "dataset = dataset.map(\n", + " lambda x: {\"text_label\": [classes[label] for label in x[\"Label\"]]},\n", + " batched=True,\n", + " num_proc=1,\n", + ")\n", + "print(dataset)\n", + "dataset[\"train\"][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe12d4d3", + "metadata": {}, + "outputs": [], + "source": [ + "# data preprocessing\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n", + "if tokenizer.pad_token_id is None:\n", + " tokenizer.pad_token_id = tokenizer.eos_token_id\n", + "target_max_length = max([len(tokenizer(class_label)[\"input_ids\"]) for class_label in classes])\n", + "print(target_max_length)\n", + "\n", + "\n", + "def preprocess_function(examples):\n", + " batch_size = len(examples[text_column])\n", + " inputs = [f\"{text_column} : {x} Label : \" for x in examples[text_column]]\n", + " targets = [str(x) for x in examples[label_column]]\n", + " model_inputs = tokenizer(inputs)\n", + " labels = tokenizer(targets, add_special_tokens=False) # don't add bos token because we concatenate with inputs\n", + " for i in range(batch_size):\n", + " sample_input_ids = model_inputs[\"input_ids\"][i]\n", + " label_input_ids = labels[\"input_ids\"][i] + [tokenizer.eos_token_id]\n", + " # print(i, sample_input_ids, label_input_ids)\n", + " model_inputs[\"input_ids\"][i] = sample_input_ids + label_input_ids\n", + " labels[\"input_ids\"][i] = [-100] * len(sample_input_ids) + label_input_ids\n", + " model_inputs[\"attention_mask\"][i] = [1] * len(model_inputs[\"input_ids\"][i])\n", + " # print(model_inputs)\n", + " for i in range(batch_size):\n", + " sample_input_ids = model_inputs[\"input_ids\"][i]\n", + " label_input_ids = labels[\"input_ids\"][i]\n", + " model_inputs[\"input_ids\"][i] 
= [tokenizer.pad_token_id] * (\n", + " max_length - len(sample_input_ids)\n", + " ) + sample_input_ids\n", + " model_inputs[\"attention_mask\"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[\n", + " \"attention_mask\"\n", + " ][i]\n", + " labels[\"input_ids\"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids\n", + " model_inputs[\"input_ids\"][i] = torch.tensor(model_inputs[\"input_ids\"][i][:max_length])\n", + " model_inputs[\"attention_mask\"][i] = torch.tensor(model_inputs[\"attention_mask\"][i][:max_length])\n", + " labels[\"input_ids\"][i] = torch.tensor(labels[\"input_ids\"][i][:max_length])\n", + " model_inputs[\"labels\"] = labels[\"input_ids\"]\n", + " return model_inputs\n", + "\n", + "\n", + "processed_datasets = dataset.map(\n", + " preprocess_function,\n", + " batched=True,\n", + " num_proc=1,\n", + " remove_columns=dataset[\"train\"].column_names,\n", + " load_from_cache_file=False,\n", + " desc=\"Running tokenizer on dataset\",\n", + ")\n", + "\n", + "train_dataset = processed_datasets[\"train\"]\n", + "eval_dataset = processed_datasets[\"train\"]\n", + "\n", + "\n", + "train_dataloader = DataLoader(\n", + " train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True\n", + ")\n", + "eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "641b21fe", + "metadata": {}, + "outputs": [], + "source": [ + "def test_preprocess_function(examples):\n", + " batch_size = len(examples[text_column])\n", + " inputs = [f\"{text_column} : {x} Label : \" for x in examples[text_column]]\n", + " model_inputs = tokenizer(inputs)\n", + " # print(model_inputs)\n", + " for i in range(batch_size):\n", + " sample_input_ids = model_inputs[\"input_ids\"][i]\n", + " model_inputs[\"input_ids\"][i] = [tokenizer.pad_token_id] * (\n", + " max_length - 
len(sample_input_ids)\n", + " ) + sample_input_ids\n", + " model_inputs[\"attention_mask\"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[\n", + " \"attention_mask\"\n", + " ][i]\n", + " model_inputs[\"input_ids\"][i] = torch.tensor(model_inputs[\"input_ids\"][i][:max_length])\n", + " model_inputs[\"attention_mask\"][i] = torch.tensor(model_inputs[\"attention_mask\"][i][:max_length])\n", + " return model_inputs\n", + "\n", + "\n", + "test_dataset = dataset[\"test\"].map(\n", + " test_preprocess_function,\n", + " batched=True,\n", + " num_proc=1,\n", + " remove_columns=dataset[\"train\"].column_names,\n", + " load_from_cache_file=False,\n", + " desc=\"Running tokenizer on dataset\",\n", + ")\n", + "\n", + "test_dataloader = DataLoader(test_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)\n", + "next(iter(test_dataloader))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "accc5012", + "metadata": {}, + "outputs": [], + "source": [ + "next(iter(train_dataloader))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "218df807", + "metadata": {}, + "outputs": [], + "source": [ + "len(test_dataloader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47d1fedf", + "metadata": {}, + "outputs": [], + "source": [ + "next(iter(test_dataloader))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a773e092", + "metadata": {}, + "outputs": [], + "source": [ + "# creating model\n", + "model = AutoModelForCausalLM.from_pretrained(model_name_or_path)\n", + "model = get_peft_model(model, peft_config)\n", + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b2f91568", + "metadata": {}, + "outputs": [], + "source": [ + "# model\n", + "# optimizer and lr scheduler\n", + "optimizer = torch.optim.AdamW(model.parameters(), lr=lr)\n", + "lr_scheduler = get_linear_schedule_with_warmup(\n", + " 
optimizer=optimizer,\n", + " num_warmup_steps=0,\n", + " num_training_steps=(len(train_dataloader) * num_epochs),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e4fb69fc", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:01<00:00, 5.68it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.48it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=0: train_ppl=tensor(2.2720e+13, device='cuda:0') train_epoch_loss=tensor(30.7543, device='cuda:0') eval_ppl=tensor(483597.5625, device='cuda:0') eval_epoch_loss=tensor(13.0890, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.91it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 20.96it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=1: train_ppl=tensor(452658.3750, device='cuda:0') train_epoch_loss=tensor(13.0229, device='cuda:0') eval_ppl=tensor(275088.1875, device='cuda:0') eval_epoch_loss=tensor(12.5248, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.90it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.41it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=2: train_ppl=tensor(199203.3906, device='cuda:0') train_epoch_loss=tensor(12.2021, device='cuda:0') 
eval_ppl=tensor(143637.0312, device='cuda:0') eval_epoch_loss=tensor(11.8750, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.92it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.31it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=3: train_ppl=tensor(114743.9531, device='cuda:0') train_epoch_loss=tensor(11.6505, device='cuda:0') eval_ppl=tensor(54962., device='cuda:0') eval_epoch_loss=tensor(10.9144, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.81it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.34it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=4: train_ppl=tensor(40786.5977, device='cuda:0') train_epoch_loss=tensor(10.6161, device='cuda:0') eval_ppl=tensor(18342.5430, device='cuda:0') eval_epoch_loss=tensor(9.8170, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.89it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.34it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=5: train_ppl=tensor(14023.0830, device='cuda:0') train_epoch_loss=tensor(9.5485, device='cuda:0') eval_ppl=tensor(6316.8540, device='cuda:0') eval_epoch_loss=tensor(8.7510, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.84it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.32it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=6: train_ppl=tensor(5635.3262, device='cuda:0') train_epoch_loss=tensor(8.6368, device='cuda:0') eval_ppl=tensor(2476.5776, device='cuda:0') eval_epoch_loss=tensor(7.8146, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.88it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.30it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=7: train_ppl=tensor(1818.4940, device='cuda:0') train_epoch_loss=tensor(7.5058, device='cuda:0') eval_ppl=tensor(934.1146, device='cuda:0') eval_epoch_loss=tensor(6.8396, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.05it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 18.97it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=8: train_ppl=tensor(645.2143, device='cuda:0') train_epoch_loss=tensor(6.4696, device='cuda:0') eval_ppl=tensor(361.9093, device='cuda:0') eval_epoch_loss=tensor(5.8914, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 9.67it/s]\n", + 
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 19.12it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=9: train_ppl=tensor(293.8047, device='cuda:0') train_epoch_loss=tensor(5.6829, device='cuda:0') eval_ppl=tensor(215.8185, device='cuda:0') eval_epoch_loss=tensor(5.3744, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.54it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 20.83it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=10: train_ppl=tensor(191.2377, device='cuda:0') train_epoch_loss=tensor(5.2535, device='cuda:0') eval_ppl=tensor(177.1512, device='cuda:0') eval_epoch_loss=tensor(5.1770, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.02it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 18.98it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=11: train_ppl=tensor(153.6052, device='cuda:0') train_epoch_loss=tensor(5.0344, device='cuda:0') eval_ppl=tensor(126.6154, device='cuda:0') eval_epoch_loss=tensor(4.8412, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 9.54it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 18.78it/s]\n" + ] + }, + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "epoch=12: train_ppl=tensor(122.8925, device='cuda:0') train_epoch_loss=tensor(4.8113, device='cuda:0') eval_ppl=tensor(97.3331, device='cuda:0') eval_epoch_loss=tensor(4.5781, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 9.66it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 19.72it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=13: train_ppl=tensor(84.8845, device='cuda:0') train_epoch_loss=tensor(4.4413, device='cuda:0') eval_ppl=tensor(70.3213, device='cuda:0') eval_epoch_loss=tensor(4.2531, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:01<00:00, 6.73it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 16.07it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=14: train_ppl=tensor(64.6705, device='cuda:0') train_epoch_loss=tensor(4.1693, device='cuda:0') eval_ppl=tensor(50.4688, device='cuda:0') eval_epoch_loss=tensor(3.9214, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8.41it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 15.63it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=15: train_ppl=tensor(44.2937, device='cuda:0') train_epoch_loss=tensor(3.7908, device='cuda:0') 
eval_ppl=tensor(34.8210, device='cuda:0') eval_epoch_loss=tensor(3.5502, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8.31it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 15.67it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=16: train_ppl=tensor(30.0995, device='cuda:0') train_epoch_loss=tensor(3.4045, device='cuda:0') eval_ppl=tensor(24.7703, device='cuda:0') eval_epoch_loss=tensor(3.2096, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8.31it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 15.59it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=17: train_ppl=tensor(23.3086, device='cuda:0') train_epoch_loss=tensor(3.1488, device='cuda:0') eval_ppl=tensor(20.8131, device='cuda:0') eval_epoch_loss=tensor(3.0356, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8.29it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 16.04it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=18: train_ppl=tensor(16.4479, device='cuda:0') train_epoch_loss=tensor(2.8002, device='cuda:0') eval_ppl=tensor(12.0876, device='cuda:0') eval_epoch_loss=tensor(2.4922, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8.37it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 15.37it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=19: train_ppl=tensor(11.1977, device='cuda:0') train_epoch_loss=tensor(2.4157, device='cuda:0') eval_ppl=tensor(9.0399, device='cuda:0') eval_epoch_loss=tensor(2.2016, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8.23it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 17.29it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=20: train_ppl=tensor(8.1847, device='cuda:0') train_epoch_loss=tensor(2.1023, device='cuda:0') eval_ppl=tensor(6.7486, device='cuda:0') eval_epoch_loss=tensor(1.9093, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8.30it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 15.58it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=21: train_ppl=tensor(6.1145, device='cuda:0') train_epoch_loss=tensor(1.8107, device='cuda:0') eval_ppl=tensor(5.5931, device='cuda:0') eval_epoch_loss=tensor(1.7215, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8.34it/s]\n", + 
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.36it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=22: train_ppl=tensor(5.2963, device='cuda:0') train_epoch_loss=tensor(1.6670, device='cuda:0') eval_ppl=tensor(5.0573, device='cuda:0') eval_epoch_loss=tensor(1.6208, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.84it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.26it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=23: train_ppl=tensor(4.7485, device='cuda:0') train_epoch_loss=tensor(1.5578, device='cuda:0') eval_ppl=tensor(3.6277, device='cuda:0') eval_epoch_loss=tensor(1.2886, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.84it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.31it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=24: train_ppl=tensor(3.4080, device='cuda:0') train_epoch_loss=tensor(1.2261, device='cuda:0') eval_ppl=tensor(3.0467, device='cuda:0') eval_epoch_loss=tensor(1.1141, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.88it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.25it/s]\n" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "epoch=25: train_ppl=tensor(3.3052, device='cuda:0') train_epoch_loss=tensor(1.1955, device='cuda:0') eval_ppl=tensor(2.7784, device='cuda:0') eval_epoch_loss=tensor(1.0219, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.86it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.22it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=26: train_ppl=tensor(2.9487, device='cuda:0') train_epoch_loss=tensor(1.0814, device='cuda:0') eval_ppl=tensor(2.9471, device='cuda:0') eval_epoch_loss=tensor(1.0808, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.85it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.25it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=27: train_ppl=tensor(2.8738, device='cuda:0') train_epoch_loss=tensor(1.0556, device='cuda:0') eval_ppl=tensor(2.5801, device='cuda:0') eval_epoch_loss=tensor(0.9478, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.84it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.28it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=28: train_ppl=tensor(2.3241, device='cuda:0') train_epoch_loss=tensor(0.8433, device='cuda:0') eval_ppl=tensor(2.2198, 
device='cuda:0') eval_epoch_loss=tensor(0.7974, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.84it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 20.89it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=29: train_ppl=tensor(2.0376, device='cuda:0') train_epoch_loss=tensor(0.7118, device='cuda:0') eval_ppl=tensor(1.8572, device='cuda:0') eval_epoch_loss=tensor(0.6191, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 9.76it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 18.83it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=30: train_ppl=tensor(1.8301, device='cuda:0') train_epoch_loss=tensor(0.6044, device='cuda:0') eval_ppl=tensor(1.8864, device='cuda:0') eval_epoch_loss=tensor(0.6347, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 9.80it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 19.81it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=31: train_ppl=tensor(1.7301, device='cuda:0') train_epoch_loss=tensor(0.5482, device='cuda:0') eval_ppl=tensor(1.6340, device='cuda:0') eval_epoch_loss=tensor(0.4910, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.60it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 19.11it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=32: train_ppl=tensor(1.5842, device='cuda:0') train_epoch_loss=tensor(0.4601, device='cuda:0') eval_ppl=tensor(1.6179, device='cuda:0') eval_epoch_loss=tensor(0.4811, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.11it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 18.35it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=33: train_ppl=tensor(1.5193, device='cuda:0') train_epoch_loss=tensor(0.4183, device='cuda:0') eval_ppl=tensor(1.5543, device='cuda:0') eval_epoch_loss=tensor(0.4410, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 9.59it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 18.60it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=34: train_ppl=tensor(1.5402, device='cuda:0') train_epoch_loss=tensor(0.4319, device='cuda:0') eval_ppl=tensor(1.4924, device='cuda:0') eval_epoch_loss=tensor(0.4004, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 9.80it/s]\n", + 
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 19.63it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=35: train_ppl=tensor(1.4410, device='cuda:0') train_epoch_loss=tensor(0.3654, device='cuda:0') eval_ppl=tensor(1.3888, device='cuda:0') eval_epoch_loss=tensor(0.3284, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:01<00:00, 6.60it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 15.36it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=36: train_ppl=tensor(1.3675, device='cuda:0') train_epoch_loss=tensor(0.3130, device='cuda:0') eval_ppl=tensor(1.4001, device='cuda:0') eval_epoch_loss=tensor(0.3366, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8.40it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 15.58it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=37: train_ppl=tensor(1.4197, device='cuda:0') train_epoch_loss=tensor(0.3505, device='cuda:0') eval_ppl=tensor(1.3214, device='cuda:0') eval_epoch_loss=tensor(0.2787, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8.27it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 15.56it/s]\n" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "epoch=38: train_ppl=tensor(1.3855, device='cuda:0') train_epoch_loss=tensor(0.3261, device='cuda:0') eval_ppl=tensor(1.3501, device='cuda:0') eval_epoch_loss=tensor(0.3001, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8.25it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 15.57it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=39: train_ppl=tensor(1.3643, device='cuda:0') train_epoch_loss=tensor(0.3107, device='cuda:0') eval_ppl=tensor(1.3549, device='cuda:0') eval_epoch_loss=tensor(0.3037, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8.28it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 15.41it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=40: train_ppl=tensor(1.3093, device='cuda:0') train_epoch_loss=tensor(0.2695, device='cuda:0') eval_ppl=tensor(1.3233, device='cuda:0') eval_epoch_loss=tensor(0.2801, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8.24it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 15.51it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=41: train_ppl=tensor(1.3108, device='cuda:0') train_epoch_loss=tensor(0.2706, device='cuda:0') eval_ppl=tensor(1.3440, 
device='cuda:0') eval_epoch_loss=tensor(0.2957, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8.78it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 15.61it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=42: train_ppl=tensor(1.2944, device='cuda:0') train_epoch_loss=tensor(0.2581, device='cuda:0') eval_ppl=tensor(1.2711, device='cuda:0') eval_epoch_loss=tensor(0.2399, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8.29it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 15.56it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=43: train_ppl=tensor(1.2616, device='cuda:0') train_epoch_loss=tensor(0.2323, device='cuda:0') eval_ppl=tensor(1.2449, device='cuda:0') eval_epoch_loss=tensor(0.2190, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.85it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.27it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=44: train_ppl=tensor(1.2478, device='cuda:0') train_epoch_loss=tensor(0.2214, device='cuda:0') eval_ppl=tensor(1.2202, device='cuda:0') eval_epoch_loss=tensor(0.1990, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.85it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.31it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=45: train_ppl=tensor(1.2350, device='cuda:0') train_epoch_loss=tensor(0.2111, device='cuda:0') eval_ppl=tensor(1.2180, device='cuda:0') eval_epoch_loss=tensor(0.1972, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.86it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.33it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=46: train_ppl=tensor(1.2277, device='cuda:0') train_epoch_loss=tensor(0.2052, device='cuda:0') eval_ppl=tensor(1.2077, device='cuda:0') eval_epoch_loss=tensor(0.1887, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.87it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.35it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=47: train_ppl=tensor(1.2037, device='cuda:0') train_epoch_loss=tensor(0.1854, device='cuda:0') eval_ppl=tensor(1.2041, device='cuda:0') eval_epoch_loss=tensor(0.1857, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.83it/s]\n", + 
"100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.29it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=48: train_ppl=tensor(1.2026, device='cuda:0') train_epoch_loss=tensor(0.1845, device='cuda:0') eval_ppl=tensor(1.1982, device='cuda:0') eval_epoch_loss=tensor(0.1808, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 10.86it/s]\n", + "100%|████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.35it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=49: train_ppl=tensor(1.2005, device='cuda:0') train_epoch_loss=tensor(0.1827, device='cuda:0') eval_ppl=tensor(1.1968, device='cuda:0') eval_epoch_loss=tensor(0.1796, device='cuda:0')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "# training and evaluation\n", + "model = model.to(device)\n", + "\n", + "for epoch in range(num_epochs):\n", + " model.train()\n", + " total_loss = 0\n", + " for step, batch in enumerate(tqdm(train_dataloader)):\n", + " batch = {k: v.to(device) for k, v in batch.items()}\n", + " # print(batch)\n", + " # print(batch[\"input_ids\"].shape)\n", + " outputs = model(**batch)\n", + " loss = outputs.loss\n", + " total_loss += loss.detach().float()\n", + " loss.backward()\n", + " optimizer.step()\n", + " lr_scheduler.step()\n", + " optimizer.zero_grad()\n", + "\n", + " model.eval()\n", + " eval_loss = 0\n", + " eval_preds = []\n", + " for step, batch in enumerate(tqdm(eval_dataloader)):\n", + " batch = {k: v.to(device) for k, v in batch.items()}\n", + " with torch.no_grad():\n", + " outputs = model(**batch)\n", + " loss = outputs.loss\n", + " eval_loss += 
loss.detach().float()\n", + " eval_preds.extend(\n", + " tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)\n", + " )\n", + "\n", + " eval_epoch_loss = eval_loss / len(eval_dataloader)\n", + " eval_ppl = torch.exp(eval_epoch_loss)\n", + " train_epoch_loss = total_loss / len(train_dataloader)\n", + " train_ppl = torch.exp(train_epoch_loss)\n", + " print(f\"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "53752a7b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "@TommyHilfiger Dramatic shopping exp. ordered 6 jeans same size (30/32) 2 fits / 2 too large / 2 too slim : same brand > different sizing\n", + "{'input_ids': tensor([[227985, 5484, 915, 2566, 226154, 126015, 5385, 259, 239364,\n", + " 3396, 70823, 5853, 17, 57247, 1231, 191040, 5025, 7869,\n", + " 375, 2324, 149349, 12, 415, 122321, 897, 415, 10136,\n", + " 10021, 897, 415, 10136, 6497, 381, 915, 5025, 51950,\n", + " 66869, 5955, 272, 20311, 77658, 915, 210]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n", + "tensor([[227985, 5484, 915, 2566, 226154, 126015, 5385, 259, 239364,\n", + " 3396, 70823, 5853, 17, 57247, 1231, 191040, 5025, 7869,\n", + " 375, 2324, 149349, 12, 415, 122321, 897, 415, 10136,\n", + " 10021, 897, 415, 10136, 6497, 381, 915, 5025, 51950,\n", + " 66869, 5955, 272, 20311, 77658, 915, 210, 16449, 5952,\n", + " 3]], device='cuda:0')\n", + "['Tweet text : @TommyHilfiger Dramatic shopping exp. 
ordered 6 jeans same size (30/32) 2 fits / 2 too large / 2 too slim : same brand > different sizing Label : complaint']\n" + ] + } + ], + "source": [ + "model.eval()\n", + "i = 33\n", + "inputs = tokenizer(f'{text_column} : {dataset[\"test\"][i][\"Tweet text\"]} Label : ', return_tensors=\"pt\")\n", + "print(dataset[\"test\"][i][\"Tweet text\"])\n", + "print(inputs)\n", + "\n", + "with torch.no_grad():\n", + " inputs = {k: v.to(device) for k, v in inputs.items()}\n", + " outputs = model.generate(\n", + " input_ids=inputs[\"input_ids\"], attention_mask=inputs[\"attention_mask\"], max_new_tokens=10, eos_token_id=3\n", + " )\n", + " print(outputs)\n", + " print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))" + ] + }, + { + "cell_type": "markdown", + "id": "c8f35152", + "metadata": {}, + "source": [ + "You can push model to hub or save model locally. \n", + "\n", + "- Option1: Pushing the model to Hugging Face Hub\n", + "```python\n", + "model.push_to_hub(\n", + " f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\"/\", \"_\"),\n", + " token = \"hf_...\"\n", + ")\n", + "```\n", + "token (`bool` or `str`, *optional*):\n", + " `token` is to be used for HTTP Bearer authorization when accessing remote files. If `True`, will use the token generated\n", + " when running `huggingface-cli login` (stored in `~/.huggingface`). 
Will default to `True` if `repo_url`\n", + " is not specified.\n", + " Or you can get your token from https://huggingface.co/settings/tokens\n", + "\n", + "- Or save model locally\n", + "```python\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\"/\", \"_\")\n", + "model.save_pretrained(peft_model_id)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d8ba1f8c", + "metadata": {}, + "outputs": [], + "source": [ + "# saving model\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\n", + " \"/\", \"_\"\n", + ")\n", + "model.save_pretrained(peft_model_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4928c7f1", + "metadata": {}, + "outputs": [], + "source": [ + "ckpt = f\"{peft_model_id}/adapter_model.safetensors\"\n", + "!du -h $ckpt" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "4d9476e1", + "metadata": {}, + "outputs": [], + "source": [ + "from peft import PeftModel, PeftConfig\n", + "\n", + "peft_model_id = f\"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\".replace(\n", + " \"/\", \"_\"\n", + ")\n", + "\n", + "config = PeftConfig.from_pretrained(peft_model_id)\n", + "model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)\n", + "model = PeftModel.from_pretrained(model, peft_model_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "ebe174a6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "@greateranglia Ok thanks...\n", + "{'input_ids': tensor([[227985, 5484, 915, 2566, 14173, 2960, 29906, 387, 20706,\n", + " 49337, 1369, 77658, 915, 210]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n", + "tensor([[227985, 5484, 915, 2566, 14173, 2960, 29906, 387, 20706,\n", + " 49337, 1369, 77658, 
915, 210, 1936, 106863, 3]],\n", + " device='cuda:0')\n", + "['Tweet text : @greateranglia Ok thanks... Label : no complaint']\n" + ] + } + ], + "source": [ + "model.to(device)\n", + "model.eval()\n", + "i = 4\n", + "inputs = tokenizer(f'{text_column} : {dataset[\"test\"][i][\"Tweet text\"]} Label : ', return_tensors=\"pt\")\n", + "print(dataset[\"test\"][i][\"Tweet text\"])\n", + "print(inputs)\n", + "\n", + "with torch.no_grad():\n", + " inputs = {k: v.to(device) for k, v in inputs.items()}\n", + " outputs = model.generate(\n", + " input_ids=inputs[\"input_ids\"], attention_mask=inputs[\"attention_mask\"], max_new_tokens=10, eos_token_id=3\n", + " )\n", + " print(outputs)\n", + " print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24041ee1", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + }, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/causal_language_modeling/requirements.txt b/peft/examples/causal_language_modeling/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e462c3c63b9067a36e44e14a092b52adef3e849 --- /dev/null +++ b/peft/examples/causal_language_modeling/requirements.txt @@ -0,0 +1,7 @@ +transformers<4.54.0 +accelerate +evaluate +deepspeed +tqdm +dataclass-csv +datasets==3.6.0 \ No newline at end of file diff --git 
a/peft/examples/conditional_generation/accelerate_ds_zero3_cpu_offload_config.yaml b/peft/examples/conditional_generation/accelerate_ds_zero3_cpu_offload_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a4a0bcfaf09bd632e6ac2152c8a4f30e183cc102 --- /dev/null +++ b/peft/examples/conditional_generation/accelerate_ds_zero3_cpu_offload_config.yaml @@ -0,0 +1,22 @@ +compute_environment: LOCAL_MACHINE +deepspeed_config: + gradient_accumulation_steps: 1 + gradient_clipping: 1.0 + offload_optimizer_device: none + offload_param_device: none + zero3_init_flag: true + zero3_save_16bit_model: true + zero_stage: 3 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +dynamo_backend: 'NO' +fsdp_config: {} +machine_rank: 0 +main_training_function: main +megatron_lm_config: {} +mixed_precision: 'no' +num_machines: 1 +num_processes: 1 +rdzv_backend: static +same_network: true +use_cpu: false \ No newline at end of file diff --git a/peft/examples/conditional_generation/multitask_prompt_tuning.ipynb b/peft/examples/conditional_generation/multitask_prompt_tuning.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..1eaec5b016ecada5f7b3a4d71902665b1759f7a3 --- /dev/null +++ b/peft/examples/conditional_generation/multitask_prompt_tuning.ipynb @@ -0,0 +1,426 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "58ff91ca-ce92-43d0-ae8b-4e9e89e193f6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import torch\n", + "from datasets import load_dataset\n", + "from transformers import set_seed, AutoModelForSeq2SeqLM, AutoTokenizer\n", + "from peft import get_peft_model, MultitaskPromptTuningConfig, TaskType, MultitaskPromptTuningInit\n", + "\n", + "set_seed(42)\n", + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "model_name = \"google/flan-t5-base\"\n", + "\n", + "peft_config = MultitaskPromptTuningConfig(\n", + " 
tokenizer_name_or_path=model_name,\n", + " num_tasks=2,\n", + " task_type=TaskType.SEQ_2_SEQ_LM,\n", + " prompt_tuning_init=MultitaskPromptTuningInit.TEXT,\n", + " num_virtual_tokens=50,\n", + " num_transformer_submodules=1,\n", + " prompt_tuning_init_text=\"classify the following into either positive or negative, or entailment, neutral or contradiction:\",\n", + ")\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + "model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n", + "model = get_peft_model(model, peft_config)\n", + "\n", + "model = model.to(device)\n", + "\n", + "\n", + "def send_to_device(batch):\n", + " for i in batch:\n", + " batch[i] = batch[i].to(device)\n", + " return batch" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "eb112bc1-ffaf-49fa-a216-0d601ec304ee", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def get_sst2(split: str):\n", + " examples = load_dataset(\"sst2\")[split]\n", + " result_examples = []\n", + " for example in examples:\n", + " result_examples.append({})\n", + "\n", + " result_examples[-1][\"input\"] = example[\"sentence\"].strip() + \"\"\n", + " result_examples[-1][\"output\"] = (\n", + " f\"positive{tokenizer.eos_token}\" if example[\"label\"] == 1 else f\"negative{tokenizer.eos_token}\"\n", + " )\n", + " result_examples[-1][\"task_id\"] = 0\n", + "\n", + " return result_examples\n", + "\n", + "\n", + "def get_mnli(split: str):\n", + " examples = load_dataset(\"multi_nli\")[split]\n", + " result_examples = []\n", + " for example in examples:\n", + " result_examples.append({})\n", + "\n", + " result_examples[-1][\"input\"] = example[\"premise\"].strip() + \" \" + example[\"hypothesis\"].strip() + \"\"\n", + "\n", + " if example[\"label\"] == 0:\n", + " result_examples[-1][\"output\"] = f\"entailment{tokenizer.eos_token}\"\n", + " elif example[\"label\"] == 1:\n", + " result_examples[-1][\"output\"] = f\"neutral{tokenizer.eos_token}\"\n", + " else:\n", + " 
class MyDataset(Dataset):
    """Map-style dataset of pre-built text-to-text examples.

    Each item is a dict produced by ``get_sst2`` / ``get_mnli`` (keys
    ``"input"``, ``"output"`` and ``"task_id"``).

    Args:
        split: One of ``"train"``, ``"val"`` or ``"test"``. Both ``"val"`` and
            ``"test"`` load the SST-2 ``"validation"`` split.
        mode: Only consulted for ``split == "train"``. ``"source"`` mixes the
            SST-2 and MNLI training examples; ``"target"`` uses SST-2 only.

    Raises:
        ValueError: If ``split`` or (for training) ``mode`` is not recognised.
            Previously such values left ``self.examples`` unset and failed
            later with an ``AttributeError``.
    """

    def __init__(self, split: str, mode: str = "source") -> None:
        super().__init__()

        if split == "train":
            # Validate the mode before touching any dataset so a typo fails fast.
            if mode not in ("source", "target"):
                raise ValueError(f"Unknown mode {mode!r}; expected 'source' or 'target'.")
            self.examples = get_sst2(split)
            if mode == "source":
                # Source training is multitask: SST-2 followed by MNLI.
                self.examples = self.examples + get_mnli(split)
        elif split in ("val", "test"):
            # Both evaluation splits are served from SST-2 "validation".
            self.examples = get_sst2("validation")
        else:
            raise ValueError(f"Unknown split {split!r}; expected 'train', 'val' or 'test'.")

    def __getitem__(self, index) -> dict:
        """Return the example dict stored at ``index``."""
        return self.examples[index]

    def __len__(self) -> int:
        """Return the number of stored examples."""
        return len(self.examples)
# First token id of each class word; used both as the prediction value and as
# the positive label for the F1 computation.
POSITIVE_TOKEN_ID = tokenizer(" positive", add_special_tokens=False)["input_ids"][0]
NEGATIVE_TOKEN_ID = tokenizer(" negative", add_special_tokens=False)["input_ids"][0]


def classify(batch):
    """Predict positive/negative for every example in ``batch``.

    Runs a forward pass and, for each example, compares the logit of the
    positive token against the logit of the negative token at position 0 of
    the output sequence.

    Args:
        batch: Dict of tensors as produced by ``collate_fn`` (it still
            contains ``labels``).

    Returns:
        A list with ``POSITIVE_TOKEN_ID`` or ``NEGATIVE_TOKEN_ID`` per example.
    """
    batch = send_to_device(batch)
    # Labels are passed along with the inputs instead of calling generate();
    # the original author notes generation was not supported through peft here.
    # NOTE(review): presumably the labels supply the decoder inputs - confirm.
    scores = model(**batch).logits
    preds = []
    for i in range(scores.shape[0]):
        if scores[i, 0, POSITIVE_TOKEN_ID] > scores[i, 0, NEGATIVE_TOKEN_ID]:
            preds.append(POSITIVE_TOKEN_ID)
        else:
            preds.append(NEGATIVE_TOKEN_ID)
    return preds


@torch.inference_mode()
def evaluate(model, data):
    """Compute the mean loss and F1 score of ``model`` over ``data``.

    Args:
        model: Model called as ``model(**batch)``; its output must expose ``.loss``.
        data: Iterable of batches with a length (e.g. a ``DataLoader``).

    Returns:
        Tuple ``(mean_loss, f1)``. The loss is averaged over the number of
        batches in ``data``; F1 uses ``POSITIVE_TOKEN_ID`` as the positive label.
    """
    loss = 0
    preds = []
    golds = []

    for batch in tqdm(data):
        batch = send_to_device(batch)
        loss += model(**batch).loss
        # The first label token identifies the gold class.
        golds.extend(batch["labels"][:, 0].tolist())
        preds.extend(classify(batch))

    # Average over the loader that was actually evaluated. The original divided
    # by len(val), which is wrong whenever `data` is not `val` (e.g. `test`).
    return loss / len(data), f1_score(golds, preds, pos_label=POSITIVE_TOKEN_ID)


optimizer = AdamW(model.parameters(), lr=1e-4)
scheduler = get_cosine_schedule_with_warmup(optimizer, 200, len(train))

n = 1000  # evaluate and checkpoint every n optimisation steps
step = 0
train_ = tqdm(train)

val_loss, f1 = evaluate(model, val)
print(
    f"""
before source training
val loss = {val_loss}
f1 = {f1}"""
)

for batch in train_:
    if step % n == 0:
        val_loss, f1 = evaluate(model, val)
        print(
            f"""
step = {step}
val loss = {val_loss}
f1 = {f1}"""
        )
        model.save_pretrained(f"checkpoints_source/{step}")

    step += 1
    batch = send_to_device(batch)
    loss = model(**batch).loss
    loss.backward()
    optimizer.step()
    scheduler.step()
    # Clear gradients so each update uses only the current batch; without this
    # they accumulate across every step of the run.
    optimizer.zero_grad()
    # Show a plain float rather than the repr of a tensor carrying a grad_fn.
    train_.set_postfix(train_loss=loss.item())
# Target-task training: same procedure as source training, but checkpoints go
# to "checkpoints_target/" and the freshly created target model is optimised.
optimizer = AdamW(model.parameters(), lr=1e-4)
scheduler = get_cosine_schedule_with_warmup(optimizer, 200, len(train))

n = 1000  # evaluate and checkpoint every n optimisation steps
step = 0
train_ = tqdm(train)

val_loss, f1 = evaluate(model, val)
print(
    f"""
before target training
val loss = {val_loss}
f1 = {f1}"""
)

for batch in train_:
    if step % n == 0:
        val_loss, f1 = evaluate(model, val)
        print(
            f"""
step = {step}
val loss = {val_loss}
f1 = {f1}"""
        )
        model.save_pretrained(f"checkpoints_target/{step}")

    step += 1
    batch = send_to_device(batch)
    loss = model(**batch).loss
    loss.backward()
    optimizer.step()
    scheduler.step()
    # Clear gradients so each update uses only the current batch; without this
    # they accumulate across every step of the run.
    optimizer.zero_grad()
    # Show a plain float rather than the repr of a tensor carrying a grad_fn.
    train_.set_postfix(train_loss=loss.item())
"""AdaLoRA fine-tuning example for a seq2seq model.

Trains ``facebook/bart-base`` with an AdaLoRA adapter on the
``financial_phrasebank`` (``sentences_allagree``) dataset, reports per-epoch
train/eval loss and perplexity, prints exact-match accuracy on the held-out
split, saves the adapter, reloads it and runs one generation as a smoke test.
"""
import os

import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup

from peft import AdaLoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model


os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Use the current accelerator when this torch build exposes the API, else CUDA.
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
model_name_or_path = "facebook/bart-base"
tokenizer_name_or_path = "facebook/bart-base"

checkpoint_name = "financial_sentiment_analysis_lora_v1.pt"
text_column = "sentence"
label_column = "text_label"
max_length = 128
lr = 1e-3
num_epochs = 8
batch_size = 8


# loading dataset
dataset = load_dataset("financial_phrasebank", "sentences_allagree")
# Only a "train" split is used from the hub data; carve out 10% and call it "validation".
dataset = dataset["train"].train_test_split(test_size=0.1)
dataset["validation"] = dataset["test"]
del dataset["test"]

# Add a textual label column so the model can be trained to emit the class name.
classes = dataset["train"].features["label"].names
dataset = dataset.map(
    lambda x: {"text_label": [classes[label] for label in x["label"]]},
    batched=True,
    num_proc=1,
)


# creating model
peft_config = AdaLoraConfig(
    init_r=12,
    target_r=8,
    beta1=0.85,
    beta2=0.85,
    tinit=200,
    tfinal=1000,
    deltaT=10,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    # Provisional value; overwritten below once the number of batches is known.
    total_step=len(dataset["train"]) * num_epochs,
)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


# data preprocessing
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)


def preprocess_function(examples):
    """Tokenize a batch of sentences and their text labels.

    Args:
        examples: Batched mapping containing ``text_column`` and ``label_column``.

    Returns:
        The tokenizer output for the inputs (padded/truncated to ``max_length``)
        with an added ``"labels"`` entry: label token ids padded/truncated to
        length 3, where padding positions are replaced by -100.
    """
    inputs = examples[text_column]
    targets = examples[label_column]
    model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    labels = tokenizer(targets, max_length=3, padding="max_length", truncation=True, return_tensors="pt")
    labels = labels["input_ids"]
    # -100 marks padding so it does not count as a target (conventional ignore index).
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels
    return model_inputs


processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]

train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)


# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)
# The real number of optimisation steps is batches-per-epoch times epochs.
model.base_model.peft_config["default"].total_step = len(train_dataloader) * num_epochs


# training and evaluation
model = model.to(device)
global_step = 0
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # Update the importance of low-rank matrices
        # and allocate the budget accordingly.
        model.base_model.update_and_allocate(global_step)
        optimizer.zero_grad()
        global_step += 1

    model.eval()
    eval_loss = 0
    # Reset every epoch, so after the loop this holds the last epoch's predictions.
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    # Average each accumulated loss over the number of batches of ITS OWN loader.
    # The two denominators were previously swapped, skewing both losses and
    # the perplexities derived from them.
    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")


# print accuracy
correct = 0
total = 0
for pred, true in zip(eval_preds, dataset["validation"]["text_label"]):
    if pred.strip() == true.strip():
        correct += 1
    total += 1
accuracy = correct / total * 100
print(f"{accuracy=} % on the evaluation dataset")
print(f"{eval_preds[:10]=}")
print(f"{dataset['validation']['text_label'][:10]=}")


# saving model
peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"
model.save_pretrained(peft_model_id)


ckpt = f"{peft_model_id}/adapter_model.safetensors"
# get_ipython().system('du -h $ckpt')


# Reload the saved adapter on top of a fresh base model to verify the round trip.
peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"

config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id)


# Single-example generation with the reloaded model (not moved to `device` here).
model.eval()
i = 13
inputs = tokenizer(dataset["validation"][text_column][i], return_tensors="pt")
print(dataset["validation"][text_column][i])
print(inputs)

with torch.no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
    print(outputs)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))
], + "source": [ + "import importlib\n", + "\n", + "importlib.reload(peft)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da74b569", + "metadata": { + "id": "8d0850ac" + }, + "outputs": [], + "source": [ + "# creating model\n", + "peft_config = IA3Config(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, feedforward_modules=[])\n", + "\n", + "model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "df33fce2", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e10c3831", + "outputId": "e69c5e07-ae58-446c-8301-e99ac6b85d62" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "MT5ForConditionalGeneration(\n", + " (shared): Embedding(250112, 1024)\n", + " (encoder): MT5Stack(\n", + " (embed_tokens): Embedding(250112, 1024)\n", + " (block): ModuleList(\n", + " (0): MT5Block(\n", + " (layer): ModuleList(\n", + " (0): MT5LayerSelfAttention(\n", + " (SelfAttention): MT5Attention(\n", + " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (k): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (v): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (relative_attention_bias): Embedding(32, 16)\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (1): MT5LayerFF(\n", + " (DenseReluDense): MT5DenseGatedActDense(\n", + " (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n", + " (wi_1): Linear(in_features=1024, out_features=2816, bias=False)\n", + " (wo): Linear(in_features=2816, out_features=1024, bias=False)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (act): NewGELUActivation()\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " (1-23): 23 x MT5Block(\n", + 
" (layer): ModuleList(\n", + " (0): MT5LayerSelfAttention(\n", + " (SelfAttention): MT5Attention(\n", + " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (k): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (v): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (1): MT5LayerFF(\n", + " (DenseReluDense): MT5DenseGatedActDense(\n", + " (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n", + " (wi_1): Linear(in_features=1024, out_features=2816, bias=False)\n", + " (wo): Linear(in_features=2816, out_features=1024, bias=False)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (act): NewGELUActivation()\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (final_layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (decoder): MT5Stack(\n", + " (embed_tokens): Embedding(250112, 1024)\n", + " (block): ModuleList(\n", + " (0): MT5Block(\n", + " (layer): ModuleList(\n", + " (0): MT5LayerSelfAttention(\n", + " (SelfAttention): MT5Attention(\n", + " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (k): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (v): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (relative_attention_bias): Embedding(32, 16)\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (1): MT5LayerCrossAttention(\n", + " (EncDecAttention): MT5Attention(\n", + " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (k): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (v): 
Linear(in_features=1024, out_features=1024, bias=False)\n", + " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (2): MT5LayerFF(\n", + " (DenseReluDense): MT5DenseGatedActDense(\n", + " (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n", + " (wi_1): Linear(in_features=1024, out_features=2816, bias=False)\n", + " (wo): Linear(in_features=2816, out_features=1024, bias=False)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (act): NewGELUActivation()\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " (1-23): 23 x MT5Block(\n", + " (layer): ModuleList(\n", + " (0): MT5LayerSelfAttention(\n", + " (SelfAttention): MT5Attention(\n", + " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (k): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (v): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (1): MT5LayerCrossAttention(\n", + " (EncDecAttention): MT5Attention(\n", + " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (k): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (v): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (2): MT5LayerFF(\n", + " (DenseReluDense): MT5DenseGatedActDense(\n", + " (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n", + " (wi_1): Linear(in_features=1024, out_features=2816, bias=False)\n", + " (wo): Linear(in_features=2816, out_features=1024, bias=False)\n", + 
" (dropout): Dropout(p=0.1, inplace=False)\n", + " (act): NewGELUActivation()\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (final_layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lm_head): Linear(in_features=1024, out_features=250112, bias=False)\n", + ")" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "63d7bc2d", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "05978e96", + "outputId": "ea9b7d40-010f-4df0-ec64-a7146a5f8b08" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 282,624 || all params: 1,229,863,936 || trainable%: 0.0230\n" + ] + }, + { + "data": { + "text/plain": [ + "PeftModelForSeq2SeqLM(\n", + " (base_model): IA3Model(\n", + " (model): MT5ForConditionalGeneration(\n", + " (shared): Embedding(250112, 1024)\n", + " (encoder): MT5Stack(\n", + " (embed_tokens): Embedding(250112, 1024)\n", + " (block): ModuleList(\n", + " (0): MT5Block(\n", + " (layer): ModuleList(\n", + " (0): MT5LayerSelfAttention(\n", + " (SelfAttention): MT5Attention(\n", + " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (k): Linear(\n", + " (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", + " )\n", + " (v): Linear(\n", + " (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", + " )\n", + " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (relative_attention_bias): Embedding(32, 16)\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + 
" (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (1): MT5LayerFF(\n", + " (DenseReluDense): MT5DenseGatedActDense(\n", + " (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n", + " (wi_1): Linear(\n", + " (base_layer): Linear(in_features=1024, out_features=2816, bias=False)\n", + " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 2816x1])\n", + " )\n", + " (wo): Linear(in_features=2816, out_features=1024, bias=False)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (act): NewGELUActivation()\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " (1-23): 23 x MT5Block(\n", + " (layer): ModuleList(\n", + " (0): MT5LayerSelfAttention(\n", + " (SelfAttention): MT5Attention(\n", + " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (k): Linear(\n", + " (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", + " )\n", + " (v): Linear(\n", + " (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", + " )\n", + " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (1): MT5LayerFF(\n", + " (DenseReluDense): MT5DenseGatedActDense(\n", + " (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n", + " (wi_1): Linear(\n", + " (base_layer): Linear(in_features=1024, out_features=2816, bias=False)\n", + " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 2816x1])\n", + " )\n", + " (wo): Linear(in_features=2816, out_features=1024, bias=False)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (act): 
NewGELUActivation()\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (final_layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (decoder): MT5Stack(\n", + " (embed_tokens): Embedding(250112, 1024)\n", + " (block): ModuleList(\n", + " (0): MT5Block(\n", + " (layer): ModuleList(\n", + " (0): MT5LayerSelfAttention(\n", + " (SelfAttention): MT5Attention(\n", + " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (k): Linear(\n", + " (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", + " )\n", + " (v): Linear(\n", + " (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", + " )\n", + " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (relative_attention_bias): Embedding(32, 16)\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (1): MT5LayerCrossAttention(\n", + " (EncDecAttention): MT5Attention(\n", + " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (k): Linear(\n", + " (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", + " )\n", + " (v): Linear(\n", + " (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", + " )\n", + " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (2): MT5LayerFF(\n", + " 
(DenseReluDense): MT5DenseGatedActDense(\n", + " (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n", + " (wi_1): Linear(\n", + " (base_layer): Linear(in_features=1024, out_features=2816, bias=False)\n", + " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 2816x1])\n", + " )\n", + " (wo): Linear(in_features=2816, out_features=1024, bias=False)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (act): NewGELUActivation()\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " (1-23): 23 x MT5Block(\n", + " (layer): ModuleList(\n", + " (0): MT5LayerSelfAttention(\n", + " (SelfAttention): MT5Attention(\n", + " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (k): Linear(\n", + " (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", + " )\n", + " (v): Linear(\n", + " (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", + " )\n", + " (o): Linear(in_features=1024, out_features=1024, bias=False)\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (1): MT5LayerCrossAttention(\n", + " (EncDecAttention): MT5Attention(\n", + " (q): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (k): Linear(\n", + " (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", + " )\n", + " (v): Linear(\n", + " (base_layer): Linear(in_features=1024, out_features=1024, bias=False)\n", + " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 1024x1])\n", + " )\n", + " (o): 
Linear(in_features=1024, out_features=1024, bias=False)\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (2): MT5LayerFF(\n", + " (DenseReluDense): MT5DenseGatedActDense(\n", + " (wi_0): Linear(in_features=1024, out_features=2816, bias=False)\n", + " (wi_1): Linear(\n", + " (base_layer): Linear(in_features=1024, out_features=2816, bias=False)\n", + " (ia3_l): ParameterDict( (default): Parameter containing: [torch.FloatTensor of size 2816x1])\n", + " )\n", + " (wo): Linear(in_features=2816, out_features=1024, bias=False)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " (act): NewGELUActivation()\n", + " )\n", + " (layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (final_layer_norm): MT5LayerNorm()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lm_head): Linear(in_features=1024, out_features=250112, bias=False)\n", + " )\n", + " )\n", + ")" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = get_peft_model(model, peft_config)\n", + "model.print_trainable_parameters()\n", + "model" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "155b8728", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 140, + "referenced_widgets": [ + "bbfb7533b5ca459194e171df56b79566", + "c894e8237aa34c56bb250acab1466005", + "a5a126b229064812bf3dcb228118be50", + "661e1b29c59a4295b594edfa4f50ff87", + "1bcba805972b484d8b6aa6542c81841c", + "e71f5c7f1d5d4f83b58c68d2fa310d9c", + "6a567e0a1a5447519c5df10e777520cf", + "7aeca19b84904906a04c12659f84ff9e", + "dd4b895874ce46ceb1ad0d9bc973f98f", + "b138f91be7f94008806eaf0a6988bc3f", + "da14180f51ab44b48470cb9ea74d3864", + "9e12d97af6124a5a8c6627708b300c1e", + "faa18df899c14e9cac6721253e6c9128", + "79d0ede7a5b24756aa6d34fda8c29159", + "3b175b452f4347558aa3c4501cc90030", + 
"fc4637a1b37e4e90874c71aa4271ac74", + "1b8aada826a0451bb60c418b19178c8c", + "a91916e02e9c424e881e45b3aa978574", + "ca509bd409624c998e555c9a779b8aae", + "9c890fc422954347b86d3bde7a421caf", + "6f9453484ea94587a64d70f1b3a1f6e4", + "48770ef159f44c01be2a75c75aecd80f", + "0c561dab67914ea9b6e1aab803600551", + "1e021a1954b44d69a90101a96c360661", + "013e3343285f437a893bdd673fb90e22", + "28802da68fb04d70b1c6bc511a04676f", + "94174da0d6554be087d4527bea5b511a", + "dc8ab16a1e6c4e6893c95ccd16568f9a", + "72383136663448d89cf3b82b87cbb392", + "5b1bdaf16cbc473081e4237f839167b9", + "51f8fb45485540bb985b606d43ae04ea", + "f760cd4758334ca9a43fd15612fd808b", + "f60e9915d2a74ca7bc010d7684f5acf6" + ] + }, + "id": "4ee2babf", + "outputId": "3c413083-247d-47da-f25c-032764be0beb" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using the latest cached version of the dataset since financial_phrasebank couldn't be found on the Hugging Face Hub\n", + "Found the latest cached dataset configuration 'sentences_allagree' at /root/.cache/huggingface/datasets/financial_phrasebank/sentences_allagree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141 (last modified on Thu Jul 31 03:15:41 2025).\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "43b03e9b6de94bf0921228482d7be1e5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map: 0%| | 0/2037 [00:00 Tensor(a!)\n", + " registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6\n", + " dispatch key: XPU\n", + " previous kernel: registered at /pytorch/aten/src/ATen/VmapModeRegistrations.cpp:37\n", + " new kernel: registered at /build/intel-pytorch-extension/build/Release/csrc/gpu/csrc/gpu/xpu/ATen/RegisterXPU_0.cpp:172 (function operator())\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-07-31 07:06:51,984] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to xpu (auto 
detect)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/bin/ld: cannot find -laio: No such file or directory\n", + "collect2: error: ld returned 1 exit status\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-07-31 07:06:52,955] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [1275/1275 03:31, Epoch 5/5]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EpochTraining LossValidation LossAccuracy
12.1699000.5071560.621145
20.5377000.4309960.651982
30.4822000.4267180.696035
40.4597000.4708940.682819
50.4360000.4096040.718062

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=1275, training_loss=0.8170911183076747, metrics={'train_runtime': 213.5513, 'train_samples_per_second': 47.693, 'train_steps_per_second': 5.97, 'total_flos': 344546979840000.0, 'train_loss': 0.8170911183076747, 'epoch': 5.0})" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# training and evaluation\n", + "\n", + "\n", + "def compute_metrics(eval_preds):\n", + " preds, labels = eval_preds\n", + " preds = tokenizer.batch_decode(preds, skip_special_tokens=True)\n", + " labels = tokenizer.batch_decode(labels, skip_special_tokens=True)\n", + "\n", + " correct = 0\n", + " total = 0\n", + " for pred, true in zip(preds, labels):\n", + " if pred.strip() == true.strip():\n", + " correct += 1\n", + " total += 1\n", + " accuracy = correct / total\n", + " return {\"accuracy\": accuracy}\n", + "\n", + "\n", + "training_args = Seq2SeqTrainingArguments(\n", + " \"out\",\n", + " per_device_train_batch_size=batch_size,\n", + " learning_rate=lr,\n", + " num_train_epochs=num_epochs,\n", + " eval_strategy=\"epoch\",\n", + " logging_strategy=\"epoch\",\n", + " save_strategy=\"no\",\n", + " report_to=[],\n", + " predict_with_generate=True,\n", + " generation_config=GenerationConfig(max_length=max_length),\n", + ")\n", + "trainer = Seq2SeqTrainer(\n", + " model=model,\n", + " processing_class=tokenizer,\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " data_collator=default_data_collator,\n", + " compute_metrics=compute_metrics,\n", + ")\n", + "trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a8de6005", + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-30T09:53:13.045146Z", + "start_time": "2023-05-30T09:53:13.035612Z" + } + }, + "outputs": [], + "source": [ + "# saving 
model\n", + "peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n", + "model.save_pretrained(peft_model_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd20cd4c", + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-30T09:53:15.240763Z", + "start_time": "2023-05-30T09:53:15.059304Z" + } + }, + "outputs": [], + "source": [ + "ckpt = f\"{peft_model_id}/adapter_model.safetensors\"\n", + "!du -h $ckpt" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "76c2fc29", + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-30T09:53:25.055105Z", + "start_time": "2023-05-30T09:53:17.797989Z" + } + }, + "outputs": [], + "source": [ + "from peft import PeftModel, PeftConfig\n", + "\n", + "peft_model_id = f\"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n", + "\n", + "config = PeftConfig.from_pretrained(peft_model_id)\n", + "model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)\n", + "model = PeftModel.from_pretrained(model, peft_model_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d997f1cc", + "metadata": { + "ExecuteTime": { + "end_time": "2023-05-30T09:53:26.777030Z", + "start_time": "2023-05-30T09:53:26.013697Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EPS grew to 0.04 eur from 0.02 eur .\n", + "{'input_ids': tensor([[ 3, 24935, 3, 4774, 12, 4097, 6348, 3, 1238, 45,\n", + " 4097, 4305, 3, 1238, 3, 5, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n", + "tensor([[ 0, 1465, 1]])\n", + "['positive']\n" + ] + } + ], + "source": [ + "model.eval()\n", + "i = 107\n", + "inputs = tokenizer(dataset[\"validation\"][text_column][i], return_tensors=\"pt\")\n", + "print(dataset[\"validation\"][text_column][i])\n", + "print(inputs)\n", + "\n", + "with torch.no_grad():\n", + " outputs = model.generate(input_ids=inputs[\"input_ids\"], 
max_new_tokens=10)\n", + " print(outputs)\n", + " print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb746c1e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + }, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/conditional_generation/requirements.txt b/peft/examples/conditional_generation/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..9571ec3501248ebd05f2442a9a6d7ba6555f22bf --- /dev/null +++ b/peft/examples/conditional_generation/requirements.txt @@ -0,0 +1,8 
@@ +transformers +accelerate +evaluate +deepspeed +tqdm +datasets +safetensors +scikit-learn \ No newline at end of file diff --git a/peft/examples/corda_finetuning/README.md b/peft/examples/corda_finetuning/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d39d2cc09e13a1b7988fbbb8ae905ab598f60da6 --- /dev/null +++ b/peft/examples/corda_finetuning/README.md @@ -0,0 +1,257 @@ +# CorDA: Context-Oriented Decomposition Adaptation of Large Language Models for Task-Aware Parameter-Efficient Fine-tuning + +## Introduction + + +Existing PEFT methods are mostly agnostic of the context of a task of concern, e.g., a downstream task to learn or some pre-trained world knowledge to maintain. +[CorDA](https://openreview.net/pdf?id=Gi00NVru6n) builds task-aware LoRA adapters from weight decomposition oriented by the context of the task concerned. + +Concretely, CorDA randomly collects a few (by default 256 in our `preprocess.py`) data samples from a target task, e.g. questions from a QA dataset or instructions to write a code or solve a math problem, and feeds these samples into a pre-trained LLM. We can obtain the covariance matrix of the input activation of each linear layer, i.e., $C=XX^T\in\mathcal{R}^{d_{in}\times d_{in}}$. +We then perform singular value decomposition (SVD) for the weight $W\in \mathcal{R}^{d_{out}\times d_{in}}$ multiplied by the covariance matrix, i.e., $\verb|SVD|(WC) = U\Sigma V^T$. In this way, the context expressed by these representative covariance matrices is able to orientate the decomposition, such that the principal components (the singular vectors with the largest singular values) are most associated with the task of concern (please refer to Fig.2 of our paper for the advantage of our decomposition over the plain SVD). 
To ensure the same inference result as the pre-trained model at the start of adaptation, we multiply the inverse of these covariance matrices with the decomposed components, i.e., $\hat{W}=U\Sigma V^T C^{-1}$. + +Thanks to the task-awareness, you can choose how to utilize the task-specific principal components. For example, if you want to adapt a model to a new task without losing the knowledge of a question-answering dataset, e.g., TriviaQA and NQopen, you can sample questions from this dataset to collect covariance matrices, and keep the principal components frozen because they compact the ability of this dataset, while using the lowest components with the smallest $r$ singular values to initialize the learnable LoRA adapters. This is achieved by the **knowledge-preserved mode (KPM)** of CorDA, which learns new tasks effectively while keeping the world knowledge you are concerned about as sound as possible. Alternatively, when your primary objective is to maximize performance on the fine-tuning task, disregarding the preservation of world knowledge, the **instruction-previewed mode (IPM)** will be favored. In this mode, CorDA uses the instruction and response from the fine-tuning task (e.g., Math or Code) to produce the covariance matrices. The principal components with the largest $r$ singular values, capturing the characteristics of the fine-tuning task in advance, can better adapt to the new ability, so they are used to initialize the LoRA adapters, with the remaining components frozen. IPM can further accelerate convergence to enhance the fine-tuning performance on downstream tasks. 
+ + +The implementations of KPM and IPM are compared as follows: + +| Mode | Collect covariance from | LoRA $A$ | LoRA $B$ | +|---|---|---|---| +|KPM | questions from the knowledge benchmark to maintain | $A=\sqrt{\Sigma}\_{[-r:]}(V^T C^{-1})\_{[-r:,:]}$ | $B=U_{[:,-r:]}\sqrt{\Sigma}_{[-r:]}$ | +|IPM | instructions and responses from the downstream task to learn | $A= \sqrt{\Sigma}\_{[:r]} (V^T C^{-1})\_{[:r,:]}$ | $B =U_{[:,:r]} \sqrt{\Sigma}_{[:r]}$ | + +### Comparison with alternative methods + +The distinctions between CorDA and other similar LoRA initialization methods are summarized as follows: + +| Method | Initialization for | SVD on | Data-driven | Supports knowledge maintenance | +| - | - | - | - | - | +| PiSSA | $A$ and $B$ | weights | no | no | +| EVA | $A$ | activations | yes | no | +|CorDA | $A$ and $B$ | weights (oriented by covariance) | yes | yes | + +"Supports knowledge maintenance" denotes the ability to explicitly associate a knowledge benchmark with some components of the pre-trained weights after decomposition, and to keep these components frozen during fine-tuning. + +### Some Results + +- Performance with knowledge-preserved mode (sample from NQopen, fine-tune on Math) + +| Method | Model | NQ open | GSM8k | Math | Avg. 
| +|---|---|---|---|---|---| +|Pre-trained|Llama-2-7b| 14.99 | -| - | - | +|LoRA|Llama-2-7b|1.27| 42.68 | 5.88 | 16.61 | +|**CorDA (KPM)** |Llama-2-7b| **8.20** | **46.32** | **7.00** | **20.51** | +|Pre-trained|Llama-2-13b|23.63|-|-|-| +|LoRA|Llama-2-13b| 16.26 | 57.24 | 8.92 | 27.47 | +|**CorDA (KPM)** |Llama-2-13b| **19.86** | **59.29** | **9.62** | **29.59** | +|Pre-trained|Llama-3-8b|13.41|-|-|-| +|LoRA|Llama-3-8b| 8.75 | 72.33 | 24.04| 35.04 | +|**CorDA (KPM)** |Llama-3-8b| **9.61** | **74.68** | **25.34** | **36.54** | +|Pre-trained|Gemma-2-9b|12.85|-|-|-| +|LoRA|Gemma-2-9b| 9.28 | 83.47 | 42.30| 45.02 | +|**CorDA (KPM)** |Gemma-2-9b|**10.17** | **84.08** | **42.64** | **45.63** | + +- Performance with instruction-previewed mode (sample from Math, fine-tune on Math) + +| Method | Model | GSM8k | Math | +| --- | --- | --- | ---| +|LoRA| Llama-2-7b | 42.68 | 5.88 | +|PiSSA | Llama-2-7b | 51.63 | 7.32 | +| **CorDA (IPM)** | Llama-2-7b | **53.45** | **8.64** | +|LoRA| Llama-2-13b | 57.24 | 8.92 | +|PiSSA | Llama-2-13b |60.88 | 11.08| +| **CorDA (IPM)** | Llama-2-13b | **62.47** |**11.54** | +|LoRA| Gemma-2-9b | 83.47 | 42.30 | +|PiSSA | Gemma-2-9b | 84.23 | 43.52| +| **CorDA (IPM)** | Gemma-2-9b | **84.45** | **43.88** | + + +## Quick Start + +### Knowledge-preserved adaptation mode + +```py +import torch +from peft import LoraConfig, get_peft_model +from transformers import AutoTokenizer, AutoModelForCausalLM +from peft.tuners.lora.config import CordaConfig +from peft.tuners.lora.corda import preprocess_corda +from trl import SFTConfig, SFTTrainer +from datasets import load_dataset + +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") +tokenizer.pad_token_id = tokenizer.eos_token_id +sampled_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:256]") +dataset = load_dataset("imdb", split="train[:256]") + + 
+def run_model(): + for batch in sampled_dataset: + input_ids = batch["text"] + input_ids = input_ids.to(model.device) + with torch.no_grad(): + model(input_ids) + + +corda_config = CordaConfig( + corda_method="kpm", +) +lora_config = LoraConfig( + init_lora_weights="corda", + corda_config=corda_config, +) + +# Call `preprocess_corda` first to collect covariance matrix and build SVD result for model +# For more details, please refer to documentation of `preprocess_corda` +preprocess_corda(model, lora_config, run_model=run_model) + +# Call `get_peft_model` after preprocessing, or else you'll encounter error +peft_model = get_peft_model(model, lora_config) +peft_model.print_trainable_parameters() + +training_args = SFTConfig(dataset_text_field="text", max_seq_length=128) +trainer = SFTTrainer( + model=peft_model, + args=training_args, + train_dataset=dataset, + processing_class=tokenizer, +) +trainer.train() +peft_model.save_pretrained("corda-llama-2-7b") +``` + +### Instruction-previewed adaptation mode + +```py +# Get model and dataset identically as KPM... + +# Different from KPM, we run the model on dataset of the downstream task to collect covariance matrices +def run_model(): + for batch in dataset: + input_ids = batch["text"] + input_ids = input_ids.to(model.device) + with torch.no_grad(): + model(input_ids) + +# Different from KPM, we set `corda_method` to `"ipm"` +corda_config = CordaConfig( + corda_method="ipm", +) + +# The rest of training process is identical to KPM... +``` + +## Advanced Usage + +### Preprocessing + +`preprocess.py`: This script builds CorDA adapters for a model, and saves the adapters initial weights and residual model weights to a specified directory. 
Example usage: + +#### Knowledge-preserved adaptation mode + +```bash +export CUDA_VISIBLE_DEVICES=0 # force to use device 0 of CUDA GPU +export ZE_AFFINITY_MASK=0 # force to use device 0 of Intel XPU + +python -u preprocess.py --model_id="meta-llama/Llama-2-7b-hf" \ + --r 128 --seed 233 \ + --save_model --save_path {path_to_residual_model} \ + --calib_dataset "nqopen" +``` +Arguments: + +- `--model_id` is the pre-trained model for decomposition. +- `--r` is the low rank of LoRA, e.g. 128. +- `--calib_dataset` specifies the dataset to sample data to obtain covariance matrices. KPA mode uses QA datasets such as `"nqopen"`, `"traivia_qa"`, or other choices. +- `--save_model` saves the initialized model in `--save_path`. + +#### Instruction-previewed adaptation mode + +```bash +export CUDA_VISIBLE_DEVICES=0 # force to use device 0 of CUDA GPU +export ZE_AFFINITY_MASK=0 # force to use device 0 of Intel XPU + +python -u preprocess.py --model_id="meta-llama/Llama-2-7b-hf" \ + --r 128 --seed 233 \ + --save_model --save_path {path_to_residual_model} \ + --first_eigen --calib_dataset "MetaMATH" +``` + +Arguments: + +- `--first_eigen` uses the largest $r$ singular values and vectors to initialize the learnable adapter for the instruction-previewed adaptation mode. +- `--calib_dataset` specifies the dataset to sample data to obtain covariance matrices. Instruction-previewed mode uses the downstream task dataset you are learning, such as `"MetaMATH"`, `"codefeedback"`, `"WizLMinstruct"`, `"alpaca"`, or other choices. + +#### Note about memory consumption + +The process of collecting covariance matrices is performed in `torch.float32` by default. If you would like to reduce the memory consumption of preprocessing, you can specify `use_float16_for_covariance=True` in `CordaConfig` to collect covariance matrices in `torch.float16`. 
However, in a few cases this may cause numerical instability, such that the initialized model no longer produces exactly the same inference result as the original model. It is therefore recommended to verify this, e.g., by comparing the Wiki/PTB perplexity before and after preprocessing, if you choose to collect covariance matrices in `torch.float16`. + +### Fine-tuning + +`corda_finetuning.py`: This script fine-tunes the preprocessed model built above on a downstream task. + +Example usage: + +```bash +python corda_finetuning.py \ + --model_name_or_path {path_to_residual_model} \ + --output_dir {path_to_output_model} \ + --corda_mode True \ + --data_path meta-math/MetaMathQA \ + --dataset_split "train[:100000]" \ + --dataset_field query response \ + --num_train_epochs 1 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 32 \ + --save_strategy "steps" \ + --save_steps 100 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --bf16 True \ + --tf32 True \ + --report_to none +``` + +### Convert CorDA to LoRA + +The main advantage of CorDA is concentrated during the training phase. For a trained CorDA adapter, we recommend converting it to an equivalent LoRA adapter for use and sharing. + +```python +# The fine-tuned matrices $A$ and $B$ in the CorDA adapter are saved and should be combined with the residual model. +peft_model.save_pretrained(output_dir) +# Given the matrices $A_0$ and $B_0$, initialized by CorDA and untrained, and the trained matrices $A$ and $B$, +# we can convert these to LoRA by setting $\Delta W = A \times B - A_0 \times B_0 = [A \mid A_0] \times [B \mid -B_0]^T = A'B'$. 
+peft_model.save_pretrained(output_dir, path_initial_model_for_weight_conversion="corda_init") +``` + +This conversion enables the loading of LoRA on top of a standard base model: + +```python +import torch +from peft import PeftModel +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto" +) +# No SVD is performed during this step, and the base model remains unaltered. +peft_model = PeftModel.from_pretrained(model, "corda-llama-2-7b-lora") +``` + +Utilizing the converted LoRA does not require modifying the parameters of the base model. When multiple converted LoRAs are needed simultaneously, each adapter operates independently without interference, allowing for the adapters to be freely deleted or added. + +Note that this conversion is not supported if `rslora` is used in combination with `rank_pattern` or `alpha_pattern`. + +## Citation +``` +@inproceedings{yangcorda, + title={CorDA: Context-Oriented Decomposition Adaptation of Large Language Models for Task-Aware Parameter-Efficient Fine-tuning}, + author={Yang, Yibo and Li, Xiaojie and Zhou, Zhongzhu and Song, Shuaiwen Leon and Wu, Jianlong and Nie, Liqiang and Ghanem, Bernard}, + booktitle={The Thirty-eighth Annual Conference on Neural Information Processing Systems}, + year={2024}, +} +``` diff --git a/peft/examples/corda_finetuning/corda_finetuning.py b/peft/examples/corda_finetuning/corda_finetuning.py new file mode 100644 index 0000000000000000000000000000000000000000..42b80ee48e723aafa722126fdbd2e5dcfa0fd08f --- /dev/null +++ b/peft/examples/corda_finetuning/corda_finetuning.py @@ -0,0 +1,276 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os +from collections.abc import Sequence +from dataclasses import dataclass, field +from typing import Optional + +import torch +import transformers +from datasets import load_dataset +from transformers import Trainer + +from peft import LoraConfig, PeftModel, get_peft_model + + +IGNORE_INDEX = -100 + +PROMPT = ( + "Below is an instruction that describes a task. " + "Write a response that appropriately completes the request.\n\n" + "### Instruction:\n{instruction}\n\n### Response:" +) + + +def get_nb_trainable_parameters(model) -> tuple[int, int]: + r""" + Returns the number of trainable parameters and the number of all parameters in the model. 
+ """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + num_params = param.numel() + # if using DS Zero 3 and the weights are initialized empty + if num_params == 0 and hasattr(param, "ds_numel"): + num_params = param.ds_numel + + # Due to the design of 4bit linear layers from bitsandbytes + # one needs to multiply the number of parameters by 2 to get + # the correct number of parameters + if param.__class__.__name__ == "Params4bit": + num_bytes = param.quant_storage.itemsize if hasattr(param, "quant_storage") else 1 + num_params = num_params * 2 * num_bytes + + all_param += num_params + if param.requires_grad: + trainable_params += num_params + + return trainable_params, all_param + + +@dataclass +class TrainingArguments(transformers.TrainingArguments): + model_name_or_path: Optional[str] = field(default="facebook/opt-125m") + data_path: str = field(default=None, metadata={"help": "Path to the training data."}) + dataset_split: str = field(default="train[:100000]", metadata={"help": "(`['train', 'test', 'eval']`):"}) + dataset_field: list[str] = field(default=None, metadata={"help": "Fields of dataset input and output."}) + dataloader_num_proc: int = field(default=16, metadata={"help": "Number of processes to load dataset"}) + dataloader_batch_size: int = field( + default=3000, + metadata={ + "help": "batch size to load dataset. To set the batch size for training, you should pass --batch_size argument instead." + }, + ) + optim: str = field(default="adamw_torch") + model_max_length: int = field( + default=512, + metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."}, + ) + lora_r: int = field( + default=None, + metadata={"help": "The rank of LoRA adapter. 
When passing `None`, CorDA or full fine-tuning is used."}, + ) + corda_mode: bool = field(default=True, metadata={"help": "True for CorDA mode"}) + + +def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str): + """Collects the state dict and dump to disk.""" + state_dict = trainer.model.state_dict() + if trainer.args.should_save: + cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()} + del state_dict + trainer._save(output_dir, state_dict=cpu_state_dict) # noqa + + +def smart_tokenizer_and_embedding_resize( + special_tokens_dict: dict, + tokenizer: transformers.PreTrainedTokenizer, + model: transformers.PreTrainedModel, +): + """Resize tokenizer and embedding. + + Note: This is the unoptimized version that may make your embedding size not be divisible by 64. + """ + num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) + model.resize_token_embeddings(len(tokenizer)) + + if num_new_tokens > 0: + input_embeddings = model.get_input_embeddings().weight.data + output_embeddings = model.get_output_embeddings().weight.data + + input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True) + output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True) + + input_embeddings[-num_new_tokens:] = input_embeddings_avg + output_embeddings[-num_new_tokens:] = output_embeddings_avg + + +def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> dict: + """Tokenize a list of strings.""" + tokenized_list = [ + tokenizer( + text, + return_tensors="pt", + padding="longest", + max_length=tokenizer.model_max_length, + truncation=True, + ) + for text in strings + ] + input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list] + input_ids_lens = labels_lens = [ + tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list + ] + return { + "input_ids": input_ids, + "labels": labels, + "input_ids_lens": 
input_ids_lens, + "labels_lens": labels_lens, + } + + +def preprocess( + sources: Sequence[str], + targets: Sequence[str], + tokenizer: transformers.PreTrainedTokenizer, +) -> dict: + """Preprocess the data by tokenizing.""" + examples = [s + t for s, t in zip(sources, targets)] + examples_tokenized, sources_tokenized = (_tokenize_fn(strings, tokenizer) for strings in (examples, sources)) + input_ids = examples_tokenized["input_ids"] + labels = copy.deepcopy(input_ids) + for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]): + label[:source_len] = IGNORE_INDEX + return { + "input_ids": input_ids, + "labels": labels, + } + + +@dataclass +class DataCollatorForSupervisedDataset: + """Collate examples for supervised fine-tuning.""" + + tokenizer: transformers.PreTrainedTokenizer + + def __call__(self, instances: Sequence[dict]) -> dict[str, torch.Tensor]: + input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels")) + input_ids = [torch.tensor(x) for x in input_ids] + input_ids = torch.nn.utils.rnn.pad_sequence( + input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id + ) + labels = [torch.tensor(x) for x in labels] + labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX) + return { + "input_ids": input_ids, + "labels": labels, + "attention_mask": input_ids.ne(self.tokenizer.pad_token_id), + } + + +def train_tokenize_function(examples, tokenizer, query, response): + sources = [ + PROMPT.format_map( + { + "instruction": instruction, + } + ) + for instruction in examples[query] + ] + targets = [f"{output}{tokenizer.eos_token}" for output in examples[response]] + data_dict = preprocess(sources, targets, tokenizer) + return data_dict + + +def train(): + parser = transformers.HfArgumentParser(TrainingArguments) + script_args = parser.parse_args_into_dataclasses()[0] + print(script_args) + + if script_args.corda_mode: + print("Train in CorDA mode") + 
res_model = transformers.AutoModelForCausalLM.from_pretrained( + script_args.model_name_or_path, + device_map="auto", + ) + model = PeftModel.from_pretrained( + res_model, script_args.model_name_or_path, subfolder="corda_init", is_trainable=True + ) + elif script_args.lora_r is not None: + print("Train in LoRA mode") + model = transformers.AutoModelForCausalLM.from_pretrained( + script_args.model_name_or_path, + device_map="auto", + ) + lora_config = LoraConfig( + r=script_args.lora_r, + lora_alpha=script_args.lora_r, + init_lora_weights=True, # script_args.init_lora_weights, + target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"], + lora_dropout=0, + bias="none", + task_type="CAUSAL_LM", + ) + model = get_peft_model(model, lora_config) + else: + print("Train in Full Finetuning mode") + model = transformers.AutoModelForCausalLM.from_pretrained( + script_args.model_name_or_path, + torch_dtype=torch.bfloat16, + device_map="auto", + ) + trainable_params, all_param = get_nb_trainable_parameters(model) + print( + f"trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param}" + ) + tokenizer = transformers.AutoTokenizer.from_pretrained( + script_args.model_name_or_path, + model_max_length=script_args.model_max_length, + padding_side="right", + use_fast=True, + trust_remote_code=True, + ) + tokenizer.pad_token_id = tokenizer.eos_token_id + + raw_train_datasets = load_dataset(script_args.data_path, split=script_args.dataset_split) + train_dataset = raw_train_datasets.map( + train_tokenize_function, + batched=True, + batch_size=script_args.dataloader_batch_size, + num_proc=script_args.dataloader_num_proc, + remove_columns=raw_train_datasets.column_names, + load_from_cache_file=True, + desc="Running tokenizer on train dataset", + fn_kwargs={ + "tokenizer": tokenizer, + "query": script_args.dataset_field[0], + "response": script_args.dataset_field[1], + }, + ) + + 
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) + data_module = { + "train_dataset": train_dataset, + "data_collator": data_collator, + } + trainer = Trainer(model=model, processing_class=tokenizer, args=script_args, **data_module) + trainer.train() + trainer.save_state() + model.save_pretrained(os.path.join(script_args.output_dir, "ft")) + + +if __name__ == "__main__": + train() diff --git a/peft/examples/corda_finetuning/datautils.py b/peft/examples/corda_finetuning/datautils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e612b07af57f57023e8b1a319f38047e2e7c54d --- /dev/null +++ b/peft/examples/corda_finetuning/datautils.py @@ -0,0 +1,235 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import random + +import numpy as np +import torch +from datasets import load_dataset + + +""" +doc https://huggingface.co/docs/datasets/loading +doc https://huggingface.co/docs/datasets/process +doc https://huggingface.co/blog/llama2#how-to-prompt-llama-2 +""" + + +def set_seed(seed): + np.random.seed(seed) + torch.random.manual_seed(seed) + + +def sample_train_loaders(name, tokenizer, nsamples=128, seed=0, seqlen=2048): + set_seed(seed) + if "wikitext2" in name: + traindata = load_dataset( + "wikitext", + "wikitext-2-raw-v1", + split="train", + ) + traindata = "\n\n".join(traindata["text"]) + elif "c4" in name: + traindata = load_dataset( + "allenai/c4", + "allenai--c4", + data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, + split="train", + ) + traindata = "\n\n".join(traindata["text"]) + else: + raise NotImplementedError + + trainloader = [] + for _ in range(nsamples): + i = random.randint(0, len(traindata) - seqlen * 2 - 1) + j = i + seqlen * 2 + # breakpoint() + trainenc = tokenizer(traindata[i:j], return_tensors="pt") + inp = trainenc.input_ids[:, :seqlen] + trainloader.append(inp) + return trainloader + + +def get_redpajama_train(tokenizer, percent=10, seed=3, batch_size=128, max_length=2048): + def tokenization(example): + return tokenizer(example["text"], truncation=True, max_length=max_length) + + if percent != 100: + split = f"train[:{int(850000 * percent / 100)}]" + else: + split = "train" + dataset = load_dataset("togethercomputer/RedPajama-Data-1T-Sample", split=split) + + processed_dataset = dataset.map(tokenization, batched=True, batch_size=batch_size, num_proc=os.cpu_count()) + return processed_dataset + + +def get_english_quote(dataset_name, tokenizer): + data = load_dataset(dataset_name) + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + return data["train"] + + +def get_qat_dataset(name, tokenizer, data_percent): + if name == "red_pajama": + data = get_redpajama_train(tokenizer, data_percent) + 
+ elif name == "Abirate/english_quotes": + data = get_english_quote(name, tokenizer) + else: + raise NotImplementedError + data = data.shuffle() + return data + + +llama_chat_format = """[INST] <> +"Below is an instruction that describes a task. Write a response that appropriately completes the request." +<> + +{instruction} [/INST] {response} +""" + + +def get_calib_data(name, tokenizer, model_id, nsamples, seqlen=2048, seed=3): + print(f" get_data_from: {name}, nsamples={nsamples}, seqlen={seqlen}, {seed}") + cache_file = f"cache/{name}_{model_id.replace('/', '_')}_{nsamples}_{seqlen}_{seed}.pt" + traindataset = [] + if not os.path.exists("cache"): + os.makedirs("cache") + if os.path.exists(cache_file): + print(f"found data file: {cache_file}") + traindataset = torch.load(cache_file) + print("loaded ...") + return traindataset + if name == "c4": + traindata = load_dataset( + "allenai/c4", + "allenai--c4", + data_files={"train": "en/c4-train.00000-of-01024.json.gz"}, + split="train", + ) + tot_text = "\n\n".join(traindata["text"]) + elif name == "wikitext2": + traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train") + tot_text = "\n\n".join(traindata["text"]) + elif name == "ptb": + traindata = load_dataset( + "ptb_text_only", + "penn_treebank", + split="train", + ) + tot_text = "\n\n".join(traindata["sentence"]) + elif name == "traivia_qa": + traindata = load_dataset("trivia_qa", "rc", split="train") + tot_text = "\n\n".join(traindata["question"]) + elif name == "nqopen": + traindata = load_dataset("nq_open", split="train") + tot_text = "\n\n".join(traindata["question"]) + elif name == "alpaca": + selected_data_dict = load_dataset("iboing/alpaca_data", split="train").shuffle(seed=seed).take(nsamples) + for example in selected_data_dict: + if example.get("input", "") == "": + s = llama_chat_format.format(instruction=example["instruction"], response=example["output"]) + trainenc = tokenizer(s, return_tensors="pt") + inp = trainenc.input_ids[:, 
:seqlen] + attention_mask = torch.ones_like(inp) + traindataset.append({"input_ids": inp, "attention_mask": attention_mask}) + print("example instruction:", s) + torch.save(traindataset, cache_file) + return traindataset + elif name == "MetaMATH": + selected_data_dict = load_dataset("iboing/MetaMathQA-395K", split="train").shuffle(seed=seed).take(nsamples) + for example in selected_data_dict: + if example.get("input", "") == "": + s = llama_chat_format.format(instruction=example["query"], response=example["response"]) + trainenc = tokenizer(s, return_tensors="pt") + inp = trainenc.input_ids[:, :seqlen] + attention_mask = torch.ones_like(inp) + traindataset.append({"input_ids": inp, "attention_mask": attention_mask}) + print("example instruction:", s) + torch.save(traindataset, cache_file) + return traindataset + elif name == "codefeedback": + selected_data_dict = ( + load_dataset("iboing/CodeFeedback-Filtered-Instruction", split="train").shuffle(seed=seed).take(nsamples) + ) + for example in selected_data_dict: + if example.get("input", "") == "": + s = llama_chat_format.format(instruction=example["query"], response=example["answer"]) + trainenc = tokenizer(s, return_tensors="pt") + inp = trainenc.input_ids[:, :seqlen] + attention_mask = torch.ones_like(inp) + traindataset.append({"input_ids": inp, "attention_mask": attention_mask}) + print("example instruction:", s) + torch.save(traindataset, cache_file) + return traindataset + elif name == "WizLMinstruct": + selected_data_dict = ( + load_dataset("iboing/WizardLM_evol_instruct_V2_143k", split="train").shuffle(seed=seed).take(nsamples) + ) + for example in selected_data_dict: + if example.get("input", "") == "": + s = llama_chat_format.format( + instruction=example["conversation"][0]["human"], response=example["conversation"][0]["assistant"] + ) + trainenc = tokenizer(s, return_tensors="pt") + inp = trainenc.input_ids[:, :seqlen] + attention_mask = torch.ones_like(inp) + traindataset.append({"input_ids": inp, 
"attention_mask": attention_mask}) + print("example instruction:", s) + torch.save(traindataset, cache_file) + return traindataset + else: + raise NotImplementedError + print(f"tot_text={len(tot_text)}") + for _ in range(nsamples): + i = random.randint(0, len(tot_text) - seqlen - 1) + j = i + seqlen * 10 + trainenc = tokenizer(tot_text[i:j], return_tensors="pt") + inp = trainenc.input_ids[:, :seqlen] + attention_mask = torch.ones_like(inp) + traindataset.append({"input_ids": inp, "attention_mask": attention_mask}) + torch.save(traindataset, cache_file) + return traindataset + + +def get_eval_loaders(name, tokenizer): + if "wikitext2" in name: + testdata = load_dataset( + "wikitext", + "wikitext-2-raw-v1", + split="test", + ) + testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt") + return testenc + if "ptb" in name: + valdata = load_dataset( + "ptb_text_only", + "penn_treebank", + split="validation", + ) + testenc = tokenizer("\n\n".join(valdata["sentence"]), return_tensors="pt") + return testenc + if "c4" in name: + testdata = load_dataset( + "allenai/c4", + "allenai--c4", + data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, + split="validation", + ) + testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt") + return testenc + raise NotImplementedError diff --git a/peft/examples/corda_finetuning/preprocess.py b/peft/examples/corda_finetuning/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..765242f15e46ff08939702dea4f3b0e73ba93a2c --- /dev/null +++ b/peft/examples/corda_finetuning/preprocess.py @@ -0,0 +1,165 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +import numpy as np +import torch +from datautils import get_calib_data +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoTokenizer + +from peft import get_peft_model +from peft.tuners.lora.config import CordaConfig, LoraConfig +from peft.tuners.lora.corda import preprocess_corda + + +@torch.no_grad() +def run_model(model, calib_loader): + model.eval() + for batch in tqdm(calib_loader): + batch = {k: v.to(model.device) for k, v in batch.items()} + model(**batch) + + +def main(args): + # Setting random seed of numpy and torch + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(args.seed) + elif torch.xpu.is_available(): + torch.xpu.manual_seed_all(args.seed) + torch.use_deterministic_algorithms(True) + + # Load model + model_id = args.model_id + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + + model = AutoModelForCausalLM.from_pretrained( + model_id, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True + ) + + # Collect data + calib_loader = get_calib_data(args.calib_dataset, tokenizer, model_id, args.calib_loader_size, seed=args.seed) + + # Evaluate the original model + print("\n---- model before svd ---\n") + print(model) + + # Perform decomposition + corda_config = CordaConfig( + corda_method="ipm" if args.first_eigen else "kpm", + ) + lora_config = LoraConfig( + init_lora_weights="corda", + target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"], + 
r=args.r, + lora_alpha=args.r, + corda_config=corda_config, + ) + preprocess_corda( + model, + lora_config, + run_model=lambda: run_model(model, calib_loader), + ) + model = get_peft_model(model, lora_config) + + # Evaluate again to check if the model is consistent + # Using `model.model` here because `get_peft_model` wraps a layer to the model + print("\n---- model after svd ---\n") + print(model) + + # Save as hugging face model + if args.save_model: + assert args.save_path is not None + save_path = args.save_path + + # Save CorDA modules + model.peft_config["default"].init_lora_weights = True + model.save_pretrained(os.path.join(save_path, "corda_init")) + + # Save residual model + model = model.unload() + model.save_pretrained(save_path) + + # Save tokenizer + tokenizer.save_pretrained(save_path) + print(f"Done building CorDA huggingface model in {save_path}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_id", + type=str, + default="meta-llama/Llama-2-7b-hf", + help="Pretrained model ID", + ) + parser.add_argument( + "--calib_loader_size", + type=int, + default=256, + help="number of samples used for covariance matrices", + ) + parser.add_argument( + "--calib_dataset", + type=str, + default="wikitext2", + choices=[ + "wikitext2", + "c4", + "ptb", + "traivia_qa", + "nqopen", + "MetaMATH", + "codefeedback", + "WizLMinstruct", + "alpaca", + ], + help="calibration dataset", + ) + parser.add_argument( + "--eval_mmlu", + action="store_true", + help="evaluate mmlu", + ) + parser.add_argument( + "--seed", + type=int, + default=233, + help="random seed", + ) + parser.add_argument( + "--r", + type=int, + default=None, + ) + parser.add_argument( + "--first_eigen", + action="store_true", + ) + parser.add_argument( + "--save_model", + action="store_true", + ) + parser.add_argument( + "--save_path", + type=str, + default=None, + ) + args = parser.parse_args() + + main(args) diff --git 
a/peft/examples/cpt_finetuning/README.md b/peft/examples/cpt_finetuning/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2c84ce91c95e4480168fca7971d3cae188348fea --- /dev/null +++ b/peft/examples/cpt_finetuning/README.md @@ -0,0 +1,64 @@ + +# Context-aware Prompt Tuning: Advancing In-Context Learning with Adversarial Methods +## Introduction ([Paper](https://huggingface.co/papers/2410.17222), [Code](https://github.com/tsachiblau/Context-aware-Prompt-Tuning-Advancing-In-Context-Learning-with-Adversarial-Methods), [Notebook](cpt_train_and_inference.ipynb), [Colab](https://colab.research.google.com/drive/1UhQDVhZ9bDlSk1551SuJV8tIUmlIayta?usp=sharing)) + +> Large Language Models (LLMs) can perform few-shot learning using either optimization-based approaches or In-Context Learning (ICL). Optimization-based methods often suffer from overfitting, as they require updating a large number of parameters with limited data. In contrast, ICL avoids overfitting but typically underperforms compared to optimization-based methods and is highly sensitive to the selection, order, and format of demonstration examples. To overcome these challenges, we introduce Context-aware Prompt Tuning (CPT), a method inspired by ICL, Prompt Tuning (PT), and adversarial attacks. CPT builds on the ICL strategy of concatenating examples before the input, extending it by incorporating PT-like learning to refine the context embedding through iterative optimization, extracting deeper insights from the training examples. Our approach carefully modifies specific context tokens, considering the unique structure of the examples within the context. In addition to updating the context with PT-like optimization, CPT draws inspiration from adversarial attacks, adjusting the input based on the labels present in the context while preserving the inherent value of the user-provided data. 
To ensure robustness and stability during optimization, we employ a projected gradient descent algorithm, constraining token embeddings to remain close to their original values and safeguarding the quality of the context. Our method has demonstrated superior accuracy across multiple classification tasks using various LLM models, outperforming existing baselines and effectively addressing the overfitting challenge in few-shot learning. + + + +

+ +
+CPT optimizing only specific token embeddings while keeping the rest of the model frozen (image source). + +--- + +## Dataset Creation and Collation for CPT + +This document explains how to prepare datasets for CPT, linking the dataset preparation processes in the code to the methods and principles described in the CPT paper, specifically in **Sections 3.1**, **3.2**, and **3.3**. + +--- + +### Template-Based Tokenization + +#### The Role of Templates +Templates define the structure of the input-output pairs, enabling the model to interpret the task within a unified context. + +- **Input Templates**: + Templates like `"input: {sentence}"` structure raw input sentences. The `{sentence}` placeholder is replaced with the actual input text. + +- **Output Templates**: + Templates such as `"output: {label}"` format the labels (e.g., `positive`, `negative`, etc.). + +- **Separator Tokens**: + Separators distinguish different parts of the input, such as the input text and labels, as well as separate examples within the context. + + +#### How CPT Utilizes Context Structure + +CPT leverages the context structure, encoded within the `cpt_tokens_type_mask`, to optimize the context effectively. By treating different token types based on their roles, the model updates some tokens while using others solely for optimization: + +1. **Refrain from Updating Label Tokens**: + Some context tokens represent label tokens, which contain valuable, unmodifiable information. By excluding these tokens from updates during training, CPT ensures that the labels remain fixed, preserving their integrity. + +2. **Apply Type-Specific Projection Norms**: + CPT employs Projected Gradient Descent (PGD) to update context embeddings, applying tailored norms to different context parts. This approach reduces overfitting while maintaining robustness and generalization by preserving the integrity of user-provided examples.
+ + + +#### Limitations +CPT is designed for few-shot scenarios, as concatenating more examples increases memory usage due to the self-attention mechanism and additional loss terms. For larger datasets, users can limit the number of context examples and use the remaining samples solely for optimization to manage memory efficiently. + + + + +## Citation +```bib +@article{ + blau2025cpt, + title={Context-Aware Prompt Tuning: Advancing In-Context Learning with Adversarial Methods}, + author={Tsachi Blau and Moshe Kimhi and Yonatan Belinkov and Alexander Bronstein and Chaim Baskin}, + journal={arXiv preprint arXiv:2410.17222}, + year={2025} +} +``` diff --git a/peft/examples/cpt_finetuning/cpt_train_and_inference.ipynb b/peft/examples/cpt_finetuning/cpt_train_and_inference.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..0f3e13c090ad01d37751e5b5238df5868a3584c4 --- /dev/null +++ b/peft/examples/cpt_finetuning/cpt_train_and_inference.ipynb @@ -0,0 +1,1555 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# CPT Training and Inference\n", + "This notebook demonstrates the training and evaluation process of Context-Aware Prompt Tuning (CPT) using the Hugging Face Trainer. For more details, refer to the [Paper](https://huggingface.co/papers/2410.17222).\n", + "\n", + "\n", + "## Sections Overview:\n", + "1. **Setup**: Import libraries and configure the environment.\n", + "2. **Data Preparation**: Load and preprocess the dataset.\n", + "3. **Model Training**: Configure and train the model.\n", + "4. **Evaluation**: Test the model's performance and visualize results."
+ ], + "metadata": { + "id": "R_byvXT9lpTU" + }, + "id": "R_byvXT9lpTU" + }, + { + "cell_type": "markdown", + "source": [ + "# Setup\n", + "\n", + "---\n", + "\n", + "\n" + ], + "metadata": { + "collapsed": false, + "id": "11b07b07ac5e472b" + }, + "id": "11b07b07ac5e472b" + }, + { + "cell_type": "markdown", + "source": [ + "## Installation" + ], + "metadata": { + "id": "O8DWZb8ZrGRU" + }, + "id": "O8DWZb8ZrGRU" + }, + { + "cell_type": "code", + "source": [ + "!pip install datasets\n", + "!pip install git+https://github.com/huggingface/peft" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "d6KZ5REDrFiM", + "outputId": "e505bc0e-082a-4720-9117-b730d9fd67fa" + }, + "id": "d6KZ5REDrFiM", + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (3.1.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.16.1)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.26.4)\n", + "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (17.0.0)\n", + "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.3.8)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.2.2)\n", + "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.32.3)\n", + "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.6)\n", + "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets) (3.5.0)\n", + "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.10/dist-packages (from datasets) 
(0.70.16)\n", + "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2024.9.0)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.11.2)\n", + "Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.26.2)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.2)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.4.3)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (24.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.5.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.1.0)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (0.2.0)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.17.2)\n", + "Requirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.23.0->datasets) (4.12.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from 
requests>=2.32.2->datasets) (3.4.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2.2.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2024.8.30)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n", + "Collecting git+https://github.com/huggingface/peft\n", + " Cloning https://github.com/huggingface/peft to /tmp/pip-req-build-0mbyx_z_\n", + " Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft /tmp/pip-req-build-0mbyx_z_\n", + " Resolved https://github.com/huggingface/peft to commit 131efba5d48753a3355ecd4f3833ae010a0510d6\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from peft==0.13.3.dev0) (1.26.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from peft==0.13.3.dev0) (24.2)\n", + "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from peft==0.13.3.dev0) (5.9.5)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from peft==0.13.3.dev0) (6.0.2)\n", + "Requirement already satisfied: torch>=1.13.0 in /usr/local/lib/python3.10/dist-packages (from peft==0.13.3.dev0) (2.5.1+cu121)\n", + "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (from peft==0.13.3.dev0) (4.46.2)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from peft==0.13.3.dev0) (4.66.6)\n", + "Requirement already satisfied: accelerate>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from peft==0.13.3.dev0) (1.1.1)\n", + "Requirement already satisfied: safetensors in /usr/local/lib/python3.10/dist-packages (from peft==0.13.3.dev0) (0.4.5)\n", + "Requirement already satisfied: huggingface_hub>=0.25.0 in /usr/local/lib/python3.10/dist-packages (from peft==0.13.3.dev0) (0.26.2)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface_hub>=0.25.0->peft==0.13.3.dev0) (3.16.1)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub>=0.25.0->peft==0.13.3.dev0) (2024.9.0)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface_hub>=0.25.0->peft==0.13.3.dev0) (2.32.3)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub>=0.25.0->peft==0.13.3.dev0) (4.12.2)\n", + "Requirement already satisfied: networkx in 
/usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.13.3.dev0) (3.4.2)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.13.3.dev0) (3.1.4)\n", + "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.13.0->peft==0.13.3.dev0) (1.13.1)\n", + "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch>=1.13.0->peft==0.13.3.dev0) (1.3.0)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.13.3.dev0) (2024.9.11)\n", + "Requirement already satisfied: tokenizers<0.21,>=0.20 in /usr/local/lib/python3.10/dist-packages (from transformers->peft==0.13.3.dev0) (0.20.3)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.13.0->peft==0.13.3.dev0) (3.0.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub>=0.25.0->peft==0.13.3.dev0) (3.4.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub>=0.25.0->peft==0.13.3.dev0) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub>=0.25.0->peft==0.13.3.dev0) (2.2.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub>=0.25.0->peft==0.13.3.dev0) (2024.8.30)\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Imports" + ], + "metadata": { + "id": "5BerCvfkq_jp" + }, + "id": "5BerCvfkq_jp" + }, + { + "cell_type": "code", + "source": [ + "from typing import Any, Dict, List, Union\n", + "\n", + "import numpy as np\n", + "import torch\n", + "from datasets import 
load_dataset\n", + "from torch.utils.data import Dataset\n", + "from tqdm import tqdm\n", + "from transformers import (\n", + " AutoModelForCausalLM,\n", + " AutoTokenizer,\n", + " DataCollatorForLanguageModeling,\n", + " Trainer,\n", + " TrainingArguments,\n", + ")\n", + "\n", + "from peft import CPTConfig, get_peft_model\n", + "\n", + "\n", + "MAX_INPUT_LENGTH = 1024\n", + "MAX_ICL_SAMPLES = 10\n", + "NUM_TRAINING_SAMPLES = 100\n", + "model_id = 'bigscience/bloom-1b7'" + ], + "metadata": { + "id": "Y0pETNFBl963" + }, + "id": "Y0pETNFBl963", + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Data Preparation\n", + "---" + ], + "metadata": { + "id": "9hO_I3aDmCQu" + }, + "id": "9hO_I3aDmCQu" + }, + { + "cell_type": "code", + "source": [ + "# Initialize the tokenizer\n", + "tokenizer = AutoTokenizer.from_pretrained(\n", + " model_id, # The name or path of the pre-trained tokenizer (e.g., \"bert-base-uncased\").\n", + " cache_dir='.', # Directory to cache the tokenizer files locally.\n", + " padding_side='right', # Specifies that padding should be added to the right side of sequences.\n", + " trust_remote_code=True # Allows loading tokenizer implementations from external sources.\n", + ")" + ], + "metadata": { + "id": "STK5N0LJrZmA", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "4c5c3dda-07ae-4f67-df29-4a2ff499e5ad" + }, + "id": "STK5N0LJrZmA", + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication 
is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Load the SST-2 dataset from the GLUE benchmark\n", + "dataset = load_dataset('glue', 'sst2')\n", + "\n", + "def add_string_labels(example):\n", + " \"\"\"\n", + " Converts numerical labels into human-readable string labels.\n", + "\n", + " Args:\n", + " example (dict): A single example from the dataset with a numerical 'label'.\n", + "\n", + " Returns:\n", + " dict: The example augmented with a 'label_text' field.\n", + " \"\"\"\n", + " # Map numerical label to string label\n", + " example['label_text'] = \"positive\" if example['label'] == 1 else \"negative\"\n", + " return example\n", + "\n", + "# Subset and process the training dataset\n", + "context_dataset = dataset['train'].select(range(MAX_ICL_SAMPLES)).map(add_string_labels)\n", + "train_dataset = dataset['train'].select(range(MAX_ICL_SAMPLES, NUM_TRAINING_SAMPLES + MAX_ICL_SAMPLES)).map(add_string_labels)" + ], + "metadata": { + "id": "C3oq4lDDrcUf", + "colab": { + "base_uri": "https://localhost:8080/", + "referenced_widgets": [ + "72a5be4b77ec4d5994bcace9d462da84", + "bed78529ff2c4d08befca97c50cb5efc", + "cf7077acfce04aff8af0a2483dbf094c", + "910462d70d944d00ba54958d77bee755", + "a899818bdad0415b860eaac4afe31f30", + "3d78a6c8923547cf8c75bc8c10125eda", + "8083f95a673a423286ade63051de757d", + "13fc203ab1b44c83b6cfcc1e171d26ad", + "663a0196d2b547fd8a6890b8a86080c2", + "72be01164e974d59b05bee716e9bc978", + "4cedaf37e79e4ff1a10ffb96ec543e81" + ], + "height": 49 + }, + "outputId": "5ae1ff54-d726-4f07-e6d7-cd53145b5d6f" + }, + "id": "C3oq4lDDrcUf", + "execution_count": 4, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Map: 0%| | 0/100 [00:00 0 else 0 # Increment type indices dynamically\n", + " for i in cpt_context_dataset[i]['input_type_mask']\n", + " ]\n", + "\n", + " # Increment the type mask offset after 
processing the sample\n", + " first_type_mask += 4" + ], + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-22T09:24:58.894814Z", + "start_time": "2024-10-22T09:24:58.893841Z" + }, + "id": "aef03bbd5d86d3d8", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "1bb1343b-b5f8-4998-e34b-6a8ae8063381" + }, + "id": "aef03bbd5d86d3d8", + "execution_count": 7 + }, + { + "cell_type": "markdown", + "source": [ + "# Model Training\n", + "\n", + "---" + ], + "metadata": { + "collapsed": false, + "id": "2c40f24774d83372" + }, + "id": "2c40f24774d83372" + }, + { + "cell_type": "markdown", + "source": [ + "## Load model" + ], + "metadata": { + "id": "p0jFTzkisMgN" + }, + "id": "p0jFTzkisMgN" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "# Load a pre-trained causal language model\n", + "base_model = AutoModelForCausalLM.from_pretrained(\n", + " model_id,\n", + " cache_dir='.',\n", + " torch_dtype=torch.float16,\n", + " device_map='auto'\n", + ")\n", + "\n", + "# Initialize the CPT configuration\n", + "config = CPTConfig(\n", + " cpt_token_ids=context_ids,\n", + " cpt_mask=context_attention_mask,\n", + " cpt_tokens_type_mask=context_input_type_mask,\n", + "\n", + " opt_weighted_loss_type='decay',\n", + " opt_loss_decay_factor=0.95, # we choose the exponential decay factor applied to the loss\n", + " opt_projection_epsilon=0.2, # we choose the projection over the input tokens\n", + " opt_projection_format_epsilon=0.1, # we choose the projection over input and output templates\n", + "\n", + " tokenizer_name_or_path=model_id,\n", + ")\n", + "\n", + "# Initialize the CPT model with PEFT\n", + "model = get_peft_model(base_model, config)" + ], + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-22T09:25:08.941945Z", + "start_time": "2024-10-22T09:25:04.393323Z" + }, + "id": "17ac445134919a39" + }, + "id": "17ac445134919a39", + "execution_count": 8 + }, + { + "cell_type": "markdown", + "source": [ + "## Setting Collate Function" + ], 
+ "metadata": { + "collapsed": false, + "id": "4e49660c50d98741" + }, + "id": "4e49660c50d98741" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "class CPTDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):\n", + " def __init__(self, tokenizer, training=True, mlm=False):\n", + " \"\"\"\n", + " Custom collator for CPT-style language modeling.\n", + "\n", + " Args:\n", + " tokenizer: The tokenizer to handle tokenization and special tokens.\n", + " training (bool): If True, operates in training mode; otherwise in evaluation mode.\n", + " mlm (bool): If True, enables masked language modeling.\n", + " \"\"\"\n", + "\n", + " super().__init__(tokenizer, mlm=mlm) # Initialize the parent class\n", + " self.training = training\n", + "\n", + " # Register `[PAD]` as the tokenizer's padding token (applied unconditionally)\n", + " self.tokenizer.add_special_tokens({\"pad_token\": \"[PAD]\"})\n", + "\n", + " def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:\n", + " \"\"\"\n", + " Process a batch of examples for language modeling.\n", + "\n", + " Args:\n", + " examples (List): A batch of examples with tokenized inputs and optional sample masks.\n", + "\n", + " Returns:\n", + " Dict: A dictionary containing padded and tensor-converted inputs, attention masks,\n", + " input type masks, and optional sample masks and labels.\n", + " \"\"\"\n", + "\n", + " # Initialize a list to collect sample masks if provided\n", + " list_sample_mask = []\n", + " for i in range(len(examples)):\n", + " if \"sample_mask\" in examples[i].keys():\n", + " list_sample_mask.append(examples[i].pop(\"sample_mask\"))\n", + "\n", + " # Compute the length of the longest `input_ids` sequence in the batch\n", + " max_len = max(len(ex[\"input_ids\"]) for ex in examples)\n", + "\n", + " # Define a helper function for padding sequences to the maximum length\n", + " def pad_sequence(sequence, max_len, pad_value=0):\n", + " return sequence + [pad_value] * (max_len - 
len(sequence))\n", + "\n", + " # Pad and convert `input_ids`, `attention_mask`, and `input_type_mask` to tensors\n", + " input_ids = torch.tensor([pad_sequence(ex[\"input_ids\"], max_len) for ex in examples])\n", + " attention_mask = torch.tensor([pad_sequence(ex[\"attention_mask\"], max_len) for ex in examples])\n", + " input_type_mask = torch.tensor([pad_sequence(ex[\"input_type_mask\"], max_len) for ex in examples])\n", + "\n", + " # Create the initial batch dictionary\n", + " batch = {\"input_ids\": input_ids, \"attention_mask\": attention_mask, \"input_type_mask\": input_type_mask}\n", + "\n", + " # Create a tensor to store sample masks\n", + " tensor_sample_mask = batch[\"input_ids\"].clone().long()\n", + " tensor_sample_mask[:, :] = 0 # Initialize with zeros\n", + "\n", + " # Populate the tensor with the provided sample masks\n", + " for i in range(len(list_sample_mask)):\n", + " tensor_sample_mask[i, : len(list_sample_mask[i])] = list_sample_mask[i]\n", + "\n", + " # Copy `input_ids` to use as `labels`\n", + " batch[\"labels\"] = batch[\"input_ids\"].clone()\n", + "\n", + " # If in evaluation mode, include the `sample_mask` in the batch\n", + " if not self.training:\n", + " batch[\"sample_mask\"] = tensor_sample_mask\n", + "\n", + " return batch" + ], + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-22T09:25:08.953199Z", + "start_time": "2024-10-22T09:25:08.945689Z" + }, + "id": "b0fac840f060e3aa" + }, + "id": "b0fac840f060e3aa", + "execution_count": 9 + }, + { + "cell_type": "markdown", + "source": [ + "## Training" + ], + "metadata": { + "collapsed": false, + "id": "48f535d74e6602b" + }, + "id": "48f535d74e6602b" + }, + { + "cell_type": "code", + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [500/500 01:28, Epoch 5/5]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
1000.400800
2000.036000
3000.026300
4000.016100
5000.011600

" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "TrainOutput(global_step=500, training_loss=0.09815525007247924, metrics={'train_runtime': 90.6767, 'train_samples_per_second': 5.514, 'train_steps_per_second': 5.514, 'total_flos': 79477977907200.0, 'train_loss': 0.09815525007247924, 'epoch': 5.0})" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ], + "source": [ + "training_args = TrainingArguments(\n", + " output_dir='../.',\n", + " use_cpu=False,\n", + " auto_find_batch_size=False,\n", + " learning_rate=1e-4,\n", + " logging_steps=100,\n", + " per_device_train_batch_size=1,\n", + " save_total_limit=1,\n", + " remove_unused_columns=False,\n", + " num_train_epochs=5,\n", + " fp16=True,\n", + " save_strategy='no',\n", + " logging_dir=\"logs\",\n", + " report_to=\"none\"\n", + ")\n", + "\n", + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=cpt_train_dataset, # Custom CPT training dataset.\n", + " data_collator=CPTDataCollatorForLanguageModeling(tokenizer, training=True, mlm=False)\n", + ")\n", + "\n", + "trainer.train()" + ], + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-22T09:25:27.599132Z", + "start_time": "2024-10-22T09:25:13.906685Z" + }, + "colab": { + "base_uri": "https://localhost:8080/", + "height": 268 + }, + "id": "1a865c2ad2dc7218", + "outputId": "c4bfd785-e354-4ee6-a87e-63c17bfd2605" + }, + "id": "1a865c2ad2dc7218", + "execution_count": 10 + }, + { + "cell_type": "markdown", + "source": [ + "# Model Evaluation\n", + "\n", + "---" + ], + "metadata": { + "collapsed": false, + "id": "b799ea89a567590f" + }, + "id": "b799ea89a567590f" + }, + { + "cell_type": "code", + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 100/100 [00:00<00:00, 1972.82it/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Sentence: input: it 's a charming and often affecting journey . 
output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: unflinchingly bleak and desperate output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: it 's slow -- very , very slow . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: a sometimes tedious film . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: or doing last year 's taxes with your ex-wife . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: you do n't have to know about music to appreciate the film 's easygoing blend of comedy and romance . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: in exactly 89 minutes , most of which passed as slowly as if i 'd been sitting naked on an igloo , formula 51 sank from quirky to jerky to utter turkey . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: the mesmerizing performances of the leads keep the film grounded and keep the audience riveted . 
output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: it takes a strange kind of laziness to waste the talents of robert forster , anne meara , eugene levy , and reginald veljohnson all in the same movie . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: ... the film suffers from a lack of humor ( something needed to balance out the violence ) ... output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: we root for ( clara and paul ) , even like them , though perhaps it 's an emotion closer to pity . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: even horror fans will most likely not find what they 're seeking with trouble every day ; the movie lacks both thrills and humor . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: a gorgeous , high-spirited musical from india that exquisitely blends music , dance , song , and high drama . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: the emotions are raw and will strike a nerve with anyone who 's ever had family trauma . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: audrey tatou has a knack for picking roles that magnify her outrageous charm , and in this literate french comedy , she 's as morning-glory exuberant as she was in amélie . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: ... the movie is just a plain old monster . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: in its best moments , resembles a bad high school production of grease , without benefit of song . 
output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: pumpkin takes an admirable look at the hypocrisy of political correctness , but it does so with such an uneven tone that you never know when humor ends and tragedy begins . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: the iditarod lasts for days - this just felt like it did . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: holden caulfield did it better . output: negative \n", + " \t The prediction is: positive\n", + " \t The GT is negative\n", + "Sentence: input: a delectable and intriguing thriller filled with surprises , read my lips is an original . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: seldom has a movie so closely matched the spirit of a man and his work . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: nicks , seemingly uncertain what 's going to make people laugh , runs the gamut from stale parody to raunchy sex gags to formula romantic comedy . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: the action switches between past and present , but the material link is too tenuous to anchor the emotional connections that purport to span a 125-year divide . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: it 's an offbeat treat that pokes fun at the democratic exercise while also examining its significance for those who take part . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: it 's a cookie-cutter movie , a cut-and-paste job . 
output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: i had to look away - this was god awful . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: thanks to scott 's charismatic roger and eisenberg 's sweet nephew , roger dodger is one of the most compelling variations on in the company of men . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: ... designed to provide a mix of smiles and tears , `` crossroads '' instead provokes a handful of unintentional howlers and numerous yawns . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: a gorgeous , witty , seductive movie . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: if the movie succeeds in instilling a wary sense of ` there but for the grace of god , ' it is far too self-conscious to draw you deeply into its world . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: it does n't believe in itself , it has no sense of humor ... it 's just plain bored . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: a sequence of ridiculous shoot - 'em - up scenes . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: the weight of the piece , the unerring professionalism of the chilly production , and the fascination embedded in the lurid topic prove recommendation enough . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: ( w ) hile long on amiable monkeys and worthy environmentalism , jane goodall 's wild chimpanzees is short on the thrills the oversize medium demands . 
output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: as surreal as a dream and as detailed as a photograph , as visually dexterous as it is at times imaginatively overwhelming . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: escaping the studio , piccoli is warmly affecting and so is this adroitly minimalist movie . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: there 's ... tremendous energy from the cast , a sense of playfulness and excitement that seems appropriate . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: this illuminating documentary transcends our preconceived vision of the holy land and its inhabitants , revealing the human complexities beneath . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: the subtle strength of `` elling '' is that it never loses touch with the reality of the grim situation . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: holm ... embodies the character with an effortlessly regal charisma . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: the title not only describes its main characters , but the lazy people behind the camera as well . output: negative \n", + " \t The prediction is: positive\n", + " \t The GT is negative\n", + "Sentence: input: it offers little beyond the momentary joys of pretty and weightless intellectual entertainment . output: negative \n", + " \t The prediction is: positive\n", + " \t The GT is negative\n", + "Sentence: input: a synthesis of cliches and absurdities that seems positively decadent in its cinematic flash and emptiness . 
output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: a subtle and well-crafted ( for the most part ) chiller . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: has a lot of the virtues of eastwood at his best . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: it 's hampered by a lifetime-channel kind of plot and a lead actress who is out of her depth . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: it feels like an after-school special gussied up with some fancy special effects , and watching its rote plot points connect is about as exciting as gazing at an egg timer for 93 minutes . output: negative \n", + " \t The prediction is: positive\n", + " \t The GT is negative\n", + "Sentence: input: for the most part , director anne-sophie birot 's first feature is a sensitive , extraordinarily well-acted drama . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: mr. tsai is a very original artist in his medium , and what time is it there ? output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: sade is an engaging look at the controversial eponymous and fiercely atheistic hero . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: so devoid of any kind of intelligible story that it makes films like xxx and collateral damage seem like thoughtful treatises output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: a tender , heartfelt family drama . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: ... 
a hollow joke told by a cinematic gymnast having too much fun embellishing the misanthropic tale to actually engage it . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: the cold turkey would 've been a far better title . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: manages to be both repulsively sadistic and mundane . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: it 's just disappointingly superficial -- a movie that has all the elements necessary to be a fascinating , involving character study , but never does more than scratch the surface . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: this is a story of two misfits who do n't stand a chance alone , but together they are magnificent . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: schaeffer has to find some hook on which to hang his persistently useless movies , and it might as well be the resuscitation of the middle-aged character . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: the primitive force of this film seems to bubble up from the vast collective memory of the combatants . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: on this tricky topic , tadpole is very much a step in the right direction , with its blend of frankness , civility and compassion . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: the script kicks in , and mr. hartley 's distended pace and foot-dragging rhythms follow . 
output: negative \n", + " \t The prediction is: positive\n", + " \t The GT is negative\n", + "Sentence: input: you wonder why enough was n't just a music video rather than a full-length movie . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: if you 're hard up for raunchy college humor , this is your ticket right here . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: a fast , funny , highly enjoyable movie . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: good old-fashioned slash-and-hack is back ! output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: this one is definitely one to skip , even for horror movie fanatics . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: for all its impressive craftsmanship , and despite an overbearing series of third-act crescendos , lily chou-chou never really builds up a head of emotional steam . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: exquisitely nuanced in mood tics and dialogue , this chamber drama is superbly acted by the deeply appealing veteran bouquet and the chilling but quite human berling . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: uses high comedy to evoke surprising poignance . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: one of creepiest , scariest movies to come along in a long , long time , easily rivaling blair witch or the others . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: a string of rehashed sight gags based in insipid vulgarity . 
output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: among the year 's most intriguing explorations of alientation . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: the movie fails to live up to the sum of its parts . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: the son 's room is a triumph of gentility that earns its moments of pathos . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: there is nothing outstanding about this film , but it is good enough and will likely be appreciated most by sailors and folks who know their way around a submarine . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: this is a train wreck of an action film -- a stupefying attempt by the filmmakers to force-feed james bond into the mindless xxx mold and throw 40 years of cinematic history down the toilet in favor of bright flashes and loud bangs . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: the draw ( for `` big bad love '' ) is a solid performance by arliss howard . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: green might want to hang onto that ski mask , as robbery may be the only way to pay for his next project . output: negative \n", + " \t The prediction is: positive\n", + " \t The GT is negative\n", + "Sentence: input: it 's one pussy-ass world when even killer-thrillers revolve around group therapy sessions . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: though it 's become almost redundant to say so , major kudos go to leigh for actually casting people who look working-class . 
output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: the band 's courage in the face of official repression is inspiring , especially for aging hippies ( this one included ) . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: the movie achieves as great an impact by keeping these thoughts hidden as ... ( quills ) did by showing them . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: the film flat lines when it should peak and is more missed opportunity and trifle than dark , decadent truffle . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: jaglom ... put ( s ) the audience in the privileged position of eavesdropping on his characters output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: fresnadillo 's dark and jolting images have a way of plying into your subconscious like the nightmare you had a week ago that wo n't go away . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: we know the plot 's a little crazy , but it held my interest from start to finish . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: it 's a scattershot affair , but when it hits its mark it 's brilliant . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: hardly a masterpiece , but it introduces viewers to a good charitable enterprise and some interesting real people . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: you wo n't like roger , but you will quickly recognize him . 
output: negative \n", + " \t The prediction is: positive\n", + " \t The GT is negative\n", + "Sentence: input: if steven soderbergh 's ` solaris ' is a failure it is a glorious failure . output: positive \n", + " \t The prediction is: negative\n", + " \t The GT is positive\n", + "Sentence: input: byler reveals his characters in a way that intrigues and even fascinates us , and he never reduces the situation to simple melodrama . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: this riveting world war ii moral suspense story deals with the shadow side of american culture : racial prejudice in its ugly and diverse forms . output: negative \n", + " \t The prediction is: positive\n", + " \t The GT is negative\n", + "Sentence: input: it 's difficult to imagine the process that produced such a script , but here 's guessing that spray cheese and underarm noises played a crucial role . output: negative \n", + " \t The prediction is: positive\n", + " \t The GT is negative\n", + "Sentence: input: no sophomore slump for director sam mendes , who segues from oscar winner to oscar-winning potential with a smooth sleight of hand . output: positive \n", + " \t The prediction is: positive\n", + " \t The GT is positive\n", + "Sentence: input: on the whole , the movie lacks wit , feeling and believability to compensate for its incessant coarseness and banality . output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "Sentence: input: why make a documentary about these marginal historical figures ? 
output: negative \n", + " \t The prediction is: negative\n", + " \t The GT is negative\n", + "The model Acc is 90.0%\n" + ] + } + ], + "source": [ + "model.eval()\n", + "\n", + "# Select relevant columns from the test dataset\n", + "test_dataset = test_dataset.select_columns(['sentence', 'label_text'])\n", + "\n", + "# Convert the test dataset to a CPT-compatible format\n", + "cpt_test_dataset = CPTDataset(test_dataset, tokenizer, templates)\n", + "\n", + "# Get the device where the model is loaded (CPU, GPU or XPU)\n", + "device = model.device\n", + "list_bool_predictions = []\n", + "\n", + "for i in range(len(test_dataset)):\n", + " input_ids, input_type_mask = cpt_test_dataset[i]['input_ids'], cpt_test_dataset[i]['input_type_mask']\n", + "\n", + " # Pass the inputs through the model\n", + " outputs = model(\n", + " input_ids=torch.Tensor(input_ids).long().to(device=device).view(1, -1),\n", + " labels=torch.Tensor(input_ids).long().to(device=device).view(1, -1),\n", + " input_type_mask=torch.Tensor(input_type_mask).long().to(device=device).view(1, -1)\n", + " )\n", + "\n", + " # Shift logits to exclude the last token and match the labels\n", + " shifted_logits = outputs.logits[..., :-1, :].contiguous().to(model.dtype)[0, -len(input_ids) + 1:]\n", + " shift_labels = torch.Tensor(input_ids).long().to(device=device).view(1, -1)[0, 1:].contiguous().to(device)\n", + " shifted_input_type_mask = torch.Tensor(input_type_mask).long().to(device=device).view(1, -1)[..., 1:].contiguous().to(device)\n", + "\n", + " # Create a mask for the type `4` tokens (label tokens)\n", + " mask = torch.Tensor(shifted_input_type_mask).long().to(device=device).view(-1,) == 4\n", + "\n", + " # Extract logits and labels corresponding to the mask\n", + " logit = shifted_logits[mask]\n", + " label = shift_labels[mask]\n", + "\n", + " # All possible label tokens for `negative` and `positive`\n", + " all_labels = torch.Tensor([tokenizer(i, add_special_tokens=False)[\"input_ids\"] for i in 
['negative', 'positive']]).long().to(device).view(-1,)\n", + "\n", + " # Compare logits with label tokens and infer prediction\n", + " prediction = logit[0, torch.Tensor([tokenizer(i, add_special_tokens=False)[\"input_ids\"] for i in ['negative', 'positive']]).long().to(device).view(-1,)].argmax()\n", + " prediction_text = 'negative' if prediction == 0 else 'positive'\n", + " print(f\"Sentence: {tokenizer.decode(input_ids)} \\n \\t The prediction is: {prediction_text}\\n \\t The GT is {tokenizer.decode(label)}\")\n", + " list_bool_predictions.append(prediction_text == tokenizer.decode(label))\n", + "\n", + "print(f'The model Acc is {100 * np.mean(list_bool_predictions)}%')" + ], + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-22T09:25:28.252009Z", + "start_time": "2024-10-22T09:25:27.598326Z" + }, + "id": "48e7d976e6e01212", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "40dd1226-fa31-4e77-dc7e-e06a3600304e" + }, + "id": "48e7d976e6e01212", + "execution_count": 11 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + }, + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "accelerator": "GPU", + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "72a5be4b77ec4d5994bcace9d462da84": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + 
"IPY_MODEL_bed78529ff2c4d08befca97c50cb5efc", + "IPY_MODEL_cf7077acfce04aff8af0a2483dbf094c", + "IPY_MODEL_910462d70d944d00ba54958d77bee755" + ], + "layout": "IPY_MODEL_a899818bdad0415b860eaac4afe31f30" + } + }, + "bed78529ff2c4d08befca97c50cb5efc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3d78a6c8923547cf8c75bc8c10125eda", + "placeholder": "​", + "style": "IPY_MODEL_8083f95a673a423286ade63051de757d", + "value": "Map: 100%" + } + }, + "cf7077acfce04aff8af0a2483dbf094c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_13fc203ab1b44c83b6cfcc1e171d26ad", + "max": 100, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_663a0196d2b547fd8a6890b8a86080c2", + "value": 100 + } + }, + "910462d70d944d00ba54958d77bee755": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", 
+ "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_72be01164e974d59b05bee716e9bc978", + "placeholder": "​", + "style": "IPY_MODEL_4cedaf37e79e4ff1a10ffb96ec543e81", + "value": " 100/100 [00:00<00:00, 1327.06 examples/s]" + } + }, + "a899818bdad0415b860eaac4afe31f30": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3d78a6c8923547cf8c75bc8c10125eda": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": 
null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8083f95a673a423286ade63051de757d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "13fc203ab1b44c83b6cfcc1e171d26ad": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": 
null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "663a0196d2b547fd8a6890b8a86080c2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "72be01164e974d59b05bee716e9bc978": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4cedaf37e79e4ff1a10ffb96ec543e81": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/dna_language_models/dna_lm.ipynb b/peft/examples/dna_language_models/dna_lm.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..70360879dec511676290762c6e8bd67a9c6221de --- /dev/null +++ b/peft/examples/dna_language_models/dna_lm.ipynb @@ -0,0 +1,2860 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "db4dc272-88fe-47ad-98fd-b94d4f840dca", + "metadata": { + "id": "db4dc272-88fe-47ad-98fd-b94d4f840dca" + }, + "source": [ + "# PEFT with DNA Language Models" + ] + }, + { + "cell_type": "markdown", + "id": "d381f473-0d37-4b5b-ae9e-d2b32bab7c04", + "metadata": { + "id": "d381f473-0d37-4b5b-ae9e-d2b32bab7c04" + }, + "source": [ + "This notebook demonstrates how to utilize parameter-efficient fine-tuning techniques (PEFT) from the PEFT library to fine-tune a DNA Language Model (DNA-LM). The fine-tuned DNA-LM will be applied to solve a task from the nucleotide benchmark dataset. Parameter-efficient fine-tuning (PEFT) techniques are crucial for adapting large pre-trained models to specific tasks with limited computational resources." 
+ ] + }, + { + "cell_type": "markdown", + "id": "23f460c3-d7e5-437f-a5e9-d029cd225bf8", + "metadata": { + "id": "23f460c3-d7e5-437f-a5e9-d029cd225bf8" + }, + "source": [ + "### 1. Import relevant libraries" + ] + }, + { + "cell_type": "markdown", + "id": "29a35f95-738a-4f5e-88ce-dc5f8f9be5dc", + "metadata": { + "id": "29a35f95-738a-4f5e-88ce-dc5f8f9be5dc" + }, + "source": [ + "We'll start by importing the required libraries, including the PEFT library and other dependencies." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0a40abdf-ca1c-436f-a2af-603cd67a45a4", + "metadata": { + "id": "0a40abdf-ca1c-436f-a2af-603cd67a45a4" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/anaconda3/envs/peft/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import torch\n", + "import transformers\n", + "import peft\n", + "import tqdm\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "a445f8be-545d-4085-a5f9-c64983655224", + "metadata": { + "id": "a445f8be-545d-4085-a5f9-c64983655224" + }, + "source": [ + "### 2. Load models\n" + ] + }, + { + "cell_type": "markdown", + "id": "63782b55-1c38-4e44-b003-e57daa813bed", + "metadata": { + "id": "63782b55-1c38-4e44-b003-e57daa813bed" + }, + "source": [ + "We'll load a pre-trained DNA Language Model, \"SpeciesLM\", that serves as the base for fine-tuning. This is done using the transformers library from HuggingFace.\n", + "\n", + "The tokenizer and the model come from the paper, \"Species-aware DNA language models capture regulatory elements and their evolution\". [Paper Link](https://www.biorxiv.org/content/10.1101/2023.01.26.525670v2), [Code Link](https://github.com/gagneurlab/SpeciesLM). 
They introduce a species-aware DNA language model, which is trained on more than 800 species spanning over 500 million years of evolution." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "dac961f4-c450-4124-923e-f4ba9bbd5e07", + "metadata": { + "id": "dac961f4-c450-4124-923e-f4ba9bbd5e07" + }, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer, AutoModelForMaskedLM" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e73fae58-03e9-4acc-b0fc-9bc810c7d366", + "metadata": { + "id": "e73fae58-03e9-4acc-b0fc-9bc810c7d366" + }, + "outputs": [], + "source": [ + "tokenizer = AutoTokenizer.from_pretrained(\"gagneurlab/SpeciesLM\", revision = \"downstream_species_lm\")\n", + "lm = AutoModelForMaskedLM.from_pretrained(\"gagneurlab/SpeciesLM\", revision = \"downstream_species_lm\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca43b893-2d66-4e93-a08f-b17a92040709", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ca43b893-2d66-4e93-a08f-b17a92040709", + "outputId": "ccbac964-a329-414d-f537-3cae7da66cf2" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "BertForMaskedLM(\n", + " (bert): BertModel(\n", + " (embeddings): BertEmbeddings(\n", + " (word_embeddings): Embedding(5504, 768, padding_idx=0)\n", + " (position_embeddings): Embedding(512, 768)\n", + " (token_type_embeddings): Embedding(2, 768)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (encoder): BertEncoder(\n", + " (layer): ModuleList(\n", + " (0-11): 12 x BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSdpaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " 
)\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " (intermediate_act_fn): GELUActivation()\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (cls): BertOnlyMLMHead(\n", + " (predictions): BertLMPredictionHead(\n", + " (transform): BertPredictionHeadTransform(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (transform_act_fn): GELUActivation()\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " )\n", + " (decoder): Linear(in_features=768, out_features=5504, bias=True)\n", + " )\n", + " )\n", + ")" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lm.eval()\n", + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "lm.to(device);" + ] + }, + { + "cell_type": "markdown", + "id": "c1bda6f2-34bb-4ce2-aa3f-3013548b0a28", + "metadata": { + "id": "c1bda6f2-34bb-4ce2-aa3f-3013548b0a28" + }, + "source": [ + "### 2. Prepare datasets" + ] + }, + { + "cell_type": "markdown", + "id": "f4c61e59-457c-47d9-8929-5e8cd32d3125", + "metadata": { + "id": "f4c61e59-457c-47d9-8929-5e8cd32d3125" + }, + "source": [ + "We'll load the `nucleotide_transformer_downstream_tasks` dataset, which contains 18 downstream tasks from the Nucleotide Transformer paper. This dataset provides a consistent genomics benchmark with binary classification tasks." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5c0b3df-911a-4645-9140-99ee489515e8", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 145, + "referenced_widgets": [ + "03bba232d3974119acf8031bc086a072", + "9107f7bfc8d3483390f802b0458e9380", + "f5c80fa70ead4c86aa3b2a046061b901", + "57966a469ca1458daab74e81672ae855", + "1464502dc3dd46308be8b4fcc9d5ddb9", + "92f64c7e088342b9b3c070ba7a295ed0", + "ab0aa8af3816422e9d97934f12af842c", + "ff89a891bd9c42a8be164587a94ccac1", + "e113a50f8ed2410ca12ce7cb38a1681d", + "1afa6e9b69c74136863b7747e62a0608", + "0838d19b226d486285a26ce0b04d7e15", + "7bdab33f4b244fc89408b91755bf17c5", + "4d4ce0d35c124690b3427e84a9a128b1", + "33be6b0ca8fd44188f834a48a9574a72", + "74e9bc1ead434ae78077df6b85f1df58", + "e1acc6e70b9246a5b063b3e262f01c81", + "078c6877377a491d97d6fadd27064a76", + "d46ee1c39bac44c2b541a88c883de1cb", + "12f1de7122a7471e90f01d9e7be81178", + "dad286d42a514c9ca6bb01bfe9e9c4be", + "c028ed977b5e479fbd93b8add588a6dc", + "6d80dec073e449efba272fa9f3527922", + "c311b777514f41ef986756a386c0bb34", + "e2e4bf053ce442f6aee6ffab5f76525f", + "c88cf701e20b4354a63ac7d8645d1df9", + "f71c252ada474be882b0335ed9a0a1c3", + "e059c665229e46ea905dcbd6fc179c88", + "bd5273325a4b453e8053d98a09fe9493", + "8f20ed2b74d84e80a8d403793354adea", + "57c9af47364d48ffbb4ffbdd2c951ede", + "fa9d75fcb1d5400c8ca1d1d13d28d0c7", + "682644a713b145f0b2dcff99790c6d4d", + "9b9b9d573d44464f9a6f5030a40245fe", + "ec165fdbe87a4b00a6c288ef1e85c0a9", + "17859b793a304e389d1ea0b9ccc3646f", + "34921fd116cc42b7b530174d9f61e71e", + "2d5466a5e98849c5a09f16faa98f91da", + "952397f9c91c480184fa57e175ab1b4c", + "86bcccb842244f4f9add58f62facaace", + "78b5bbf4c8ac4fe5961776fded4d5798", + "c80062a855cb41a28ac625ab03635da2", + "aecd740c17c84d45b0615d4fc4196035", + "39640709e7174f84a50da05764abbf99", + "7114a029e75c4ed5b966eddd3a3c919d" + ] + }, + "id": "f5c0b3df-911a-4645-9140-99ee489515e8", + "outputId": 
"15315be1-9d07-4c46-acda-c65cb5a05250" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "03bba232d3974119acf8031bc086a072", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading data: 0%| | 0.00/3.50M [00:00\n", + " \n", + " \n", + " [65/65 01:43, Epoch 5/5]\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EpochTraining LossValidation Loss
10.8874000.685295
20.6447000.682495
30.5996000.680431
40.8928000.679170
50.6638000.678761

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=65, training_loss=0.7263066686116733, metrics={'train_runtime': 104.8696, 'train_samples_per_second': 9.536, 'train_steps_per_second': 0.62, 'total_flos': 0.0, 'train_loss': 0.7263066686116733, 'epoch': 5.0})" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from transformers import Trainer, TrainingArguments\n", + "\n", + "\n", + "# Define training arguments\n", + "training_args = TrainingArguments(\n", + " output_dir='./results',\n", + " eval_strategy=\"epoch\",\n", + " learning_rate=2e-5,\n", + " per_device_train_batch_size=16,\n", + " per_device_eval_batch_size=16,\n", + " num_train_epochs=5,\n", + " weight_decay=0.01,\n", + " eval_steps=1,\n", + " logging_steps=1,\n", + ")\n", + "\n", + "# Initialize Trainer\n", + "trainer = Trainer(\n", + " model=classification_model,\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=val_dataset,\n", + " tokenizer=tokenizer,\n", + " data_collator=data_collator,\n", + ")\n", + "\n", + "# Train the model\n", + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "id": "ebc7e33a-caad-4412-84e3-3e1ce7d02ccd", + "metadata": { + "id": "ebc7e33a-caad-4412-84e3-3e1ce7d02ccd" + }, + "source": [ + "### 5. 
Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "38eb0273-ce7e-4770-8457-2f9609f6843b", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 124 + }, + "id": "38eb0273-ce7e-4770-8457-2f9609f6843b", + "outputId": "2b0b93c9-0199-4e71-9825-9f6a2bd199d0" + }, + "outputs": [ + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 0 1 0 0 0 1 1 0 1 1\n", + " 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 0 0 0 1 0 1 1 0 1\n", + " 0 1 1 0 1 1 1 0 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 1 0 0 0 0 1 0 1 0 0 1 1 1\n", + " 1 0 1 1 0 0 1 1 1 0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 0 0 1 1 0 1 1 0 1 1 0 1\n", + " 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1\n", + " 0 1 1 1 1 0 1 1 0 0 1 0 1 1 0]\n" + ] + } + ], + "source": [ + "# Generate predictions\n", + "\n", + "predictions = trainer.predict(test_dataset)\n", + "logits = predictions.predictions\n", + "predicted_labels = logits.argmax(axis=-1)\n", + "print(predicted_labels)" + ] + }, + { + "cell_type": "markdown", + "id": "ae4c7bca", + "metadata": { + "id": "ae4c7bca" + }, + "source": [ + "Then, we create a function to calculate the accuracy from the test and predicted labels." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "327a1c3b-88d6-4430-8978-73a7cbdbb697", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "327a1c3b-88d6-4430-8978-73a7cbdbb697", + "outputId": "f03ad54d-d35f-4fcc-e709-c24d14906e25" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.53\n" + ] + } + ], + "source": [ + "def calculate_accuracy(true_labels, predicted_labels):\n", + "\n", + " assert len(true_labels) == len(predicted_labels), \"Arrays must have the same length\"\n", + " correct_predictions = np.sum(true_labels == predicted_labels)\n", + " accuracy = correct_predictions / len(true_labels)\n", + "\n", + " return accuracy\n", + "\n", + "accuracy = calculate_accuracy(test_labels, predicted_labels)\n", + "print(f\"Accuracy: {accuracy:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "9p0fFXKTZz9Q", + "metadata": { + "id": "9p0fFXKTZz9Q" + }, + "source": [ + "The results aren't that good, which we can attribute to the small dataset size." + ] + }, + { + "cell_type": "markdown", + "id": "e681864c-f15a-40a6-ac34-0e631d68d5c8", + "metadata": { + "id": "e681864c-f15a-40a6-ac34-0e631d68d5c8" + }, + "source": [ + "### 7. Parameter Efficient Fine-Tuning Techniques" + ] + }, + { + "cell_type": "markdown", + "id": "9141fabe-417b-4fbb-bd3e-244ad84e3010", + "metadata": { + "id": "9141fabe-417b-4fbb-bd3e-244ad84e3010" + }, + "source": [ + "In this section, we demonstrate how to employ parameter-efficient fine-tuning (PEFT) techniques to adapt a pre-trained model for specific genomics tasks using the PEFT library." 
+ ] + }, + { + "cell_type": "markdown", + "id": "71b8a749-461e-4533-b1d0-cebc924d3dc0", + "metadata": { + "id": "71b8a749-461e-4533-b1d0-cebc924d3dc0" + }, + "source": [ + "The LoraConfig object is instantiated to configure the PEFT parameters:\n", + "\n", + "- task_type: Specifies the type of task, e.g. sequence classification (SEQ_CLS). It is not set in the configuration used in this notebook.\n", + "- r: The rank of the LoRA matrices.\n", + "- lora_alpha: Scaling factor for the LoRA update (the low-rank update is scaled by lora_alpha / r).\n", + "- target_modules: Names of the modules to which the LoRA adapters are applied (query, key, value in this example).\n", + "- lora_dropout: Dropout probability applied in the LoRA layers during fine-tuning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "021641ae-f604-4d69-8724-743b7d7c613c", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "021641ae-f604-4d69-8724-743b7d7c613c", + "outputId": "d7c41fca-1c6b-46fd-9116-01f42d1d6ddf" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "DNA_LM(\n", + " (model): BertModel(\n", + " (embeddings): BertEmbeddings(\n", + " (word_embeddings): Embedding(5504, 768, padding_idx=0)\n", + " (position_embeddings): Embedding(512, 768)\n", + " (token_type_embeddings): Embedding(2, 768)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (encoder): BertEncoder(\n", + " (layer): ModuleList(\n", + " (0-11): 12 x BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSdpaSelfAttention(\n", + " (query): lora.Linear(\n", + " (base_layer): Linear(in_features=768, out_features=768, bias=True)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.01, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=768, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=768, bias=False)\n", + " )\n", + " (lora_embedding_A): 
ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " )\n", + " (key): lora.Linear(\n", + " (base_layer): Linear(in_features=768, out_features=768, bias=True)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.01, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=768, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=768, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " )\n", + " (value): lora.Linear(\n", + " (base_layer): Linear(in_features=768, out_features=768, bias=True)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.01, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=768, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=768, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " (intermediate_act_fn): GELUActivation()\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (classifier): Linear(in_features=768, out_features=2, bias=True)\n", + ")" + ] + }, + "execution_count": 40, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "# Number of classes for your classification task\n", + "num_labels = 2\n", + "classification_model = DNA_LM(lm, num_labels)\n", + "classification_model.to(device);" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "6c223937-86ea-42ef-991a-050f23b21ef9", + "metadata": { + "id": "6c223937-86ea-42ef-991a-050f23b21ef9" + }, + "outputs": [], + "source": [ + "from peft import LoraConfig, TaskType\n", + "\n", + "peft_config = LoraConfig(\n", + " r=8,\n", + " lora_alpha=32,\n", + " target_modules=[\"query\", \"key\", \"value\"],\n", + " lora_dropout=0.01,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "e7a9fe7d-e3ac-4ffa-9a9b-2067fb09b885", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "e7a9fe7d-e3ac-4ffa-9a9b-2067fb09b885", + "outputId": "02a6c65f-7474-4bc1-bfab-c05532e350a5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 442,368 || all params: 90,121,730 || trainable%: 0.4909\n" + ] + } + ], + "source": [ + "from peft import get_peft_model\n", + "\n", + "peft_model = get_peft_model(classification_model, peft_config)\n", + "peft_model.print_trainable_parameters()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "22064519-eaab-4142-8618-d1210d05c6bd", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "22064519-eaab-4142-8618-d1210d05c6bd", + "outputId": "ca3f764d-cdb4-4525-c541-8eabfb4cde57" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "PeftModel(\n", + " (base_model): LoraModel(\n", + " (model): DNA_LM(\n", + " (model): BertModel(\n", + " (embeddings): BertEmbeddings(\n", + " (word_embeddings): Embedding(5504, 768, padding_idx=0)\n", + " (position_embeddings): Embedding(512, 768)\n", + " (token_type_embeddings): Embedding(2, 768)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + 
" (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (encoder): BertEncoder(\n", + " (layer): ModuleList(\n", + " (0-11): 12 x BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSdpaSelfAttention(\n", + " (query): lora.Linear(\n", + " (base_layer): Linear(in_features=768, out_features=768, bias=True)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.01, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=768, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=768, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " )\n", + " (key): lora.Linear(\n", + " (base_layer): Linear(in_features=768, out_features=768, bias=True)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.01, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=768, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=768, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " )\n", + " (value): lora.Linear(\n", + " (base_layer): Linear(in_features=768, out_features=768, bias=True)\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.01, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=768, out_features=8, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=8, out_features=768, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " )\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, 
elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " (intermediate_act_fn): GELUActivation()\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (classifier): Linear(in_features=768, out_features=2, bias=True)\n", + " )\n", + " )\n", + ")" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "peft_model" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "d3812e96-6b49-4911-8b21-d8871b7c06a5", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 268 + }, + "id": "d3812e96-6b49-4911-8b21-d8871b7c06a5", + "outputId": "8d497e30-1d3f-457a-f62a-244731698cb2" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [65/65 01:39, Epoch 5/5]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EpochTraining LossValidation Loss
10.6257000.777132
20.7172000.773871
30.7682000.771541
40.6874000.769679
50.5520000.768947

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=65, training_loss=0.74742647592838, metrics={'train_runtime': 100.8429, 'train_samples_per_second': 9.916, 'train_steps_per_second': 0.645, 'total_flos': 0.0, 'train_loss': 0.74742647592838, 'epoch': 5.0})" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define training arguments\n", + "training_args = TrainingArguments(\n", + " output_dir='./results',\n", + " eval_strategy=\"epoch\",\n", + " learning_rate=2e-5,\n", + " per_device_train_batch_size=16,\n", + " per_device_eval_batch_size=16,\n", + " num_train_epochs=5,\n", + " weight_decay=0.01,\n", + " eval_steps=1,\n", + " logging_steps=1,\n", + ")\n", + "\n", + "# Initialize Trainer\n", + "trainer = Trainer(\n", + " model=peft_model.model,\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=val_dataset,\n", + " tokenizer=tokenizer,\n", + " data_collator=data_collator,\n", + ")\n", + "\n", + "# Train the model\n", + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "id": "76dbd948-d919-4ade-a405-cec297979577", + "metadata": { + "id": "76dbd948-d919-4ade-a405-cec297979577" + }, + "source": [ + "### 8. 
Evaluate PEFT Model" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "58cf70ba-47d5-4111-bb12-830ae04c6285", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 124 + }, + "id": "58cf70ba-47d5-4111-bb12-830ae04c6285", + "outputId": "0abc56a9-bd68-4e4e-9f13-756e8c9ffa3e" + }, + "outputs": [ + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1 0 1 0 0 1 1 0 1 1 1 1 0 1 1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 1\n", + " 1 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 1 1 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1\n", + " 1 0 1 0 0 1 1 0 1 0 1 0 1 0 0 1 1 0 0 0 1 1 1 0 1 1 0 1 0 0 1 1 0 1 1 1 0\n", + " 1 1 0 0 1 0 1 1 1 0 1 1 0 1 1 0 0 0 0 1 1 0 1 1 1 1 1 0 1 0 1 0 1 1 0 1 1\n", + " 0 1 1 1 1 1 1 1 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 1 1 1 1 1 0 0 1 0 1 0 1 0\n", + " 0 1 1 0 0 0 1 0 1 1 1 0 1 1 0]\n" + ] + } + ], + "source": [ + "# Generate predictions\n", + "\n", + "predictions = trainer.predict(test_dataset)\n", + "logits = predictions.predictions\n", + "predicted_labels = logits.argmax(axis=-1)\n", + "print(predicted_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "4bd38fe5-6513-4c88-afee-0cc4e1781fdd", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4bd38fe5-6513-4c88-afee-0cc4e1781fdd", + "outputId": "a50a91d0-d04d-4620-9006-868716bb992d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.52\n" + ] + } + ], + "source": [ + "def calculate_accuracy(true_labels, predicted_labels):\n", + "\n", + " assert len(true_labels) == len(predicted_labels), \"Arrays must have the same length\"\n", + " correct_predictions = np.sum(true_labels == predicted_labels)\n", + " accuracy = correct_predictions / len(true_labels)\n", + "\n", + " return accuracy\n", + "\n", + "accuracy = 
calculate_accuracy(test_labels, predicted_labels)\n", + "print(f\"Accuracy: {accuracy:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4ba5af69", + "metadata": {}, + "source": [ + "As we can see, the PEFT model achieves similar performance to the baseline model, demonstrating the effectiveness of PEFT in adapting pre-trained models to specific tasks with limited computational resources.\n", + "\n", + "With PEFT, we only train 442,368 parameters, which is 0.49% of the total parameters in the model. This is a significant reduction in computational resources compared to fine-tuning all of the model's parameters.\n", + "\n", + "We can improve the results by using a larger dataset, fine-tuning the model for more epochs or changing the hyperparameters (rank, learning rate, etc.).\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "03bba232d3974119acf8031bc086a072": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9107f7bfc8d3483390f802b0458e9380", + "IPY_MODEL_f5c80fa70ead4c86aa3b2a046061b901", + "IPY_MODEL_57966a469ca1458daab74e81672ae855" + ], + "layout": "IPY_MODEL_1464502dc3dd46308be8b4fcc9d5ddb9" + } + }, + 
"078c6877377a491d97d6fadd27064a76": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0838d19b226d486285a26ce0b04d7e15": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "12f1de7122a7471e90f01d9e7be81178": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": 
null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1464502dc3dd46308be8b4fcc9d5ddb9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "17859b793a304e389d1ea0b9ccc3646f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_86bcccb842244f4f9add58f62facaace", + "placeholder": "​", + "style": "IPY_MODEL_78b5bbf4c8ac4fe5961776fded4d5798", + "value": "Generating test split: 100%" + } + }, + "1afa6e9b69c74136863b7747e62a0608": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, 
+ "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2d5466a5e98849c5a09f16faa98f91da": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_39640709e7174f84a50da05764abbf99", + "placeholder": "​", + "style": "IPY_MODEL_7114a029e75c4ed5b966eddd3a3c919d", + "value": " 1497/1497 [00:00<00:00, 41394.98 examples/s]" + } + }, + "33be6b0ca8fd44188f834a48a9574a72": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_12f1de7122a7471e90f01d9e7be81178", + "max": 390606, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_dad286d42a514c9ca6bb01bfe9e9c4be", + "value": 390606 + } + }, + "34921fd116cc42b7b530174d9f61e71e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", 
+ "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c80062a855cb41a28ac625ab03635da2", + "max": 1497, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_aecd740c17c84d45b0615d4fc4196035", + "value": 1497 + } + }, + "39640709e7174f84a50da05764abbf99": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4d4ce0d35c124690b3427e84a9a128b1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": 
null, + "layout": "IPY_MODEL_078c6877377a491d97d6fadd27064a76", + "placeholder": "​", + "style": "IPY_MODEL_d46ee1c39bac44c2b541a88c883de1cb", + "value": "Downloading data: 100%" + } + }, + "57966a469ca1458daab74e81672ae855": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1afa6e9b69c74136863b7747e62a0608", + "placeholder": "​", + "style": "IPY_MODEL_0838d19b226d486285a26ce0b04d7e15", + "value": " 3.50M/3.50M [00:00<00:00, 26.3MB/s]" + } + }, + "57c9af47364d48ffbb4ffbdd2c951ede": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + 
"overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "682644a713b145f0b2dcff99790c6d4d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6d80dec073e449efba272fa9f3527922": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7114a029e75c4ed5b966eddd3a3c919d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + 
"state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "74e9bc1ead434ae78077df6b85f1df58": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c028ed977b5e479fbd93b8add588a6dc", + "placeholder": "​", + "style": "IPY_MODEL_6d80dec073e449efba272fa9f3527922", + "value": " 391k/391k [00:00<00:00, 3.34MB/s]" + } + }, + "78b5bbf4c8ac4fe5961776fded4d5798": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7bdab33f4b244fc89408b91755bf17c5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4d4ce0d35c124690b3427e84a9a128b1", + 
"IPY_MODEL_33be6b0ca8fd44188f834a48a9574a72", + "IPY_MODEL_74e9bc1ead434ae78077df6b85f1df58" + ], + "layout": "IPY_MODEL_e1acc6e70b9246a5b063b3e262f01c81" + } + }, + "86bcccb842244f4f9add58f62facaace": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8f20ed2b74d84e80a8d403793354adea": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9107f7bfc8d3483390f802b0458e9380": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_92f64c7e088342b9b3c070ba7a295ed0", + "placeholder": "​", + "style": "IPY_MODEL_ab0aa8af3816422e9d97934f12af842c", + "value": "Downloading data: 100%" + } + }, + "92f64c7e088342b9b3c070ba7a295ed0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "952397f9c91c480184fa57e175ab1b4c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9b9b9d573d44464f9a6f5030a40245fe": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ab0aa8af3816422e9d97934f12af842c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + 
"aecd740c17c84d45b0615d4fc4196035": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "bd5273325a4b453e8053d98a09fe9493": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c028ed977b5e479fbd93b8add588a6dc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c311b777514f41ef986756a386c0bb34": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e2e4bf053ce442f6aee6ffab5f76525f", + "IPY_MODEL_c88cf701e20b4354a63ac7d8645d1df9", + "IPY_MODEL_f71c252ada474be882b0335ed9a0a1c3" + ], + "layout": "IPY_MODEL_e059c665229e46ea905dcbd6fc179c88" + } + }, + "c80062a855cb41a28ac625ab03635da2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c88cf701e20b4354a63ac7d8645d1df9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_57c9af47364d48ffbb4ffbdd2c951ede", + "max": 13468, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_fa9d75fcb1d5400c8ca1d1d13d28d0c7", + "value": 13468 + } + }, + "d46ee1c39bac44c2b541a88c883de1cb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dad286d42a514c9ca6bb01bfe9e9c4be": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e059c665229e46ea905dcbd6fc179c88": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e113a50f8ed2410ca12ce7cb38a1681d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e1acc6e70b9246a5b063b3e262f01c81": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e2e4bf053ce442f6aee6ffab5f76525f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + 
"_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bd5273325a4b453e8053d98a09fe9493", + "placeholder": "​", + "style": "IPY_MODEL_8f20ed2b74d84e80a8d403793354adea", + "value": "Generating train split: 100%" + } + }, + "ec165fdbe87a4b00a6c288ef1e85c0a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_17859b793a304e389d1ea0b9ccc3646f", + "IPY_MODEL_34921fd116cc42b7b530174d9f61e71e", + "IPY_MODEL_2d5466a5e98849c5a09f16faa98f91da" + ], + "layout": "IPY_MODEL_952397f9c91c480184fa57e175ab1b4c" + } + }, + "f5c80fa70ead4c86aa3b2a046061b901": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ff89a891bd9c42a8be164587a94ccac1", + "max": 3495021, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e113a50f8ed2410ca12ce7cb38a1681d", + "value": 3495021 + } + }, + "f71c252ada474be882b0335ed9a0a1c3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_682644a713b145f0b2dcff99790c6d4d", + "placeholder": "​", + "style": "IPY_MODEL_9b9b9d573d44464f9a6f5030a40245fe", + "value": " 13468/13468 [00:00<00:00, 193879.37 examples/s]" + } + }, + "fa9d75fcb1d5400c8ca1d1d13d28d0c7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ff89a891bd9c42a8be164587a94ccac1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + 
"overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/dora_finetuning/QDoRA_finetuning.ipynb b/peft/examples/dora_finetuning/QDoRA_finetuning.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..e0582a9eff8437a27c38bde39a675f9b787efb70 --- /dev/null +++ b/peft/examples/dora_finetuning/QDoRA_finetuning.ipynb @@ -0,0 +1,8545 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "CV_gQs58bsvM" + }, + "source": [ + "# Fine-tuning [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) Dataset using QDora (quantized Lora w/ use_dora=True)." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FuXIFTFapAMI", + "outputId": "b95d8260-65bd-405f-f1e2-8d353aa46814" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m119.8/119.8 MB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.3/21.3 MB\u001b[0m \u001b[31m37.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Building wheel for transformers (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m309.4/309.4 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for peft (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Building wheel for accelerate (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m547.8/547.8 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.8/40.8 MB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m16.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m64.9/64.9 kB\u001b[0m \u001b[31m10.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m27.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 16.1.0 which is incompatible.\n", + "google-colab 1.0.0 requires requests==2.31.0, but you have requests 2.32.3 which is incompatible.\n", + "ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 16.1.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "# Install the libraries\n", + "!pip install -q -U bitsandbytes\n", + "!pip install -q -U git+https://github.com/huggingface/transformers.git\n", + "!pip install -q -U git+https://github.com/huggingface/peft.git\n", + "!pip install -q -U git+https://github.com/huggingface/accelerate.git\n", + "!pip install -q datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 145, + "referenced_widgets": [ + "8cc86330c2af436c9af314e8c04c8c2b", + "e25f9ca445b14e3f8397779df071dfb4", + "8365680c634a44aa880317e36fa5e46e", + "0c4ac7c3db0b431397cc812f7c9e785c", + "c84f542c863043dea8a3675fa153e78d", + "b3b3f4ddd4ed4d938c923887939a0440", + "35186465f87341f683affb9399661540", + "791df472db174df69b8c9f0e200af254", + "6bb9c7182d2a464ea21809e59043562a", + "31c574113731403b88edc5bb0798bc6d", + "3b8bc5b9392e45758813a1db9db824a9", + "90661b333d6f496ca606b3046622660e", + "5f551f9b217e44cf8b5433f314b3844b", + "d2d81cc8296c4b10bf80b86c0a3302d3", + "7e3a386e672f4748882211227b7721a9", + "57f251691b4c453896b2508c431dfc2f", + "4bdb196cd1494f809829651ec5b6cbf8", + "7cd50bcc8fcc4b83abcda6d3604bd4cc", + "7a00aa4a97a34da39cc052c6926dbe13", + "14c73d88df9e46e3bbb6690fdb48ad07", + "0e2beab611114239b6ee48a3cbb09c49", + "006b78b5191b4fb888d98bdf6c20ec1e", + "5f6ffa1d929443a5bd9c7c550f0690f0", + "668a7f88506148a9ba2b48920afc028f", + "57b0096985ab44aea342e52795c4f999", + "a4c404e420cc4ce781ce569f9ab3f987", + "ee4e4af964ec4dd597cb04a90f0697f9", + 
"974e3687f18a4e1a975969b880d086aa", + "93a50117ece543d4857ba02505dc4514", + "71a3a56edbdb45669d382fef4b097e1b", + "53f287d4927541d08e2ae7d4d0b3c396", + "afa442ab223b46cb82569438c0047823" + ] + }, + "id": "wAAPv5CRmg7e", + "outputId": "687f979a-04c1-4160-d71c-4de8ecdb07d9" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8cc86330c2af436c9af314e8c04c8c2b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='

\n", + " \n", + " \n", + " [10/10 03:56, Epoch 0/1]\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
11.276300
21.877700
31.983800
42.011400
51.997800
61.648100
71.576000
80.916400
91.523100
101.814500

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=10, training_loss=1.662518608570099, metrics={'train_runtime': 269.3407, 'train_samples_per_second': 0.149, 'train_steps_per_second': 0.037, 'total_flos': 530537216679936.0, 'train_loss': 1.662518608570099, 'epoch': 0.004062563477554337})" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import transformers\n", + "\n", + "tokenizer.pad_token = tokenizer.eos_token\n", + "\n", + "trainer = transformers.Trainer(\n", + " model=model,\n", + " train_dataset=data[\"train\"],\n", + " args=transformers.TrainingArguments(\n", + " per_device_train_batch_size=1,\n", + " gradient_accumulation_steps=4,\n", + " warmup_steps=2,\n", + " max_steps=10,\n", + " learning_rate=2e-4,\n", + " fp16=True,\n", + " logging_steps=1,\n", + " output_dir=\"path/to/your/HF/repo\", # change it to your desired repo!\n", + " optim=\"paged_adamw_8bit\",\n", + " ),\n", + " data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n", + ")\n", + "model.config.use_cache = False # silence the warnings. 
Please re-enable for inference!\n", + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mr3rLrHwqhf6" + }, + "source": [ + "## Usage Example" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": true, + "id": "9mrOJ9l8SMHv" + }, + "outputs": [], + "source": [ + "model.config.use_cache = True\n", + "model.eval();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 122 + }, + "id": "AM6FNOFzqKfI", + "outputId": "fdbe28b1-e440-45d3-bd6d-c15e744ad23d" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n" + ] + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "\"A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. ### Human: What is the purpose of quantization in LLMs?### Assistant: Quantization is a technique used to reduce the size of a model without significantly impacting its performance. In the context of language models, quantization is the process of converting floating-point numbers (which are used to represent the weights and activations of a model) to smaller, fixed-point numbers. This can be done by grouping the weights into small chunks and assigning each chunk a single, fixed-point number. Quantization can significantly reduce the size of a model, making it more efficient to train and deploy. 
In addition, quantization can improve the performance of a model on low-power devices, such as mobile phones\"" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from transformers import GenerationConfig\n", + "\n", + "max_new_tokens = 120\n", + "top_p = 0.9\n", + "temperature = 0.7\n", + "user_question = \"What is the purpose of quantization in LLMs?\"\n", + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "\n", + "\n", + "prompt = (\n", + " \"A chat between a curious human and an artificial intelligence assistant. \"\n", + " \"The assistant gives helpful, detailed, and polite answers to the user's questions. \"\n", + " \"### Human: {user_question}\"\n", + " \"### Assistant: \"\n", + ")\n", + "\n", + "\n", + "def generate(model, user_question, max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature):\n", + " inputs = tokenizer(prompt.format(user_question=user_question), return_tensors=\"pt\").to(device)\n", + "\n", + " outputs = model.generate(\n", + " **inputs,\n", + " generation_config=GenerationConfig(\n", + " do_sample=True,\n", + " max_new_tokens=max_new_tokens,\n", + " top_p=top_p,\n", + " temperature=temperature,\n", + " ),\n", + " )\n", + "\n", + " text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + " # print(text)\n", + " return text\n", + "\n", + "\n", + "generate(model, user_question)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "T5t_gl2_f5OO" + }, + "outputs": [], + "source": [ + "# trainer.push_to_hub()" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": 
"python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.1.-1" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "00371a48e64c45cd97020a78b710e64c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3f9fa554747743f8a86b40a4f7530617", + "placeholder": "​", + "style": "IPY_MODEL_ffe561df8772443ebf40a3b8b656079f", + "value": " 50.6k/50.6k [00:00<00:00, 3.65MB/s]" + } + }, + "006b78b5191b4fb888d98bdf6c20ec1e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ee4e4af964ec4dd597cb04a90f0697f9", + "placeholder": "​", + "style": "IPY_MODEL_974e3687f18a4e1a975969b880d086aa", + "value": "Your token has been saved in your configured git credential helpers (store)." 
+ } + }, + "026072374b7d47c194707a50f5c99099": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "02d6cc4c2717434c895798601bda7c86": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "03dd6c24f6d94fe7ab85b79d6f6cbeaf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": 
"1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cc5ce633746949ed98418cae9f68afe3", + "placeholder": "​", + "style": "IPY_MODEL_4b1f795c4c004cacbf3660d935e52995", + "value": "Downloading data: 100%" + } + }, + "04188e0cec0542818894ebc6a534fb51": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "061c45266c484ff6807dcaf4722fd73b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "068eb104d5d346b1897f8cbe9860d267": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0711e28e06a440c2a241acbc1f90d1e8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f4ca7b63d7d749ff83a848e250f03ec1", + "placeholder": "​", + "style": 
"IPY_MODEL_8c149bc655a34fe5b91853c66db458a9", + "value": "model.safetensors.index.json: 100%" + } + }, + "076357d4bb9943bdaa1d6846897786af": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "07e0aed682fd4cc88fa75c0592dc04a7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_47944ad8cadf4a57b170193c46d4389c", + "placeholder": "​", + "style": "IPY_MODEL_db05b25cb38140bdb21e6f3b7fde7e66", + "value": 
"model-00001-of-00004.safetensors: 100%" + } + }, + "082b6990ce5e4812adc0ad6a7b376dac": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0aad2d9d1cba40cbb64308ede3242ed7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0711e28e06a440c2a241acbc1f90d1e8", + "IPY_MODEL_77704d2e27e94cd3a0c5f6b5ceeffd1c", + "IPY_MODEL_3b82b8d41b134bec9bd77ed8d4f00eb4" + ], + "layout": "IPY_MODEL_25fef90e209f4b14a73f3e39d226d913" + } + }, + 
"0b145e421f4840f2872c29256b49f168": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0c4ac7c3db0b431397cc812f7c9e785c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "CheckboxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "Add token as git credential?", + "description_tooltip": null, + "disabled": false, + "indent": true, + "layout": "IPY_MODEL_90661b333d6f496ca606b3046622660e", + "style": "IPY_MODEL_5f551f9b217e44cf8b5433f314b3844b", + "value": true + } + }, + "0d2ae3466a3447c58e23ccd2b3733deb": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_921a1a037f7b47f8b57d1da8192a437a", + "placeholder": "​", + "style": "IPY_MODEL_892ff4e2f0e44c23bc5c2be7547cf0bd", + "value": "generation_config.json: 100%" + } + }, + "0e2beab611114239b6ee48a3cbb09c49": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_57b0096985ab44aea342e52795c4f999", + "placeholder": "​", + "style": "IPY_MODEL_a4c404e420cc4ce781ce569f9ab3f987", + "value": "Token is valid (permission: write)." 
+ } + }, + "0ec2643d9fd44785addb37d9ecd23989": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0f4c664612364dc89acf78eb1c740980": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0f54e8fda93144f6a95493e6ec535e9d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0f60f9aa76b941809e013ffcae83604a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "124a70bfad434c5c946f611c04a91c8f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": 
null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "14bf612f6ad7416c8ddd6085c72eee0e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "14c73d88df9e46e3bbb6690fdb48ad07": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1561cd47c42e46368677d34e7b7084cd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_04188e0cec0542818894ebc6a534fb51", + "placeholder": "​", + "style": "IPY_MODEL_4a13203d132b45beadf140c02dc8a566", + "value": " 518/518 [00:00<00:00, 976.36 examples/s]" + } + }, + "156f95b0012449e8a0c604e6e03bf35f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + 
"left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "17c797e08bd2493fa685918129415309": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "191caf3a38eb4191a35f623ce25238f9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1ae1d2702da5483a85504f59939ffa39": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8cd63d3908e4411c9fcb42bc32c8dd16", + "max": 73, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_64624b26145b42db82f7afc36c32e117", + "value": 73 + } + }, + "1b2abf90003e4165a3293acd6a5ea9ff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + 
"description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cffdf12fbe97462ab74e88ccca943aeb", + "placeholder": "​", + "style": "IPY_MODEL_bcaf4c81ba9d437bb6223dbb22d011ed", + "value": " 73.0/73.0 [00:00<00:00, 4.75kB/s]" + } + }, + "1bd0a270c7ee409c970763398e54fc36": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_076357d4bb9943bdaa1d6846897786af", + "placeholder": "​", + "style": "IPY_MODEL_7b3e136fc9e74a699497a947006f4f1d", + "value": " 1.11M/1.11M [00:00<00:00, 8.23MB/s]" + } + }, + "1d27ab2bc6ae463a806292b68b7891f8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1f0efc167b3744b38ff832b71d529318": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_64911f0e52e74067a1a986c5edfc7f59", + "placeholder": "​", + "style": "IPY_MODEL_b4a274fc9e324b80bf559c4dbd05e319", + "value": "Downloading readme: 100%" + } + }, + "1f59dd66813f419999336e59a3efc56a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + 
"padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1f75d85e6c7e4eb6a91b03f0c8adb644": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1f82a5685eef4b47a2dbf7618362907c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "21bf14b771c14d2dab9e98a326302e14": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "23012118a7314a3f838870a2aee9ec90": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e297072ab5d64815b90bc89d22503378", + "placeholder": "​", + "style": "IPY_MODEL_67fbabb9082c4241b8f937b24e0cdd03", + "value": " 395/395 [00:00<00:00, 16.6kB/s]" + } + }, + "2540d57e3bf545e3812da1ee72b85fc8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2558c2dd7d394ecf9fc67a69ce8fc97a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "25fc6aaf37fc49fa822df29236bf2f90": { + "model_module": "@jupyter-widgets/base", + 
"model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "25fef90e209f4b14a73f3e39d226d913": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": 
null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "27a587021d854b79a279a510a55f9d73": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d4092198673141d3b4a824d629d73f64", + "placeholder": "​", + "style": "IPY_MODEL_d3cbfd564fe8485ba7afdb1cc54abed3", + "value": "Map: 100%" + } + }, + "2a57bb48e1c6475abba242994a79d44a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2ab86b3fbd49488bb02f8205a572e752": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + 
"description_width": "" + } + }, + "30b74bd2db8d40d08408013cebcd7661": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "317cda72329c4043ab0b224b46b259d3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "31c10fa464e24f97b379675a204a09b5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1d27ab2bc6ae463a806292b68b7891f8", + "max": 654, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2a57bb48e1c6475abba242994a79d44a", + "value": 654 + } + }, + "31c574113731403b88edc5bb0798bc6d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", 
+ "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "347540dc03d34e65b7ffbb0f5fc569aa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "34e381adbd9242759b57f2a305c5d2e3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "35186465f87341f683affb9399661540": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": "center", + "align_self": null, + "border": null, + "bottom": null, + "display": 
"flex", + "flex": null, + "flex_flow": "column", + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "50%" + } + }, + "353bf45a4bbc46d6a798175f152399cb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_740604526cc44cd58b811827d4787d96", + "placeholder": "​", + "style": "IPY_MODEL_2558c2dd7d394ecf9fc67a69ce8fc97a", + "value": "Generating test split: 100%" + } + }, + "35c2c635c2024bcda3265bf95d330f63": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "37523a6cac1047e9a261698212d47737": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_191caf3a38eb4191a35f623ce25238f9", + "max": 20877686, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_30b74bd2db8d40d08408013cebcd7661", + "value": 20877686 + } + }, + "3b614b9712874fac990d2c557b0791a6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3b82b8d41b134bec9bd77ed8d4f00eb4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_14bf612f6ad7416c8ddd6085c72eee0e", + "placeholder": "​", + "style": "IPY_MODEL_f7e59b47f9b74523843f37268212d566", + "value": " 23.9k/23.9k [00:00<00:00, 1.51MB/s]" + } + }, + "3b8bc5b9392e45758813a1db9db824a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3d9d8278667d496aaea1eaaa4d24ae93": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": 
null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3e45aea9f7444a4db885c4cca4c9c4ff": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3f9fa554747743f8a86b40a4f7530617": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + 
"bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4227474e986546d1a7d31dce35a2410c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7e51e8e0612e46b1a3403d448b39aa50", + "max": 518, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_061c45266c484ff6807dcaf4722fd73b", + "value": 518 + } + }, + "42eb041021214110a860924d28d73409": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_4c369386ba5f4862b11a50e50130663b", + "placeholder": "​", + "style": "IPY_MODEL_bbdf3bb657e64fc2b0a90e78e8886480", + "value": " 5.00G/5.00G [00:24<00:00, 249MB/s]" + } + }, + "434fe18d50a14920b30fd2d0650297ac": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_353bf45a4bbc46d6a798175f152399cb", + "IPY_MODEL_bfcbfe4184774fd3a8320f4f0e1baf54", + "IPY_MODEL_99c5c846cc5e43429905f071670b4310" + ], + "layout": "IPY_MODEL_8d988c86648244788f6dc5aa0fea38fd" + } + }, + "43d12a98d90a4bf7a96c033172c646e2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c35b16156253402f90a432f3f07c2e0a", + "IPY_MODEL_1ae1d2702da5483a85504f59939ffa39", + "IPY_MODEL_1b2abf90003e4165a3293acd6a5ea9ff" + ], + "layout": "IPY_MODEL_3e45aea9f7444a4db885c4cca4c9c4ff" + } + }, + "47944ad8cadf4a57b170193c46d4389c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, 
+ "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4a13203d132b45beadf140c02dc8a566": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4a7c8dfd88db4bc893da2bced0560d47": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_27a587021d854b79a279a510a55f9d73", + "IPY_MODEL_4227474e986546d1a7d31dce35a2410c", + "IPY_MODEL_1561cd47c42e46368677d34e7b7084cd" + ], + "layout": "IPY_MODEL_8b9e961c837a464fb7a8c44756dc41e7" + } + }, + "4b1f795c4c004cacbf3660d935e52995": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4bc1fd9d480a4799954c69031c071b30": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_068eb104d5d346b1897f8cbe9860d267", + "placeholder": "​", + "style": "IPY_MODEL_b621c6a8c0e9440fa840d75a1b1b02fc", + "value": " 9846/9846 [00:00<00:00, 38881.08 examples/s]" + } + }, + "4bdb196cd1494f809829651ec5b6cbf8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4c369386ba5f4862b11a50e50130663b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + 
"align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4c8e98294bd240a6869cb199caee66e1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + 
"top": null, + "visibility": null, + "width": null + } + }, + "4e1f5423311b4dc0930c21c9ad5a88f5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "51180cce01564821a170d1d4b8a9a918": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_03dd6c24f6d94fe7ab85b79d6f6cbeaf", + "IPY_MODEL_f2ab2fa803e94328a237e84cd4ea0027", + "IPY_MODEL_1bd0a270c7ee409c970763398e54fc36" + ], + "layout": "IPY_MODEL_e92b30d0b4234af6b5a33bff989b1b45" + } + }, + "51b3af446ace409dbcdf5de499552061": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "53f287d4927541d08e2ae7d4d0b3c396": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "546b76a22f1046cd856a8fa2f9ff2d9f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5690d92586494b9187147f32fa708405": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": 
null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "57b0096985ab44aea342e52795c4f999": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "57f251691b4c453896b2508c431dfc2f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": 
null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "57f87d4780634d36ae8159d987c22993": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_58ea619f81bf42ddb8b166db3deb0e86", + "IPY_MODEL_8bb83ae3229e4f38b1733f92f536fad0", + "IPY_MODEL_5c0104210ee34ca8a072ee5121f424a1" + ], + "layout": "IPY_MODEL_34e381adbd9242759b57f2a305c5d2e3" + } + }, + "58ea619f81bf42ddb8b166db3deb0e86": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e4e1a4338c5e46b3ba5a3bb960da7107", + "placeholder": "​", + "style": 
"IPY_MODEL_9a5072b8d16d4a1eb0652da61bda0ac8", + "value": "Loading checkpoint shards: 100%" + } + }, + "5924b266e95a42039634a334ff561a82": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_eadeec171e7b4c0f9e26964f031cfb71", + "IPY_MODEL_feae525923d5407bb69a922954c474f7", + "IPY_MODEL_00371a48e64c45cd97020a78b710e64c" + ], + "layout": "IPY_MODEL_156f95b0012449e8a0c604e6e03bf35f" + } + }, + "5962e77eea5a4d88ba6dbc5e9f51c709": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": 
null, + "width": null + } + }, + "5a8ac674153248999007a713299b2644": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5b2a671976fa446db408d58a215b8249": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_63899ac621ff4e9cb8e215d5ab63bef8", + "max": 4, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6b2b59d2b62b4f7da8c60ff783138397", + "value": 4 + } + }, 
+ "5b56ac3009714a5a84dd8749db4a7bce": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_51b3af446ace409dbcdf5de499552061", + "placeholder": "​", + "style": "IPY_MODEL_9a12124915994b70a71ebd64b99e93e9", + "value": " 1.17G/1.17G [00:09<00:00, 45.8MB/s]" + } + }, + "5c0104210ee34ca8a072ee5121f424a1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_604582e8cbff4dc9876551a3307b5b77", + "placeholder": "​", + "style": "IPY_MODEL_a98165ee656643ad85ac9ea1447cc775", + "value": " 4/4 [01:13<00:00, 15.74s/it]" + } + }, + "5cf4a57d21a545029b6448258a5ebd84": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0d2ae3466a3447c58e23ccd2b3733deb", + "IPY_MODEL_ba7f32c41f9247ec9d4c40e6396b55a9", + "IPY_MODEL_ea1bdb5f2da64332960bccd967a84b4a" + ], + "layout": 
"IPY_MODEL_b08631e4cffa445c912da0c8eac2ef23" + } + }, + "5f551f9b217e44cf8b5433f314b3844b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5f60910d1e744432bdf87518f0f45874": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5f6ffa1d929443a5bd9c7c550f0690f0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", 
+ "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_93a50117ece543d4857ba02505dc4514", + "placeholder": "​", + "style": "IPY_MODEL_71a3a56edbdb45669d382fef4b097e1b", + "value": "Your token has been saved to /root/.cache/huggingface/token" + } + }, + "5fb4a4ef8afe4ea4af6655faea17f354": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "604582e8cbff4dc9876551a3307b5b77": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "610e1ddfb7a44d51a54ebea6dad3a5f0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6149752353fe4f9cbb7b26bcc25199a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6cb8065803724d80b82b06dc95ded91e", + "placeholder": "​", 
+ "style": "IPY_MODEL_8301c6302df54bbc9f15295f11cec208", + "value": " 654/654 [00:00<00:00, 46.9kB/s]" + } + }, + "63899ac621ff4e9cb8e215d5ab63bef8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "63ac7dafeb27446cb30aaddf4cd27c9f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "64624b26145b42db82f7afc36c32e117": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "64911f0e52e74067a1a986c5edfc7f59": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "661f76474252493caae8f7d6aa8f99b7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + 
"description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_83c355e1418140a5bbad11bf0646b332", + "placeholder": "​", + "style": "IPY_MODEL_84d6d2a6afcd423f9b609cbb2d10f00e", + "value": " 20.9M/20.9M [00:00<00:00, 44.7MB/s]" + } + }, + "668a7f88506148a9ba2b48920afc028f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_53f287d4927541d08e2ae7d4d0b3c396", + "placeholder": "​", + "style": "IPY_MODEL_afa442ab223b46cb82569438c0047823", + "value": "Login successful" + } + }, + "67b4473eb8a44a96ba34983762ab38fa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6f474268da0f4337a2ccecc1ca2098a1", + "max": 4976698672, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_c2ceccfdb59b4336a24003cd6bc2403d", + "value": 4976698672 + } + }, + "67fbabb9082c4241b8f937b24e0cdd03": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6b0ec8d5f7294d44a5fa15d8ef12471e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8ea89e52123643268857285e0e1db1c0", + "placeholder": "​", + "style": "IPY_MODEL_a8514e34378d47a28fbf0831a14ede8f", + "value": "tokenizer.json: 100%" + } + }, + "6b2b59d2b62b4f7da8c60ff783138397": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "6b6ed29053ec4aaa8fc5526a35f17c2b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": 
null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6bb9c7182d2a464ea21809e59043562a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6cb8065803724d80b82b06dc95ded91e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": 
null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6f474268da0f4337a2ccecc1ca2098a1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7037c32dfce84e70ac86537dbbc6a495": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_0f4c664612364dc89acf78eb1c740980", + "placeholder": "​", + "style": "IPY_MODEL_0f60f9aa76b941809e013ffcae83604a", + "value": "config.json: 100%" + } + }, + "71a3a56edbdb45669d382fef4b097e1b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7243d8e2e1cc4043a2ee310eabd0ac09": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "740604526cc44cd58b811827d4787d96": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + 
"left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7504986b8d8d4d0da58ad79e80a81948": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7725e9d443e249ada02e5ac7056d00db": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "77704d2e27e94cd3a0c5f6b5ceeffd1c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0f54e8fda93144f6a95493e6ec535e9d", + "max": 23950, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_880124db7dc04aaea09edd75e1ec7921", + "value": 23950 + } + }, + "791df472db174df69b8c9f0e200af254": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + 
"top": null, + "visibility": null, + "width": null + } + }, + "7a00aa4a97a34da39cc052c6926dbe13": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7ac8e88f29f04b859f592a003d39836b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + 
"grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7b3e136fc9e74a699497a947006f4f1d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7cd50bcc8fcc4b83abcda6d3604bd4cc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7a00aa4a97a34da39cc052c6926dbe13", + "placeholder": "​", + "style": "IPY_MODEL_14c73d88df9e46e3bbb6690fdb48ad07", + "value": "Connecting..." 
+ } + }, + "7d3a7be9ed6f48988a2c4a1a4a2271cf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f86b969ef69b48119619e1a424b50460", + "max": 9085698, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_7725e9d443e249ada02e5ac7056d00db", + "value": 9085698 + } + }, + "7e2e097c703a4a0d8556733a0739469c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e0fd6d00f0ba4e59bdaa5779556ec4ea", + "IPY_MODEL_cd318c6bfc8e421a9bfcdab16be5eaa7", + "IPY_MODEL_4bc1fd9d480a4799954c69031c071b30" + ], + "layout": "IPY_MODEL_25fc6aaf37fc49fa822df29236bf2f90" + } + }, + "7e3a386e672f4748882211227b7721a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "7e51e8e0612e46b1a3403d448b39aa50": { + "model_module": "@jupyter-widgets/base", + 
"model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "800e9453214848b69bc4c6ca2d5e8f79": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6b0ec8d5f7294d44a5fa15d8ef12471e", + "IPY_MODEL_7d3a7be9ed6f48988a2c4a1a4a2271cf", + "IPY_MODEL_c40f583823574e40b6b29d4914143c0e" + ], + "layout": "IPY_MODEL_a4368e6da8f046aaa32f3152b7d333d1" + } + }, + "8301c6302df54bbc9f15295f11cec208": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8365680c634a44aa880317e36fa5e46e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "PasswordModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "PasswordModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "PasswordView", + "continuous_update": true, + "description": "Token:", + "description_tooltip": null, + "disabled": false, + "layout": "IPY_MODEL_31c574113731403b88edc5bb0798bc6d", + "placeholder": "​", + "style": "IPY_MODEL_3b8bc5b9392e45758813a1db9db824a9", + "value": "" + } + }, + "83c355e1418140a5bbad11bf0646b332": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + 
"max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "849cdc1912aa4df4b0c721a8c63ca0f9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "84d6d2a6afcd423f9b609cbb2d10f00e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + 
}, + "854e35df771f470b82a59f878a2a6a46": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8800c351b6da450eace0c3890d36c8d7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7037c32dfce84e70ac86537dbbc6a495", + "IPY_MODEL_31c10fa464e24f97b379675a204a09b5", + "IPY_MODEL_6149752353fe4f9cbb7b26bcc25199a9" + ], + "layout": "IPY_MODEL_0b145e421f4840f2872c29256b49f168" + } + }, + "880124db7dc04aaea09edd75e1ec7921": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "88024cd312ee42c2925ebfbe52077780": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "892ff4e2f0e44c23bc5c2be7547cf0bd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8a7c82dcbd414b24b67ccfbc562b2e38": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8b9e961c837a464fb7a8c44756dc41e7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8bb83ae3229e4f38b1733f92f536fad0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1f75d85e6c7e4eb6a91b03f0c8adb644", + "max": 4, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9da33f07ea354b5798e85298e132b017", + "value": 4 + } + }, + "8c149bc655a34fe5b91853c66db458a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8c4d6f4eea3742289a2604e66b0c6182": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8cc86330c2af436c9af314e8c04c8c2b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0e2beab611114239b6ee48a3cbb09c49", + "IPY_MODEL_006b78b5191b4fb888d98bdf6c20ec1e", + "IPY_MODEL_5f6ffa1d929443a5bd9c7c550f0690f0", + "IPY_MODEL_668a7f88506148a9ba2b48920afc028f" + ], + "layout": "IPY_MODEL_35186465f87341f683affb9399661540" + } + }, + "8cd63d3908e4411c9fcb42bc32c8dd16": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, 
+ "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8d0f1d547c384094b10aa00a3ede3c06": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_082b6990ce5e4812adc0ad6a7b376dac", + "placeholder": "​", + "style": "IPY_MODEL_610e1ddfb7a44d51a54ebea6dad3a5f0", + "value": "model-00003-of-00004.safetensors: 100%" + } + }, + "8d988c86648244788f6dc5aa0fea38fd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8ea89e52123643268857285e0e1db1c0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8f5b8c513b164dab9e0892422163c483": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cd11fb7d54bb43ae821f2272d075a1b3", + "placeholder": "​", + "style": "IPY_MODEL_fbc6a2834c5442fbb6667f1b3612bb5b", + "value": "Downloading shards: 100%" + } + }, + "90661b333d6f496ca606b3046622660e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "921a1a037f7b47f8b57d1da8192a437a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "93a50117ece543d4857ba02505dc4514": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + 
"margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "974e3687f18a4e1a975969b880d086aa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "99529129d7f0435da0fdcfc9803a2f11": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5a8ac674153248999007a713299b2644", + "max": 1168138808, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1f82a5685eef4b47a2dbf7618362907c", + "value": 1168138808 + } + }, + "99c5c846cc5e43429905f071670b4310": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_a2c543008f444cf49972a4f35c32b8e3", + "placeholder": "​", + "style": "IPY_MODEL_bb7b8a9e42f6478f851236685a1392d6", + "value": " 518/518 [00:00<00:00, 13408.85 examples/s]" + } + }, + "9a0b012915c54abeb100f466fa99d303": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fffbf696c07744fc8e3d81ab51dc9c90", + "placeholder": "​", + "style": "IPY_MODEL_a153cc3ca0cc45c18a941bd57e363ec3", + "value": "model-00004-of-00004.safetensors: 100%" + } + }, + "9a12124915994b70a71ebd64b99e93e9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9a5072b8d16d4a1eb0652da61bda0ac8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9bdebf06b6874bbb88404f4ad14e1dbc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5f60910d1e744432bdf87518f0f45874", + "max": 4915916176, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2ab86b3fbd49488bb02f8205a572e752", + "value": 4915916176 + } + }, + "9cfaf17064bc49a5aded0fc53dd7cd7f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ede66e196fa9482498f58dcdffd494a2", + "IPY_MODEL_a26cc7fea1a64d7bac1769d33cc74e28", + "IPY_MODEL_c2f24a8930be4b70b4bbbcf5d908b01d" + ], + "layout": "IPY_MODEL_1f59dd66813f419999336e59a3efc56a" + } + }, + "9da33f07ea354b5798e85298e132b017": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9f13437a44b8434b9cc3afab998e8d3c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a06b2bd0236249999adffa44e53cf80e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5fb4a4ef8afe4ea4af6655faea17f354", + "max": 395, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_b1a03a5e9bae46129830daeeb23bf6ff", + "value": 395 + } + }, + "a153cc3ca0cc45c18a941bd57e363ec3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a2249f364b914662b54045a1f8d6dfd1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9f13437a44b8434b9cc3afab998e8d3c", + "placeholder": "​", + "style": "IPY_MODEL_8a7c82dcbd414b24b67ccfbc562b2e38", + "value": " 4.92G/4.92G [00:32<00:00, 171MB/s]" + } + }, + "a26cc7fea1a64d7bac1769d33cc74e28": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_854e35df771f470b82a59f878a2a6a46", + "max": 9846, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d1cbe0ab9379453588eb438d13fd272d", + "value": 9846 + } + }, + "a2a7b715b16a41a288209dee1de5d2d1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a2c543008f444cf49972a4f35c32b8e3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": 
null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a34b3fd5859a441f89cbe7f6e6df9da9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8f5b8c513b164dab9e0892422163c483", + "IPY_MODEL_5b2a671976fa446db408d58a215b8249", + "IPY_MODEL_c934919f617447cfb9226929e7a68d79" + ], + "layout": "IPY_MODEL_124a70bfad434c5c946f611c04a91c8f" + } + }, + "a4368e6da8f046aaa32f3152b7d333d1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + 
"right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a4c404e420cc4ce781ce569f9ab3f987": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a53b4776f95f4dd38197193e6c5f649e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_aea74071600f483b9e6de1a61743c03a", + "placeholder": "​", + "style": "IPY_MODEL_21bf14b771c14d2dab9e98a326302e14", + "value": "model-00002-of-00004.safetensors: 100%" + } + }, + "a8514e34378d47a28fbf0831a14ede8f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a8999d04e4114693bb6be358bdbe9b83": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a98165ee656643ad85ac9ea1447cc775": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "aea74071600f483b9e6de1a61743c03a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, 
+ "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "afa442ab223b46cb82569438c0047823": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b08631e4cffa445c912da0c8eac2ef23": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + 
"justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b0b7457a8b47496483da1506fb2505b3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b1a03a5e9bae46129830daeeb23bf6ff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "b1de7b283eeb41828e8093e60c83f2c4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_bb640a5c858349d29c13ce5629e72f22", + "IPY_MODEL_37523a6cac1047e9a261698212d47737", + "IPY_MODEL_661f76474252493caae8f7d6aa8f99b7" + ], + "layout": "IPY_MODEL_849cdc1912aa4df4b0c721a8c63ca0f9" + } + }, + "b2a19b6092c44b20886987b30f1bf48a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1f0efc167b3744b38ff832b71d529318", + "IPY_MODEL_a06b2bd0236249999adffa44e53cf80e", + "IPY_MODEL_23012118a7314a3f838870a2aee9ec90" + ], + "layout": "IPY_MODEL_dcff079d850c423a83eb70105b816ee4" + } + }, + "b3b3f4ddd4ed4d938c923887939a0440": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_57f251691b4c453896b2508c431dfc2f", + "placeholder": "​", + "style": "IPY_MODEL_4bdb196cd1494f809829651ec5b6cbf8", + "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks.

" + } + }, + "b4a274fc9e324b80bf559c4dbd05e319": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b4ba435f6d1c448f99b533bc6df32e76": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_35c2c635c2024bcda3265bf95d330f63", + "max": 4999802720, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ec014d847e394a309b6a82c30a6fdfc5", + "value": 4999802720 + } + }, + "b621c6a8c0e9440fa840d75a1b1b02fc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ba7f32c41f9247ec9d4c40e6396b55a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4c8e98294bd240a6869cb199caee66e1", + "max": 177, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4e1f5423311b4dc0930c21c9ad5a88f5", + "value": 177 + } + }, + "bac377ed96ae4e8db9b298bb623888ec": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bb640a5c858349d29c13ce5629e72f22": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, 
+ "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5962e77eea5a4d88ba6dbc5e9f51c709", + "placeholder": "​", + "style": "IPY_MODEL_f07c8a6ec12f46ea9e32a2208e70bccd", + "value": "Downloading data: 100%" + } + }, + "bb7b8a9e42f6478f851236685a1392d6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bbdf3bb657e64fc2b0a90e78e8886480": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bcaf4c81ba9d437bb6223dbb22d011ed": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bf77e5aaab0547f7b2beb015687552ef": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": 
"1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "bfcbfe4184774fd3a8320f4f0e1baf54": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a2a7b715b16a41a288209dee1de5d2d1", + "max": 518, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_bf77e5aaab0547f7b2beb015687552ef", + "value": 518 + } + }, + "c27e8ce031884a90b41d8220b1870bc4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": 
null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c2ceccfdb59b4336a24003cd6bc2403d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c2f24a8930be4b70b4bbbcf5d908b01d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b0b7457a8b47496483da1506fb2505b3", + "placeholder": "​", + "style": "IPY_MODEL_c7dc386d978a44ff885763ecec94dc38", + "value": " 9846/9846 [00:09<00:00, 1066.17 examples/s]" + } + }, + "c35b16156253402f90a432f3f07c2e0a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6b6ed29053ec4aaa8fc5526a35f17c2b", + "placeholder": "​", + "style": 
"IPY_MODEL_e69cd88ccbae4bb7b238fa112a60f0f9", + "value": "special_tokens_map.json: 100%" + } + }, + "c40f583823574e40b6b29d4914143c0e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f81756eb9e554899b0778311f2c407c4", + "placeholder": "​", + "style": "IPY_MODEL_3b614b9712874fac990d2c557b0791a6", + "value": " 9.09M/9.09M [00:00<00:00, 19.3MB/s]" + } + }, + "c56d8289513441688f9bc5f4b52d60a0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a53b4776f95f4dd38197193e6c5f649e", + "IPY_MODEL_b4ba435f6d1c448f99b533bc6df32e76", + "IPY_MODEL_42eb041021214110a860924d28d73409" + ], + "layout": "IPY_MODEL_17c797e08bd2493fa685918129415309" + } + }, + "c7dc386d978a44ff885763ecec94dc38": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c7e06fd82f7f4f9fb81c68e8758f2de1": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a8999d04e4114693bb6be358bdbe9b83", + "placeholder": "​", + "style": "IPY_MODEL_2540d57e3bf545e3812da1ee72b85fc8", + "value": " 4.98G/4.98G [00:34<00:00, 232MB/s]" + } + }, + "c84f542c863043dea8a3675fa153e78d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + "description": "Login", + "disabled": false, + "icon": "", + "layout": "IPY_MODEL_d2d81cc8296c4b10bf80b86c0a3302d3", + "style": "IPY_MODEL_7e3a386e672f4748882211227b7721a9", + "tooltip": "" + } + }, + "c934919f617447cfb9226929e7a68d79": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c27e8ce031884a90b41d8220b1870bc4", + "placeholder": "​", + "style": "IPY_MODEL_88024cd312ee42c2925ebfbe52077780", + "value": " 4/4 [01:41<00:00, 22.30s/it]" + } + }, + "cc5ce633746949ed98418cae9f68afe3": { + 
"model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cd11fb7d54bb43ae821f2272d075a1b3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cd318c6bfc8e421a9bfcdab16be5eaa7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d10ba011d05045b18bbfeb9660e4d9d3", + "max": 9846, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e56c22f77c884caaacfafd48dfa51a55", + "value": 9846 + } + }, + "cf6d1be81b6c4ffc81ce8fdabfc5ad28": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_07e0aed682fd4cc88fa75c0592dc04a7", + "IPY_MODEL_67b4473eb8a44a96ba34983762ab38fa", + "IPY_MODEL_c7e06fd82f7f4f9fb81c68e8758f2de1" + ], + "layout": "IPY_MODEL_3d9d8278667d496aaea1eaaa4d24ae93" + } + }, + "cffdf12fbe97462ab74e88ccca943aeb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": 
"LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d10ba011d05045b18bbfeb9660e4d9d3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": 
null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d1508f5cde9a43d8abc26dd2d0c34dbd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9a0b012915c54abeb100f466fa99d303", + "IPY_MODEL_99529129d7f0435da0fdcfc9803a2f11", + "IPY_MODEL_5b56ac3009714a5a84dd8749db4a7bce" + ], + "layout": "IPY_MODEL_546b76a22f1046cd856a8fa2f9ff2d9f" + } + }, + "d1cbe0ab9379453588eb438d13fd272d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d2d81cc8296c4b10bf80b86c0a3302d3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + 
"border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d3cbfd564fe8485ba7afdb1cc54abed3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d4092198673141d3b4a824d629d73f64": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d7ef74cf4a914ad38a69c84c34fff393": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8d0f1d547c384094b10aa00a3ede3c06", + "IPY_MODEL_9bdebf06b6874bbb88404f4ad14e1dbc", + "IPY_MODEL_a2249f364b914662b54045a1f8d6dfd1" + ], + "layout": "IPY_MODEL_7504986b8d8d4d0da58ad79e80a81948" + } + }, + "db05b25cb38140bdb21e6f3b7fde7e66": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dcff079d850c423a83eb70105b816ee4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "de3757d6125a4c07b502dd60816bafec": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": 
null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e0fd6d00f0ba4e59bdaa5779556ec4ea": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7ac8e88f29f04b859f592a003d39836b", + "placeholder": "​", + "style": "IPY_MODEL_0ec2643d9fd44785addb37d9ecd23989", + "value": "Generating train split: 100%" + } + }, + "e25f9ca445b14e3f8397779df071dfb4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_791df472db174df69b8c9f0e200af254", + "placeholder": "​", + "style": "IPY_MODEL_6bb9c7182d2a464ea21809e59043562a", + "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" + } + }, + "e297072ab5d64815b90bc89d22503378": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e4e1a4338c5e46b3ba5a3bb960da7107": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e56c22f77c884caaacfafd48dfa51a55": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e69cd88ccbae4bb7b238fa112a60f0f9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e92b30d0b4234af6b5a33bff989b1b45": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": 
null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ea1bdb5f2da64332960bccd967a84b4a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_347540dc03d34e65b7ffbb0f5fc569aa", + "placeholder": "​", + "style": "IPY_MODEL_7243d8e2e1cc4043a2ee310eabd0ac09", + "value": " 177/177 [00:00<00:00, 11.4kB/s]" + } + }, + "eadeec171e7b4c0f9e26964f031cfb71": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_de3757d6125a4c07b502dd60816bafec", + "placeholder": "​", + "style": 
"IPY_MODEL_8c4d6f4eea3742289a2604e66b0c6182", + "value": "tokenizer_config.json: 100%" + } + }, + "ec014d847e394a309b6a82c30a6fdfc5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ede66e196fa9482498f58dcdffd494a2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_026072374b7d47c194707a50f5c99099", + "placeholder": "​", + "style": "IPY_MODEL_63ac7dafeb27446cb30aaddf4cd27c9f", + "value": "Map: 100%" + } + }, + "ee4e4af964ec4dd597cb04a90f0697f9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f07c8a6ec12f46ea9e32a2208e70bccd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f2ab2fa803e94328a237e84cd4ea0027": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5690d92586494b9187147f32fa708405", + "max": 1105272, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_317cda72329c4043ab0b224b46b259d3", + "value": 1105272 + } + }, + "f4ca7b63d7d749ff83a848e250f03ec1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f7e59b47f9b74523843f37268212d566": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f81756eb9e554899b0778311f2c407c4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f86b969ef69b48119619e1a424b50460": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fbc6a2834c5442fbb6667f1b3612bb5b": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "feae525923d5407bb69a922954c474f7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bac377ed96ae4e8db9b298bb623888ec", + "max": 50566, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_02d6cc4c2717434c895798601bda7c86", + "value": 50566 + } + }, + "ffe561df8772443ebf40a3b8b656079f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fffbf696c07744fc8e3d81ab51dc9c90": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/peft/examples/dora_finetuning/README.md b/peft/examples/dora_finetuning/README.md new file mode 100644 index 0000000000000000000000000000000000000000..91ffb2a529f6fb88c03a1e7e7ae8de07b9304969 --- /dev/null +++ b/peft/examples/dora_finetuning/README.md @@ -0,0 +1,105 @@ +# DoRA: Weight-Decomposed Low-Rank Adaptation + +![dora](https://i.ytimg.com/vi/m7KQdGSr0Dg/maxresdefault.jpg) + + +## Introduction +[DoRA](https://huggingface.co/papers/2402.09353) is a novel approach that leverages low rank adaptation through weight decomposition analysis to investigate the inherent differences between full fine-tuning and LoRA. DoRA initially decomposes the pretrained weight into its magnitude and directional components and finetunes both of them. Because the directional component is large in terms of parameter numbers, we further decompose it with LoRA for efficient finetuning. This results in enhancing both the learning capacity and training stability of LoRA while avoiding any additional inference overhead. 
+ +## Quick start +```python +import torch +from peft import LoraConfig, get_peft_model +from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer +from datasets import load_dataset + +model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") +dataset = load_dataset("timdettmers/openassistant-guanaco", split="train") +lora_config = LoraConfig( + use_dora=True +) +peft_model = get_peft_model(model, lora_config) +trainer = Trainer( + model=peft_model, + train_dataset=dataset, + dataset_text_field="text", + max_seq_length=2048, + tokenizer=tokenizer, +) +trainer.train() +peft_model.save_pretrained("dora-llama-3-8b") +``` + +No additional change to your standard LoRA procedure is needed, except for specifying the `use_dora=True` option in your LoRA configuration. + + +Run the finetuning script with: +```bash +python examples/dora_finetuning/dora_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --data_path timdettmers/openassistant-guanaco +``` +By default, this 👆🏻 loads the model in a PEFT setup with a LoRA config. To quickly compare it with DoRA, all you need to do is add `--use_dora` to the command line. The same example then becomes 👇🏻: + +```bash +python examples/dora_finetuning/dora_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --data_path timdettmers/openassistant-guanaco --use_dora +``` + +DoRA also supports quantization. 
To use 4-bit quantization try: + +```bash +python examples/dora_finetuning/dora_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --quantize +``` + +Similarly, by default the LoRA layers are the attention and MLP layers of the Llama model. If you want to choose a different set of layers for LoRA to be applied on, you can simply define it using: +```bash +python examples/dora_finetuning/dora_finetuning.py --lora_target_modules "q_proj,k_proj,v_proj,o_proj" +``` + +### Full example of the script +```bash +python dora_finetuning.py \ + --base_model "PATH_TO_MODEL" \ + --data_path "PATH_TO_DATASET" \ + --output_dir "PATH_TO_OUTPUT_DIR" \ + --batch_size 1 \ + --num_epochs 3 \ + --learning_rate 3e-4 \ + --cutoff_len 512 \ + --val_set_size 500 \ + --use_dora \ + --quantize \ + --eval_step 10 \ + --save_step 100 \ + --lora_r 16 \ + --lora_alpha 32 \ + --lora_dropout 0.05 \ + --lora_target_modules "q_proj,k_proj,v_proj,o_proj" \ + --hub_model_id "YOUR_HF_REPO" \ + --push_to_hub +``` +## Use the model on 🤗 +You can load and use the model like any other 🤗 model. +```python +from transformers import AutoModel +model = AutoModel.from_pretrained("ShirinYamani/huggyllama-llama-7b-finetuned") +``` + +## DoRA vs. LoRA +In general, DoRA finetuning on diffusion models is still experimental and is likely to require different hyperparameter values to perform best compared to LoRA. + +Specifically, people have noticed 2 differences to take into account in your training: + +1. LoRA seems to converge faster than DoRA (so a set of parameters that may lead to overfitting when training a LoRA may work well for a DoRA) + +2. DoRA quality is superior to LoRA, especially at lower ranks: The difference in quality between DoRA of rank 8 and LoRA of rank 8 appears to be more significant than when training ranks of 32 or 64, for example. 
+ + +## Citation +``` +@article{liu2024dora, + title={DoRA: Weight-Decomposed Low-Rank Adaptation}, + author={Liu, Shih-Yang and Wang, Chien-Yi and Yin, Hongxu and Molchanov, Pavlo and Wang, Yu-Chiang Frank and Cheng, Kwang-Ting and Chen, Min-Hung}, + journal={arXiv preprint arXiv:2402.09353}, + year={2024} +} +``` \ No newline at end of file diff --git a/peft/examples/dora_finetuning/dora_finetuning.py b/peft/examples/dora_finetuning/dora_finetuning.py new file mode 100644 index 0000000000000000000000000000000000000000..c8d281fc617a96958b96fbbb91fb2514a875a03e --- /dev/null +++ b/peft/examples/dora_finetuning/dora_finetuning.py @@ -0,0 +1,208 @@ +import os + +import torch +from datasets import load_dataset +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + DataCollatorForLanguageModeling, + Trainer, + TrainingArguments, +) + +from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training + + +def train_model( + base_model: str, + data_path: str, + output_dir: str, + batch_size: int, + num_epochs: int, + learning_rate: float, + cutoff_len: int, + val_set_size: int, + use_dora: bool, + quantize: bool, + eval_step: int, + save_step: int, + device: str, + lora_r: int, + lora_alpha: int, + lora_dropout: float, + lora_target_modules: str, + hub_model_id: str, + push_to_hub: bool, +): + os.environ["TOKENIZERS_PARALLELISM"] = "false" + hf_token = os.getenv("HF_TOKEN") + + # Setup device + if device == "auto": + device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + else: + device = torch.device(device) + print(f"Using device: {device}") + + # load tokenizer + tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token) + + # QDoRA (quantized dora): IF YOU WANNA QUANTIZE THE MODEL + if quantize: + if (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) or torch.xpu.is_available(): + bnb_4bit_compute_dtype = torch.bfloat16 + else: + bnb_4bit_compute_dtype 
= torch.float16 + model = AutoModelForCausalLM.from_pretrained( + base_model, + token=hf_token, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=bnb_4bit_compute_dtype, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ), + ) + # setup for quantized training + model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True) + else: + model = AutoModelForCausalLM.from_pretrained(base_model, token=hf_token) + # LoRa config for the PEFT model + lora_config = LoraConfig( + use_dora=use_dora, # to use Dora OR compare to Lora just set the --use_dora + r=lora_r, # Rank of matrix + lora_alpha=lora_alpha, + target_modules=( + lora_target_modules.split(",") + if lora_target_modules + else ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + ), + lora_dropout=lora_dropout, + bias="none", + ) + + # get the peft model with LoRa config + model = get_peft_model(model, lora_config) + + model.to(device) # MODEL TO GPU/CUDA + tokenizer.pad_token = tokenizer.eos_token + + # Load the dataset + dataset = load_dataset(data_path) + + def tokenize_function(examples): + inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=cutoff_len) + inputs["labels"] = inputs["input_ids"].copy() # setting labels for a language modeling task + return inputs + + # Tokenize the dataset and prepare for training + tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names) + + # Data collator to dynamically pad the batched examples + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) + + # Define training arguments + training_args = TrainingArguments( + output_dir=output_dir, + num_train_epochs=num_epochs, + per_device_train_batch_size=batch_size, + per_device_eval_batch_size=batch_size, + warmup_steps=100, + weight_decay=0.01, + logging_dir="./logs", + logging_steps=eval_step, + save_steps=save_step, + 
save_total_limit=2, + push_to_hub=push_to_hub, + hub_model_id=hub_model_id, + gradient_accumulation_steps=16, + fp16=True, + learning_rate=learning_rate, + hub_token=hf_token, + ) + + # Clear device cache to free memory + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() + + # Initialize the Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["test"], + data_collator=data_collator, + ) + + # Start model training + trainer.train() + + # Save and push the trained model and tokenizer + if push_to_hub: + # Push the main model to the hub + trainer.push_to_hub(commit_message="Fine-tuned model") + + # Save the model and tokenizer locally + model.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Fine-tune LLaMA with DoRA and PEFT") + parser.add_argument("--base_model", type=str, default="huggyllama/llama-7b", help="Base model path or name") + parser.add_argument( + "--data_path", type=str, default="timdettmers/openassistant-guanaco", help="Dataset path or name" + ) + parser.add_argument( + "--output_dir", type=str, default="path/to/output", help="Output directory for the fine-tuned model" + ) + parser.add_argument("--batch_size", type=int, default=1, help="Batch size") + parser.add_argument("--num_epochs", type=int, default=1, help="Number of training epochs") + parser.add_argument("--learning_rate", type=float, default=3e-4, help="Learning rate") + parser.add_argument("--cutoff_len", type=int, default=512, help="Cutoff length for tokenization") + parser.add_argument("--val_set_size", type=int, default=500, help="Validation set size") + parser.add_argument("--use_dora", action="store_true", help="Apply Dora") + parser.add_argument("--quantize", action="store_true", help="Use quantization") + 
parser.add_argument("--eval_step", type=int, default=10, help="Evaluation step interval") + parser.add_argument("--save_step", type=int, default=100, help="Save step interval") + parser.add_argument("--device", type=str, default="auto", help="Device to use for training") + parser.add_argument("--lora_r", type=int, default=8, help="LoRA rank") + parser.add_argument("--lora_alpha", type=int, default=16, help="LoRA alpha") + parser.add_argument("--lora_dropout", type=float, default=0.05, help="LoRA dropout rate") + parser.add_argument( + "--lora_target_modules", type=str, default=None, help="Comma-separated list of target modules for LoRA" + ) + parser.add_argument( + "--hub_model_id", + type=str, + default="path/to/repo", + help="Repository name to push the model on the Hugging Face Hub", + ) + parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to Hugging Face Hub") + args = parser.parse_args() + train_model( + base_model=args.base_model, + data_path=args.data_path, + output_dir=args.output_dir, + batch_size=args.batch_size, + num_epochs=args.num_epochs, + learning_rate=args.learning_rate, + cutoff_len=args.cutoff_len, + val_set_size=args.val_set_size, + use_dora=args.use_dora, + quantize=args.quantize, + eval_step=args.eval_step, + save_step=args.save_step, + device=args.device, + lora_r=args.lora_r, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + lora_target_modules=args.lora_target_modules, + hub_model_id=args.hub_model_id, + push_to_hub=args.push_to_hub, + ) diff --git a/peft/examples/ephemeral_gpu_offloading/load_with_dora.py b/peft/examples/ephemeral_gpu_offloading/load_with_dora.py new file mode 100644 index 0000000000000000000000000000000000000000..cc61925f9008af2f526c145fca9689587d75cd65 --- /dev/null +++ b/peft/examples/ephemeral_gpu_offloading/load_with_dora.py @@ -0,0 +1,103 @@ +# Copyright 2024-present the HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Example script demonstrating the time difference loading a model with a DoRA using ephemeral GPU offloading vs doing it purely on the CPU. + +Example outputs: +$ python load_with_dora.py +--- Loading model --- +Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00, 1.03s/it] +--- Loading PeftModel --- +--- Done --- +Model loading time: 4.83s +PeftModel loading time: 28.14s +Use ephemeral GPU offloading: False + +(Note: if this was the first time you ran the script, or if your cache was cleared, the times shown above are invalid, due to the time taken to download the model and DoRA files. Just re-run the script in this case.) + +$ python load_with_dora.py --ephemeral_gpu_offload +--- Loading model --- +Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00, 1.11it/s] +--- Loading PeftModel --- +--- Done --- +Model loading time: 4.28s +PeftModel loading time: 16.59s +Use ephemeral GPU offloading: True + +(Note: if this was the first time you ran the script, or if your cache was cleared, the times shown above are invalid, due to the time taken to download the model and DoRA files. Just re-run the script in this case.) 
+""" + +import argparse +import time + +from huggingface_hub import snapshot_download +from transformers import AutoModelForCausalLM + +from peft import PeftModel + + +def main(): + parser = argparse.ArgumentParser(description="Load a model with DoRA using ephemeral GPU offloading") + parser.add_argument("--model", type=str, default="NousResearch/Hermes-2-Pro-Mistral-7B", help="Model to load") + parser.add_argument( + "--dora", + type=str, + default="peft-internal-testing/DoRA-Hermes-2-Pro-Mistral-7B", + help="DoRA to use", + ) + parser.add_argument("--ephemeral_gpu_offload", action="store_true", help="Use ephemeral GPU offloading") + parser.add_argument( + "--merge_model_path", type=str, help="Merge the model with the DoRA model and save to the given path" + ) + args = parser.parse_args() + + peft_model_kwargs = { + "ephemeral_gpu_offload": args.ephemeral_gpu_offload, + "max_memory": {"cpu": "256GiB"}, + "device_map": {"": "cpu"}, + } + + # Predownload + try: + snapshot_download(repo_id=args.model) + except Exception as e: + print(f"Failed to download model: {e}") + # We continue anyway as this might be e.g. a local directory or something + try: + snapshot_download(repo_id=args.dora) + except Exception as e: + print(f"Failed to download DoRA: {e}") + # We continue anyway as this might be e.g. 
a local directory or something + + start = time.perf_counter() + print("--- Loading model ---") + model = AutoModelForCausalLM.from_pretrained(args.model) + model_time = time.perf_counter() - start + print("--- Loading PeftModel ---") + peft_model = PeftModel.from_pretrained(model, args.dora, **peft_model_kwargs) + print("--- Done ---") + peft_model_time = time.perf_counter() - start + + print(f"Model loading time: {model_time:.2f}s") + print(f"PeftModel loading time: {peft_model_time:.2f}s") + print(f"Use ephemeral GPU offloading: {args.ephemeral_gpu_offload}") + + if args.merge_model_path is not None: + merged_model = peft_model.merge_and_unload(progressbar=True) + merged_model.save_pretrained(args.merge_model_path) + + +if __name__ == "__main__": + main() diff --git a/peft/examples/eva_finetuning/README.md b/peft/examples/eva_finetuning/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0c710410059719eb7b50b043fa94437a45bfec28 --- /dev/null +++ b/peft/examples/eva_finetuning/README.md @@ -0,0 +1,159 @@ +# EVA: Explained Variance Adaptation +## Introduction ([Paper](https://huggingface.co/papers/2410.07170), [code](https://github.com/ml-jku/EVA)) +Explained Variance Adaptation (EVA) is a novel initialization method for LoRA style adapters which initializes adapter weights in a data driven manner and adaptively allocates ranks according to the variance they explain. EVA improves average performance on a multitude of tasks across various domains, such as Language generation and understanding, Image classification, and Decision Making. + +The abstract from the paper is: + +*Foundation models (FMs) are pre-trained on large-scale datasets and then fine-tuned on a downstream task for a specific application. The most successful and most commonly used fine-tuning method is to update the pre-trained weights via a low-rank adaptation (LoRA). 
LoRA introduces new weight matrices that are usually initialized at random with a uniform rank distribution across model weights. Recent works focus on weight-driven initialization or learning of adaptive ranks during training. Both approaches have only been investigated in isolation, resulting in slow convergence or a uniform rank distribution, in turn leading to sub-optimal performance. We propose to enhance LoRA by initializing the new weights in a data-driven manner by computing singular value decomposition on minibatches of activation vectors. Then, we initialize the LoRA matrices with the obtained right-singular vectors and re-distribute ranks among all weight matrices to explain the maximal amount of variance and continue the standard LoRA fine-tuning procedure. This results in our new method **E**xplained **V**ariance **A**daptation (EVA). We apply EVA to a variety of fine-tuning tasks ranging from language generation and understanding to image classification and reinforcement learning. EVA exhibits faster convergence than competitors and attains the highest average score across a multitude of tasks per domain.* + +## Quick Start +Below is an example of how to use EVA with a causal language model. For a more detailed example see [eva_finetuning.py](https://github.com/huggingface/peft/blob/main/examples/eva_finetuning/eva_finetuning.py). 
+```python +import torch +from datasets import load_dataset +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoTokenizer + +from peft import EvaConfig, LoraConfig, get_peft_model, initialize_lora_eva_weights + + +# config +model_name = "meta-llama/Llama-3.1-8B" +max_seq_len = 512 +rank = 16 +alpha = 1 +rho = 2.0 +target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"] +svd_batch_size = 4 # can be different from the batch size used in finetuning + +# load model and tokenizer +model = AutoModelForCausalLM.from_pretrained(model_name) +tokenizer = AutoTokenizer.from_pretrained(model_name) +tokenizer.pad_token = tokenizer.eos_token + +# load dataset +dataset = load_dataset("Rowan/hellaswag") +dataset = dataset.map( + lambda x: tokenizer(x["ctx"], padding="max_length", truncation=True, max_length=max_seq_len), + batched=True, + remove_columns=dataset["train"].column_names, +) +dataset.set_format(type="torch") + +# create dataloader for SVD +# typically this is the same as the dataloader used for finetuning +dataloader = DataLoader( + dataset["train"], + batch_size=svd_batch_size, + collate_fn=lambda examples: {k: torch.stack([v[k] for v in examples], dim=0) for k in examples[0].keys()}, +) + +# setup peft config +eva_config = EvaConfig( + rho=rho +) +peft_config = LoraConfig( + r=rank, + lora_alpha=alpha, + target_modules=target_modules, + init_lora_weights="eva", + eva_config=eva_config +) + +# move model to accelerator +device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" +model = model.to(device) + +# to optimize memory usage during EVA initialization, set low_cpu_mem_usage=True +peft_model = get_peft_model(model, peft_config, low_cpu_mem_usage=True) + +initialize_lora_eva_weights(peft_model, dataloader) +``` +`initialize_lora_eva_weights` will compute the SVD and load the components into the model. After this continue with standard LoRA finetuning. 
+ +## Using EVA with Bitsandbytes +EVA is fully compatible with bitsandbytes. Simply initialize the pretrained model with a BitsAndBytesConfig and then use the peft model with EVA. +```python +from transformers import BitsAndBytesConfig +from peft import prepare_model_for_kbit_training + +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-3.1-8B", + quantization_config=BitsAndBytesConfig(load_in_4bit=True) +) +model = prepare_model_for_kbit_training(model) +peft_model = get_peft_model(model, peft_config) +initialize_lora_eva_weights(peft_model, dataloader) +``` + +## Getting the EVA state_dict without loading the adapter weights +In some cases you might just want to get the state_dict after EVA initialization without loading the adapter weights. This can be useful for example if: +- you want to precompute and store the state_dict for different downstream tasks. +- you need to quantize the model for finetuning but want to perform EVA initialization with model weights in full/half precision. +- you do not intend to use a peft model for LoRA finetuning. +- you would like to leverage multiple accelerators for EVA initialization. (At the moment this is not directly supported by `initialize_lora_eva_weights`) + +You can do this by calling `get_eva_state_dict` directly (you only need to pass `peft_config` if `model` is not a PeftModel): +```python +from peft import get_eva_state_dict + +eva_state_dict = get_eva_state_dict(model, dataloader, peft_config) +``` +Later you can load the state_dict into a `PeftModel` by using the `eva_state_dict` argument in `initialize_lora_eva_weights`: +```python +initialize_lora_eva_weights(peft_model, eva_state_dict=eva_state_dict) +``` + +## Leveraging multiple accelerators + +EVA initialization can be parallelized across multiple accelerators. In this case inputs from multiple accelerators are gathered before computing the SVD for the batch. 
This requires that the model is wrapped in a `torch.nn.DataParallel` or `torch.nn.parallel.DistributedDataParallel` class. An example of how to use this can be found in [eva_finetuning_multi_accelerator.py](https://github.com/huggingface/peft/blob/main/examples/eva_finetuning/eva_finetuning_multi_accelerator.py). + +## Customizing EVA + +By default, EVA is designed to work with standard transformer language models. However, we integrated three different parameters which can be used to customize EVA for other types of models. +1. `forward_fn`: Defines how the forward pass during EVA initialization should be computed. +2. `prepare_model_inputs_fn`: Can be used if it is necessary to use information contained in the original model_input to prepare the input for SVD in individual layers. +3. `prepare_layer_inputs_fn`: Defines how layer inputs should be prepared for SVD. + +All three parameters can be passed to `initialize_lora_eva_weights` and `get_eva_state_dict`. + +### forward_fn + +`forward_fn` defines how the forward pass during EVA initialization should be computed. `forward_fn` receives two arguments: `model` and `inputs`. By default this is set to `forward_fn_dict` which simply returns `model(**inputs)`. + +### prepare_model_inputs_fn + +`prepare_model_inputs_fn` can be used if it is necessary to use information contained in the original model_input to prepare the input for SVD in individual layers. `prepare_model_inputs_fn` receives two arguments: `model_input` and `peft_config`. This component is separate from `prepare_layer_inputs_fn` as the output only needs to be computed once per batch. By default this parameter is set to `prepare_model_inputs_fn_language_modeling` which is used to get a subset of indices based on attention and label mask to avoid including padding tokens in the SVD computation. If you would like to not use this component set `prepare_model_inputs_fn` to None.
The default logic is: +```python +def prepare_model_inputs_fn_language_modeling(model_input, peft_config: LoraConfig): + mask = model_input.get("attention_mask", torch.ones_like(model_input["input_ids"])).bool() + if peft_config.eva_config.use_label_mask and hasattr(model_input, "labels"): + mask = torch.logical_and(mask, model_input["labels"] != peft_config.eva_config.label_mask_value) + return mask.nonzero() +``` + +### prepare_layer_inputs_fn + +`prepare_layer_inputs_fn` can be used to preprocess the layer inputs before passing them to the SVD algorithm. `prepare_layer_inputs_fn` receives three arguments: `layer_input`, `model_input` and `layer_name`. It can either be a callable or a dictionary where the keys are the layer names and the values are callables. If it is a dictionary, functions are assigned to adapter layers based on the layer names. By default a language modeling setting is assumed where model_inputs are the outputs of `prepare_model_inputs_fn_language_modeling` which is a mask of indices. If this parameter is set to None, only two modifications are made to the layer inputs: +- take the first element in case of a tuple or list. +- if the input has more than 2 dimensions, we flatten all but the last dimension. + +`prepare_layer_inputs_fn` must always return a tensor. The default logic is: +```python +def prepare_layer_inputs_fn_default(layer_input, model_input, layer_name) -> torch.Tensor: + if isinstance(layer_input, (tuple, list)): + layer_input = layer_input[0] + return layer_input[model_input.T.unbind()] +``` + +## Citation +In case you find our work useful, please consider citing it.
+ +``` +@article{paischer2024eva, + title={One Initialization to Rule them All: Fine-tuning via Explained Variance Adaptation}, + author={Fabian Paischer and Lukas Hauzenberger and Thomas Schmied and Benedikt Alkin and Marc Peter Deisenroth and Sepp Hochreiter}, + journal={arXiv preprint arXiv:2410.07170}, + year={2024} +} +``` diff --git a/peft/examples/eva_finetuning/eva_finetuning.py b/peft/examples/eva_finetuning/eva_finetuning.py new file mode 100644 index 0000000000000000000000000000000000000000..4901770f0b58170fea3e2ecb9bb5fb6746d2be9e --- /dev/null +++ b/peft/examples/eva_finetuning/eva_finetuning.py @@ -0,0 +1,96 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch +from datasets import load_dataset +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments +from utils import DataCollator, TokenizerMetaMath + +from peft import EvaConfig, LoraConfig, get_peft_model, initialize_lora_eva_weights + + +DEVICE = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + +# config +model_name = "meta-llama/Llama-3.1-8B" +max_seq_len = 512 +rank = 16 +alpha = 1 +rho = 2.0 +target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"] +svd_batch_size = 4 # can be different from the batch size used in finetuning +batch_size = 4 +learning_rate = 5e-4 +gradient_accumulation_steps = 8 +num_epochs = 1 +output_dir = "outputs" +bf16 = True + + +# load model and tokenizer +model = AutoModelForCausalLM.from_pretrained(model_name) +tokenizer = AutoTokenizer.from_pretrained(model_name) + +# load dataset +dataset = load_dataset("meta-math/MetaMathQA") +dataset = dataset.map( + TokenizerMetaMath(model_name), + batched=True, + remove_columns=dataset["train"].column_names, +) +dataset.set_format(type="torch") + +# data collator +data_collator = DataCollator(tokenizer.eos_token_id, max_length=max_seq_len) + +# dataloader +dataloader = DataLoader( + dataset["train"], + batch_size=svd_batch_size, + collate_fn=data_collator, +) + +# setup peft config +eva_config = EvaConfig(rho=rho) +peft_config = LoraConfig( + r=rank, lora_alpha=alpha, target_modules=target_modules, init_lora_weights="eva", eva_config=eva_config +) + +# move model to accelerator +model = model.to(DEVICE) + +# to optimize memory usage during eva initialization, set low_cpu_mem_usage=True +peft_model = get_peft_model(model, peft_config, low_cpu_mem_usage=True) +initialize_lora_eva_weights(peft_model, dataloader) + +# setup training arguments +training_args = TrainingArguments( + per_device_train_batch_size=batch_size, + learning_rate=learning_rate, + 
gradient_accumulation_steps=gradient_accumulation_steps, + num_train_epochs=num_epochs, + output_dir=output_dir, + remove_unused_columns=False, + bf16=bf16, +) + +# continue with standard finetuning +trainer = Trainer( + model=peft_model, + args=training_args, + train_dataset=dataset["train"], + data_collator=data_collator, +) +trainer.train() diff --git a/peft/examples/eva_finetuning/eva_finetuning_multi_accelerator.py b/peft/examples/eva_finetuning/eva_finetuning_multi_accelerator.py new file mode 100644 index 0000000000000000000000000000000000000000..447137e3b13e79adee85676493fc37d0da316a60 --- /dev/null +++ b/peft/examples/eva_finetuning/eva_finetuning_multi_accelerator.py @@ -0,0 +1,132 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import torch +import torch.distributed as dist +from datasets import load_dataset +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler +from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments +from utils import DataCollator, TokenizerMetaMath + +from peft import EvaConfig, LoraConfig, get_eva_state_dict, get_peft_model, initialize_lora_eva_weights + + +# run this script e.g. 
with: torchrun --nproc_per_node=4 eva_finetuning_multi_accelerator.py + +# config +model_name = "meta-llama/Llama-2-7b-hf" +max_seq_len = 512 +rank = 16 +alpha = 1 +rho = 2.0 +target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"] +svd_batch_size = 4 # can be different from the batch size used in finetuning +batch_size = 4 +learning_rate = 5e-4 +gradient_accumulation_steps = 8 +num_epochs = 1 +output_dir = "outputs" +bf16 = True + + +# Initialize distributed environment +if torch.cuda.is_available(): + local_rank = int(os.environ.get("LOCAL_RANK", -1)) + torch.cuda.set_device(local_rank) + dist.init_process_group("nccl") + world_size = dist.get_world_size() +elif torch.xpu.is_available(): + local_rank = int(os.environ.get("LOCAL_RANK", -1)) + torch.xpu.set_device(local_rank) + dist.init_process_group("xccl") + world_size = dist.get_world_size() +else: + local_rank = -1 + world_size = 1 + + +# load model and tokenizer +model = AutoModelForCausalLM.from_pretrained(model_name) +tokenizer = AutoTokenizer.from_pretrained(model_name) + +# load dataset +dataset = load_dataset("meta-math/MetaMathQA") +dataset = dataset.map( + TokenizerMetaMath(model_name), + batched=True, + remove_columns=dataset["train"].column_names, +) +dataset.set_format(type="torch") + +# data collator +data_collator = DataCollator(tokenizer.eos_token_id, max_length=max_seq_len) + +# Create sampler for distributed training +sampler = DistributedSampler(dataset["train"], num_replicas=world_size, rank=local_rank) + +# dataloader +dataloader = DataLoader( + dataset["train"], + batch_size=svd_batch_size, + collate_fn=data_collator, + sampler=sampler, + shuffle=False, +) + +sampler.set_epoch(0) + +# Wrap model in DDP +model = model.to(local_rank) +model = DDP(model, device_ids=[local_rank], output_device=local_rank) + +# setup peft config +eva_config = EvaConfig(rho=rho) +peft_config = LoraConfig( + r=rank, lora_alpha=alpha, target_modules=target_modules, init_lora_weights="eva", eva_config=eva_config +) + +# 
EVA initialization +eva_state_dict = get_eva_state_dict(model, dataloader, peft_config) +eva_state_dict = {".".join(["base_model.model"] + k.split(".")[1:]): v for k, v in eva_state_dict.items()} + +# cleanup ddp +model = model.module + +# initialize peft model +peft_model = get_peft_model(model, peft_config, low_cpu_mem_usage=True) +initialize_lora_eva_weights(peft_model, eva_state_dict=eva_state_dict) + +# setup training arguments +training_args = TrainingArguments( + per_device_train_batch_size=batch_size, + learning_rate=learning_rate, + gradient_accumulation_steps=gradient_accumulation_steps, + num_train_epochs=num_epochs, + output_dir=output_dir, + remove_unused_columns=False, + bf16=bf16, +) + +# continue with standard finetuning +trainer = Trainer( + model=peft_model, + args=training_args, + train_dataset=dataset["train"], + data_collator=data_collator, +) +trainer.train() diff --git a/peft/examples/eva_finetuning/utils.py b/peft/examples/eva_finetuning/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..df7f069a07e461e7851e9515cf912497ff28c310 --- /dev/null +++ b/peft/examples/eva_finetuning/utils.py @@ -0,0 +1,76 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from transformers import AutoTokenizer + + +class TokenizerMetaMath: + PROMPT_NO_INPUT = ( + "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n" + "### Instruction:\n{query}\n\n### Response: " + ) + PROMPT = ( + "Below is an instruction that describes a task, paired with an input that provides further context. " + "Write a response that appropriately completes the request.\n\n" + "### Instruction:\n{query}\n\n### Input:\n{input}\n\n### Response: " + ) + + def format_prompt(self, query): + query = query.split("\n", 1) + if len(query) == 1 or query[1].strip("\n") == "": + return self.PROMPT_NO_INPUT.format(query=query[0]) + else: + return self.PROMPT.format(query=query[0], input=query[1]) + + def __init__(self, tokenizer_path): + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + + def __call__(self, examples): + prompts = [self.format_prompt(text) for text in examples["query"]] + completions = examples["response"] + return self._tokenize_fn(prompts, completions) + + def _tokenize_fn(self, prompts, completions): + prompt_tokens = self.tokenizer(prompts, add_special_tokens=False)["input_ids"] + input_tokens = self.tokenizer([x + y for x, y in zip(prompts, completions)], add_special_tokens=False)[ + "input_ids" + ] + input_tokens = [[self.tokenizer.bos_token_id] + x + [self.tokenizer.eos_token_id] for x in input_tokens] + prompt_length = [len(x) + 1 for x in prompt_tokens] # +1 for the bos token + input_length = [len(x) for x in input_tokens] + return {"input_ids": input_tokens, "prompt_length": prompt_length, "input_length": input_length} + + +class DataCollator: + def __init__(self, eos_token_id, max_length=None): + self.eos_token_id = eos_token_id + self.max_length = max_length + + def __call__(self, batch): + batch = {k: [item[k] for item in batch] for k in batch[0]} + input_lengths = torch.stack(batch["input_length"]) + prompt_lengths = torch.stack(batch["prompt_length"]) + input_ids = torch.nn.utils.rnn.pad_sequence( + batch["input_ids"], batch_first=True, padding_value=self.eos_token_id + ) + col_indices = 
torch.arange(input_ids.size(1)).unsqueeze(0) + attention_mask = col_indices < input_lengths.unsqueeze(1) + label_mask = torch.logical_or(col_indices < prompt_lengths.unsqueeze(1), ~attention_mask) + labels = input_ids.masked_fill(label_mask, -100) + if self.max_length is not None: + input_ids = input_ids[:, : self.max_length] + attention_mask = attention_mask[:, : self.max_length] + labels = labels[:, : self.max_length] + return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels} diff --git a/peft/examples/evaluation/lora-lm-eval.ipynb b/peft/examples/evaluation/lora-lm-eval.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..253c6da49b9a3346f3a8e27c6c83282f57efadb3 --- /dev/null +++ b/peft/examples/evaluation/lora-lm-eval.ipynb @@ -0,0 +1,4252 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "qAkXdLL2D25p" + }, + "source": [ + "## Peft model evaluation using [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness)\n", + "\n", + "In this notebook, we are going to learn how to evaluate the finetuned lora model on the hellaswag task using lm-eval-harness toolkit." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "o52TJHcYD25q", + "outputId": "c5482c79-ff56-4ffa-d20c-46c3d30d2cd5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33m DEPRECATION: Building 'rouge-score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge-score'. 
Discussion can be found at https://github.com/pypa/pip/issues/6334\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m DEPRECATION: Building 'sqlitedict' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'sqlitedict'. Discussion can be found at https://github.com/pypa/pip/issues/6334\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m DEPRECATION: Building 'word2number' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'word2number'. Discussion can be found at https://github.com/pypa/pip/issues/6334\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. 
Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.1.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "# Install LM-Eval\n", + "!pip install -q datasets evaluate lm_eval" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uhUflrJXD25q" + }, + "source": [ + "### First we will check the accuracy score on the hellaswag task for the base bert without finetuning" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hwJIYD5KD25q", + "outputId": "51e69f81-d048-46b2-9699-658d3ffc5f08" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7b1ea8948a0747bc98795d6459270044", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "README.md: 0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4ec51e06812446899b66826c41697f8d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "data/train-00000-of-00001.parquet: 0%| | 0.00/24.4M [00:00\n", + " \n", + " \n", + " [3910/3910 40:13, Epoch 5/5]\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EpochTraining LossValidation LossAccuracy
10.3538000.2612580.901160
20.2774000.2216510.912480
30.2445000.2161070.918200
40.1970000.2152570.920040
50.1577000.2150500.923240

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7298a140779d4fd88a65a191af265821", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading builder script: 0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3c6b99b4b5854527a8b34b92a8d2986b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading builder script: 0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=3910, training_loss=0.24082870385835847, metrics={'train_runtime': 2416.0772, 'train_samples_per_second': 51.737, 'train_steps_per_second': 1.618, 'total_flos': 3.300271872e+16, 'train_loss': 0.24082870385835847, 'epoch': 5.0})" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Configure training arguments\n", + "training_args = TrainingArguments(\"bert-lora-imdb\",\n", + " eval_strategy=\"epoch\",\n", + " per_device_train_batch_size=32, # decrease this for OOM error\n", + " per_device_eval_batch_size=64,\n", + " save_strategy=\"epoch\",\n", + " learning_rate=2e-3,\n", + " num_train_epochs=5,\n", + " weight_decay=0.01,\n", + " load_best_model_at_end=True,\n", + " do_eval=True,\n", + " do_predict=True,\n", + " metric_for_best_model=\"accuracy\",\n", + " report_to=\"none\")\n", + "\n", + "# Initialize the Trainer for the model training loop\n", + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " compute_metrics=compute_metrics,\n", + ")\n", + "\n", + "#start training\n", + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "34h3g_eED25s" + }, + 
"source": [ + "### Now take the finetuned lora checkpoint and check the accuracy score on hellaswag task." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7tgAq7nLD25s" + }, + "outputs": [], + "source": [ + "# use the path of your checkpoint here\n", + "output = lm_eval.simple_evaluate(model = 'hf',\n", + " model_args = {\n", + " 'pretrained' : 'bert-base-cased',\n", + " 'peft' : './bert-lora-imdb/checkpoint-3910',\n", + " 'dtype' : 'bfloat16'},\n", + " tasks = 'hellaswag',\n", + " device = device,\n", + " batch_size = 128,\n", + " log_samples = False)\n", + "\n", + "output[\"results\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kaggle": { + "accelerator": "nvidiaTeslaT4", + "dataSources": [], + "dockerImageVersionId": 30787, + "isGpuEnabled": true, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "00a9858d90d6430eaab54f9e013f077b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": 
null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "03f7cfca9e634cf69e3cc70f24832ba3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_886443acd2f14cff93059c093f98cc1b", + "placeholder": "​", + "style": "IPY_MODEL_24abe5089abc4ecfab75f7601bc98e68", + "value": " 25000/25000 [00:25<00:00, 1037.33 examples/s]" + } + }, + "0649702ad9764ad8bf3dfbaa6739686e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0bee940b8667495f9e685f7ba6c3706f": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0dc109378f1c43a7b18b1be03cce32ce": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_db789da312a24fc9944ecb8b617109e7", + "placeholder": "​", + "style": "IPY_MODEL_0bee940b8667495f9e685f7ba6c3706f", + "value": " 7.81k/7.81k [00:00<00:00, 580kB/s]" + } + }, + "10335d3ada7f428588c4faa3f57bbd51": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "122d7dd4d02d4df0b6573e100b5e46e3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + 
"IPY_MODEL_db0b4bb4c7a642fb9e8d5c7738e90afc", + "IPY_MODEL_f8b71d5cb37549fba6559e2d83531319", + "IPY_MODEL_30d18e09421b42c985904750e68740d1" + ], + "layout": "IPY_MODEL_f67b5bbf7ccb475abe41e08316dc5b37" + } + }, + "15dc2c4e42ab48c9ad09aafff29f9278": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1fea1738e9fe48629b09e2ec9351fcd2", + "IPY_MODEL_2e2fc856557e40df8400e4b69f7143fc", + "IPY_MODEL_bce88474ca6745da99d79bc07216333c" + ], + "layout": "IPY_MODEL_99bab394a68140f79def33bc6f6499b2" + } + }, + "16afa8cf9ca64a59afd7a4c4f293b479": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + 
"overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "17c7d8bb89184c26966a33dc27ef5517": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1882e91f0b264cbeb90b99a69c7de7f5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8e862faf12804eaebfd7692db348842b", + "placeholder": "​", + "style": 
"IPY_MODEL_7812641b79ae46d48bc88b4c773344c0", + "value": " 21.0M/21.0M [00:00<00:00, 189MB/s]" + } + }, + "1c07d8f701604da0989d5f8d88d4bbcd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f8d90231390a4211b698e700a66fcb0f", + "placeholder": "​", + "style": "IPY_MODEL_aa7697dbb00641f19491f13b1a643197", + "value": " 50000/50000 [00:00<00:00, 142668.00 examples/s]" + } + }, + "1dfc470241c44ce1a0f9ae71fdfdbdf6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_da7f0a799616427ba7e93b0080d26d37", + "placeholder": "​", + "style": "IPY_MODEL_9931cc064e2c400e9830e448c8ef4655", + "value": " 20.5M/20.5M [00:00<00:00, 214MB/s]" + } + }, + "1fea1738e9fe48629b09e2ec9351fcd2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_60695bb251124317a897a6fc56b754ac", + "placeholder": "​", + "style": "IPY_MODEL_d0582455fe3c449dbde19a47561770b4", + "value": "Generating train split: 100%" + } + }, + "20cd23dc1cd840c89741957f3fcbfdb8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_16afa8cf9ca64a59afd7a4c4f293b479", + "max": 50000, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4a025adc548b4fcb8c5637f8f6dabc81", + "value": 50000 + } + }, + "24abe5089abc4ecfab75f7601bc98e68": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "24cbed7481774ef793a8f204ba5b604b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2bbd92cebbf445d087bcadf82625c6d5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2be4c676b5834100b31d7f42ab8bab85": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2cd718fb166641d59c8df64cbc637d9c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "StyleView", + "description_width": "" + } + }, + "2d2adb2b7a3b41d28736a8b7aba258b1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2d46765961fe453597645a0b56a9cbc7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + 
"grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2d76c76bc4a6433b8fe2b28a1c887ada": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_31aa04d4f32a411293f2f729889984b8", + "IPY_MODEL_87de47c8821f423d9efc5c7e85297e32", + "IPY_MODEL_5f0dfd26cb484695b85d021a5d687503" + ], + "layout": "IPY_MODEL_d1414b66bc0f4a088c5e4551e8f4ee72" + } + }, + "2e2fc856557e40df8400e4b69f7143fc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_92b9e59f9038485e839598237ec3fd8c", + "max": 25000, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d69df0074246435f8481fd803863cdb1", + "value": 25000 + } + }, + 
"30d18e09421b42c985904750e68740d1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_71a7cf52600f4288bd725b7bb93e7299", + "placeholder": "​", + "style": "IPY_MODEL_e3082a5e8a4144f5982ad478d9a54a2c", + "value": " 50000/50000 [00:44<00:00, 1214.24 examples/s]" + } + }, + "3142f29a154c4a33a161237d4c605c50": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9b39cdb9f7a14ab4865f360f3c1537dc", + "IPY_MODEL_798c00bf640e483cbc4fea744b268461", + "IPY_MODEL_1882e91f0b264cbeb90b99a69c7de7f5" + ], + "layout": "IPY_MODEL_17c7d8bb89184c26966a33dc27ef5517" + } + }, + "31664fd452bb43e3aefb87542c747b74": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "31aa04d4f32a411293f2f729889984b8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": 
{ + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_da58efd0f2b5442c8a284a948f2614f8", + "placeholder": "​", + "style": "IPY_MODEL_2cd718fb166641d59c8df64cbc637d9c", + "value": "Map: 100%" + } + }, + "31bad1f4c7c047a280d490b854a6e911": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "336ffca0a89e4255a62564ec2600318c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "34f8bcc1d9954fb8ba8ecca7a6bd04cd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_336ffca0a89e4255a62564ec2600318c", + "max": 25000, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2bbd92cebbf445d087bcadf82625c6d5", + "value": 25000 + } + }, + "38fd040b3a0d44d2adf13c2476f4505a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a9548c2f9fd54f73abc3e9c3c0bc9fda", + "placeholder": "​", + "style": "IPY_MODEL_faa9b111dd9745a29bf7494b95619a1b", + "value": "test-00000-of-00001.parquet: 100%" + } + }, + "3a99edeb7d5e43048fdce29b880c19a5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_81eef7f1d0c7461cb443b996a5d5163f", + "max": 20470363, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_b832fdc7655b4f88a15e19fc8381db47", + "value": 20470363 + } + }, + "3cc5521074d3411cbc24d0348d3fc314": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3d6b95a9f8774341be1976f10fb74679": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", 
+ "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3ec913f1b93d4097ad9729156295f9e9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "404088a4057546968f4e8cfc9e7461e1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "47789088bcfa4c96a5fd898812c23d17": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + 
"_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8ac5b5baeefb4078a74d8c8b2fed6d93", + "placeholder": "​", + "style": "IPY_MODEL_4bbb4bf49e50489abc875881958c00aa", + "value": "Map: 100%" + } + }, + "4a025adc548b4fcb8c5637f8f6dabc81": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4bbb4bf49e50489abc875881958c00aa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4c55b2c1daa4497abf9c9f53e23f83b8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4f7b2a1359bf41cab2ed5663643509a6": { + "model_module": "@jupyter-widgets/base", + 
"model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4fe02a8771814e22b8d954cfdd8b9f86": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d52e5c49b80348fbb55ab39ce0a13f7e", + "IPY_MODEL_51f4dabc59d04f0c89128d41d4c184a1", + "IPY_MODEL_803fcea81b7b47fb91fb108e2170fa75" + ], + "layout": "IPY_MODEL_ee5722120e1045e985e5e4ca29a2e192" + } + }, + "51f4dabc59d04f0c89128d41d4c184a1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3d6b95a9f8774341be1976f10fb74679", + "max": 41996509, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3ec913f1b93d4097ad9729156295f9e9", + "value": 41996509 + } + }, + "55cab288802d49efb930c7641b036f44": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "568c0efe6ced432d81f45af4acaa921e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + 
"state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "580a3b71b23f4a72be9e8633c04e9276": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5b346ce1eaf649d195fa7dd6058dd196", + "max": 25000, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d7b4684e53d445e58de1fe155315e093", + "value": 25000 + } + }, + "5b346ce1eaf649d195fa7dd6058dd196": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5ce04c799bb0430386e39af4734e80e6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + 
"margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5f0dfd26cb484695b85d021a5d687503": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b1c37874948c459583a9f33dc77a6f55", + "placeholder": "​", + "style": "IPY_MODEL_24cbed7481774ef793a8f204ba5b604b", + "value": " 25000/25000 [00:23<00:00, 1166.56 examples/s]" + } + }, + "60695bb251124317a897a6fc56b754ac": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + 
"min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6eea630bd65745739ff646fbb172f426": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "716d24b9c1b340a2bf9045b0bf4e7e34": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_47789088bcfa4c96a5fd898812c23d17", + "IPY_MODEL_580a3b71b23f4a72be9e8633c04e9276", + "IPY_MODEL_03f7cfca9e634cf69e3cc70f24832ba3" + ], + "layout": "IPY_MODEL_987a633e24594228b82e162397a63141" + } + }, + "71a7cf52600f4288bd725b7bb93e7299": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, 
+ "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7812641b79ae46d48bc88b4c773344c0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7823f9d79e1d4350b385e6dfef84b021": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c006e4b791204d10a9d8e7fbc4bceb81", + "placeholder": "​", + "style": "IPY_MODEL_31664fd452bb43e3aefb87542c747b74", + "value": "Generating unsupervised split: 100%" + } + }, + "798c00bf640e483cbc4fea744b268461": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + 
"_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d406b18b855d488cac92b9b96073ba43", + "max": 20979968, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_eee89779493748d38c01bb0a74b29e38", + "value": 20979968 + } + }, + "7f78e58f4e9c457a9c2d121759efdb09": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f1574df0debf4e1b8617e24ffcc39e16", + "placeholder": "​", + "style": "IPY_MODEL_a9cbf2bbb1f14894886617ce8e60de12", + "value": " 25000/25000 [00:00<00:00, 104177.91 examples/s]" + } + }, + "803fcea81b7b47fb91fb108e2170fa75": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b29cde97f90f4c489c5cdfd007c96d4f", + "placeholder": "​", + "style": "IPY_MODEL_d8c8ee9f63b14182a9ded152435c510f", + "value": " 42.0M/42.0M [00:00<00:00, 163MB/s]" + } + }, + "806a2b3f4c4c4ba59370a46c0f8faa85": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_38fd040b3a0d44d2adf13c2476f4505a", + "IPY_MODEL_3a99edeb7d5e43048fdce29b880c19a5", + "IPY_MODEL_1dfc470241c44ce1a0f9ae71fdfdbdf6" + ], + "layout": "IPY_MODEL_feb83525a43a4c2d818f3ef1ae69d581" + } + }, + "81eef7f1d0c7461cb443b996a5d5163f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "87de47c8821f423d9efc5c7e85297e32": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2be4c676b5834100b31d7f42ab8bab85", + "max": 25000, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6eea630bd65745739ff646fbb172f426", + "value": 25000 + } + }, + "886443acd2f14cff93059c093f98cc1b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8ac5b5baeefb4078a74d8c8b2fed6d93": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": 
"LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8e862faf12804eaebfd7692db348842b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": 
null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8fcd7e9ed5f54287b5cda0ad52a277f4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "92b9e59f9038485e839598237ec3fd8c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "93dd1cc84f26479caaa1ded80bbea5ff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b3cde05a07b2437b935083d6aac25913", + "max": 7809, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_31bad1f4c7c047a280d490b854a6e911", + "value": 7809 + } + }, + "987a633e24594228b82e162397a63141": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9931cc064e2c400e9830e448c8ef4655": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "99bab394a68140f79def33bc6f6499b2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + 
"grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9b39cdb9f7a14ab4865f360f3c1537dc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_55cab288802d49efb930c7641b036f44", + "placeholder": "​", + "style": "IPY_MODEL_0649702ad9764ad8bf3dfbaa6739686e", + "value": "train-00000-of-00001.parquet: 100%" + } + }, + "9b4b309603db4847a0d94da76db15116": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_568c0efe6ced432d81f45af4acaa921e", + "placeholder": "​", + "style": "IPY_MODEL_f402e8e510d64d0ca4d4ae2c09a7ddfc", + "value": "README.md: 100%" + } + }, + "a9548c2f9fd54f73abc3e9c3c0bc9fda": { + "model_module": "@jupyter-widgets/base", + "model_module_version": 
"1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a9cbf2bbb1f14894886617ce8e60de12": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "aa7697dbb00641f19491f13b1a643197": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b1c37874948c459583a9f33dc77a6f55": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b29cde97f90f4c489c5cdfd007c96d4f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + 
"grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b3cde05a07b2437b935083d6aac25913": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b832fdc7655b4f88a15e19fc8381db47": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "bce88474ca6745da99d79bc07216333c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3cc5521074d3411cbc24d0348d3fc314", + "placeholder": "​", + "style": "IPY_MODEL_4c55b2c1daa4497abf9c9f53e23f83b8", + "value": " 25000/25000 [00:00<00:00, 113831.51 examples/s]" + } + }, + "c006e4b791204d10a9d8e7fbc4bceb81": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d0582455fe3c449dbde19a47561770b4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d1414b66bc0f4a088c5e4551e8f4ee72": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d406b18b855d488cac92b9b96073ba43": { + 
"model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d52e5c49b80348fbb55ab39ce0a13f7e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8fcd7e9ed5f54287b5cda0ad52a277f4", + "placeholder": "​", + "style": "IPY_MODEL_10335d3ada7f428588c4faa3f57bbd51", + "value": "unsupervised-00000-of-00001.parquet: 100%" + } + }, + "d69df0074246435f8481fd803863cdb1": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d7b4684e53d445e58de1fe155315e093": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d8c8ee9f63b14182a9ded152435c510f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "da58efd0f2b5442c8a284a948f2614f8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "da7f0a799616427ba7e93b0080d26d37": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "db0b4bb4c7a642fb9e8d5c7738e90afc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + 
"state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e0e6445a2a774ae89729b7e2fb4a14b3", + "placeholder": "​", + "style": "IPY_MODEL_fdd99c599f9c4ac88c09939ec397ca46", + "value": "Map: 100%" + } + }, + "db1b2639fc4944bcbbdfbdaf9150409f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "db789da312a24fc9944ecb8b617109e7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": 
null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e0e6445a2a774ae89729b7e2fb4a14b3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e3082a5e8a4144f5982ad478d9a54a2c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e85940da7bc24b8bb29ce609ba6e5613": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4f7b2a1359bf41cab2ed5663643509a6", + "placeholder": "​", + "style": "IPY_MODEL_db1b2639fc4944bcbbdfbdaf9150409f", + "value": "Generating test split: 100%" + } + }, + "ebf724c3ad1443e98763dd279e6fc996": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9b4b309603db4847a0d94da76db15116", + "IPY_MODEL_93dd1cc84f26479caaa1ded80bbea5ff", + "IPY_MODEL_0dc109378f1c43a7b18b1be03cce32ce" + ], + "layout": "IPY_MODEL_2d2adb2b7a3b41d28736a8b7aba258b1" + } + }, + "ee5722120e1045e985e5e4ca29a2e192": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + 
"grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "eee89779493748d38c01bb0a74b29e38": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f1574df0debf4e1b8617e24ffcc39e16": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": 
null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f402e8e510d64d0ca4d4ae2c09a7ddfc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f4d35fb98b0048ca8bdbe856d182d561": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e85940da7bc24b8bb29ce609ba6e5613", + "IPY_MODEL_34f8bcc1d9954fb8ba8ecca7a6bd04cd", + "IPY_MODEL_7f78e58f4e9c457a9c2d121759efdb09" + ], + "layout": "IPY_MODEL_2d46765961fe453597645a0b56a9cbc7" + } + }, + "f67b5bbf7ccb475abe41e08316dc5b37": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + 
"grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f8b71d5cb37549fba6559e2d83531319": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5ce04c799bb0430386e39af4734e80e6", + "max": 50000, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_404088a4057546968f4e8cfc9e7461e1", + "value": 50000 + } + }, + "f8d90231390a4211b698e700a66fcb0f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "faa9b111dd9745a29bf7494b95619a1b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fdd4b9937b7744d98ba1163efbe1310b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7823f9d79e1d4350b385e6dfef84b021", + "IPY_MODEL_20cd23dc1cd840c89741957f3fcbfdb8", + "IPY_MODEL_1c07d8f701604da0989d5f8d88d4bbcd" + ], + "layout": "IPY_MODEL_00a9858d90d6430eaab54f9e013f077b" + } + }, + "fdd99c599f9c4ac88c09939ec397ca46": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "feb83525a43a4c2d818f3ef1ae69d581": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/peft/examples/feature_extraction/peft_lora_embedding_semantic_search.py b/peft/examples/feature_extraction/peft_lora_embedding_semantic_search.py new file mode 100644 index 0000000000000000000000000000000000000000..8ed8babb7ea00acf556d16e3abc60c85d650f3d9 --- /dev/null +++ b/peft/examples/feature_extraction/peft_lora_embedding_semantic_search.py @@ -0,0 +1,502 @@ +# Copyright 2023-present the HuggingFace Inc. 
team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import logging +import math +import os +import random +from pathlib import Path + +import datasets +import evaluate +import torch +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import set_seed +from datasets import DatasetDict, load_dataset +from huggingface_hub import HfApi +from torch import nn +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoModel, AutoTokenizer, SchedulerType, default_data_collator, get_scheduler + +from peft import LoraConfig, TaskType, get_peft_model + + +logger = get_logger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Training a PEFT model for Semantic Search task") + parser.add_argument("--dataset_name", type=str, default=None, help="dataset name on HF hub") + parser.add_argument( + "--max_length", + type=int, + default=128, + help=( + "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated," + " sequences shorter will be padded if `--pad_to_max_length` is passed." 
+ ), + ) + parser.add_argument( + "--model_name_or_path", + type=str, + help="Path to pretrained model or model identifier from huggingface.co/models.", + required=True, + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=8, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") + parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], + ) + parser.add_argument( + "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler." 
+ ) + parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument( + "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." + ) + parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + action="store_true", + help="Whether to enable experiment trackers for logging.", + ) + parser.add_argument( + "--report_to", + type=str, + default="all", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` (default) to report to all integrations.' + "Only applicable when `--with_tracking` is passed." + ), + ) + parser.add_argument( + "--sanity_test", + action="store_true", + help="Whether to enable sanity test.", + ) + parser.add_argument( + "--use_peft", + action="store_true", + help="Whether to use PEFT.", + ) + args = parser.parse_args() + + if args.push_to_hub: + assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed." 
+ + return args + + +def save_model_hook(models, weights, output_dir): + for i, model in enumerate(models): + model.save_pretrained(output_dir, state_dict=weights[i]) + # make sure to pop weight so that corresponding model is not saved again + weights.pop() + + +def load_model_hook(models, input_dir): + while len(models) > 0: + model = models.pop() + # pop models so that they are not loaded again + if hasattr(model, "active_adapter") and hasattr(model, "load_adapter"): + model.load_adapter(input_dir, model.active_adapter, is_trainable=True) + + +class AutoModelForSentenceEmbedding(nn.Module): + def __init__(self, model_name, tokenizer, normalize=True): + super().__init__() + + self.model = AutoModel.from_pretrained( + model_name + ) # , quantizaton_config=BitsAndBytesConfig(load_in_8bit=True), device_map={"":0}) + self.normalize = normalize + self.tokenizer = tokenizer + + def forward(self, **kwargs): + model_output = self.model(**kwargs) + embeddings = self.mean_pooling(model_output, kwargs["attention_mask"]) + if self.normalize: + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + return embeddings + + def mean_pooling(self, model_output, attention_mask): + token_embeddings = model_output[0] # First element of model_output contains all token embeddings + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + if name == "model": # see #1892: prevent infinite recursion if class is not initialized + raise + return getattr(self.model, name) + + +def get_cosing_embeddings(query_embs, product_embs): + return torch.sum(query_embs * product_embs, axis=1) + + +def get_loss(cosine_score, labels): + return 
torch.mean(torch.square(labels * (1 - cosine_score) + torch.clamp((1 - labels) * cosine_score, min=0.0))) + + +def main(): + args = parse_args() + + accelerator_kwargs = {"gradient_accumulation_steps": args.gradient_accumulation_steps} + if args.with_tracking: + accelerator_kwargs["log_with"] = args.report_to + accelerator_kwargs["project_dir"] = args.output_dir + accelerator = Accelerator(**accelerator_kwargs) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + api = HfApi(token=args.hub_token) + + # Create repo (repo_name from args or inferred) + repo_name = args.hub_model_id + if repo_name is None: + repo_name = Path(args.output_dir).absolute().name + repo_id = api.create_repo(repo_name, exist_ok=True).repo_id + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + accelerator.wait_for_everyone() + + # get the tokenizer + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) + + # dataset download and preprocessing + if args.sanity_test: + train_dataset = load_dataset("smangrul/amazon_esci", split="train[:1024]") + val_dataset = load_dataset("smangrul/amazon_esci", 
split="validation[:1024]") + + dataset = DatasetDict({"train": train_dataset, "validation": val_dataset}) + else: + dataset = load_dataset(args.dataset_name, revision="main") + + def preprocess_function(examples): + queries = examples["query"] + result = tokenizer(queries, padding="max_length", max_length=70, truncation=True) + result = {f"query_{k}": v for k, v in result.items()} + + products = examples["product_title"] + result_products = tokenizer(products, padding="max_length", max_length=70, truncation=True) + for k, v in result_products.items(): + result[f"product_{k}"] = v + + result["labels"] = examples["relevance_label"] + return result + + processed_datasets = dataset.map( + preprocess_function, + batched=True, + remove_columns=dataset["train"].column_names, + desc="Running tokenizer on dataset", + ) + + # Log a few random samples from the training set: + for index in random.sample(range(len(processed_datasets["train"])), 3): + logger.info(f"Sample {index} of the training set: {processed_datasets['train'][index]}.") + + # base model + model = AutoModelForSentenceEmbedding(args.model_name_or_path, tokenizer) + + if args.use_peft: + # peft config and wrapping + peft_config = LoraConfig( + r=8, + lora_alpha=16, + bias="none", + task_type=TaskType.FEATURE_EXTRACTION, + target_modules=["key", "query", "value"], + ) + model = get_peft_model(model, peft_config) + model.print_trainable_parameters() + + accelerator.print(model) + + # get dataloaders + train_dataloader = DataLoader( + processed_datasets["train"], + shuffle=True, + collate_fn=default_data_collator, + batch_size=args.per_device_train_batch_size, + pin_memory=True, + ) + + eval_dataloader = DataLoader( + processed_datasets["validation"], + shuffle=False, + collate_fn=default_data_collator, + batch_size=args.per_device_eval_batch_size, + pin_memory=True, + ) + + optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate) + + # Scheduler and math around the number of training steps. 
+ overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps, + num_training_steps=args.max_train_steps, + ) + + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # Figure out how many steps we should save the Accelerator states + checkpointing_steps = args.checkpointing_steps + if checkpointing_steps is not None and checkpointing_steps.isdigit(): + checkpointing_steps = int(checkpointing_steps) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. 
+ if args.with_tracking: + experiment_config = vars(args) + # TensorBoard cannot log Enums, need the raw value + experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value + accelerator.init_trackers("peft_semantic_search", experiment_config) + + metric = evaluate.load("roc_auc") + + total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + if args.use_peft: + # saving and loading checkpoints for resuming training + accelerator.register_save_state_pre_hook(save_model_hook) + accelerator.register_load_state_pre_hook(load_model_hook) + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(processed_datasets['train'])}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + + # Only show the progress bar once on each machine. 
+ progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) + completed_steps = 0 + starting_epoch = 0 + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") + accelerator.load_state(args.resume_from_checkpoint) + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] + + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + completed_steps = starting_epoch * num_update_steps_per_epoch + else: + # need to multiply `gradient_accumulation_steps` to reflect real steps + resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + completed_steps = resume_step // args.gradient_accumulation_steps + + # update the progress_bar if load from checkpoint + progress_bar.update(completed_steps) + + for epoch in range(starting_epoch, args.num_train_epochs): + model.train() + total_loss = 0 + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): + with accelerator.accumulate(model): + query_embs = 
model(**{k.replace("query_", ""): v for k, v in batch.items() if "query" in k}) + product_embs = model(**{k.replace("product_", ""): v for k, v in batch.items() if "product" in k}) + loss = get_loss(get_cosing_embeddings(query_embs, product_embs), batch["labels"]) + total_loss += accelerator.reduce(loss.detach().float(), reduction="sum") + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + model.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + completed_steps += 1 + + if (step + 1) % 100 == 0: + logger.info(f"Step: {step + 1}, Loss: {total_loss / (step + 1)}") + if args.with_tracking: + accelerator.log({"train/loss": total_loss / (step + 1)}, step=completed_steps) + + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + output_dir = f"step_{completed_steps}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + + if completed_steps >= args.max_train_steps: + break + + model.eval() + for step, batch in enumerate(eval_dataloader): + with torch.no_grad(): + query_embs = model(**{k.replace("query_", ""): v for k, v in batch.items() if "query" in k}) + product_embs = model(**{k.replace("product_", ""): v for k, v in batch.items() if "product" in k}) + prediction_scores = get_cosing_embeddings(query_embs, product_embs) + prediction_scores, references = accelerator.gather_for_metrics((prediction_scores, batch["labels"])) + metric.add_batch( + prediction_scores=prediction_scores, + references=references, + ) + + result = metric.compute() + result = {f"eval/{k}": v for k, v in result.items()} + # Use accelerator.print to print only on the main process. 
+ accelerator.print(f"epoch {epoch}:", result) + if args.with_tracking: + result["train/epoch_loss"] = total_loss.item() / len(train_dataloader) + accelerator.log(result, step=completed_steps) + + if args.output_dir is not None: + accelerator.wait_for_everyone() + if accelerator.is_main_process: + if isinstance(checkpointing_steps, str): + accelerator.save_state(os.path.join(args.output_dir, f"epoch_{epoch}")) + accelerator.unwrap_model(model).save_pretrained( + args.output_dir, state_dict=accelerator.get_state_dict(accelerator.unwrap_model(model)) + ) + tokenizer.save_pretrained(args.output_dir) + if args.push_to_hub: + commit_message = ( + f"Training in progress epoch {epoch}" + if epoch < args.num_train_epochs - 1 + else "End of training" + ) + api.upload_folder( + repo_id=repo_id, + folder_path=args.output_dir, + commit_message=commit_message, + run_as_future=True, + ) + accelerator.wait_for_everyone() + accelerator.end_training() + + +if __name__ == "__main__": + main() diff --git a/peft/examples/feature_extraction/peft_lora_embedding_semantic_similarity_inference.ipynb b/peft/examples/feature_extraction/peft_lora_embedding_semantic_similarity_inference.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..ded221bbce4417334a5129e6e1deb1cd6859347f --- /dev/null +++ b/peft/examples/feature_extraction/peft_lora_embedding_semantic_similarity_inference.ipynb @@ -0,0 +1,1808 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "3e7b6247", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-06-29 09:08:24,868] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", + "\n", + "===================================BUG REPORT===================================\n", + "Welcome to bitsandbytes. 
For bug reports, please run\n", + "\n", + "python -m bitsandbytes\n", + "\n", + " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n", + "================================================================================\n", + "bin /home/sourab/miniconda3/envs/ml/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda118.so\n", + "CUDA SETUP: CUDA runtime path found: /home/sourab/miniconda3/envs/ml/lib/libcudart.so\n", + "CUDA SETUP: Highest compute capability among GPUs detected: 7.5\n", + "CUDA SETUP: Detected CUDA version 118\n", + "CUDA SETUP: Loading binary /home/sourab/miniconda3/envs/ml/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/sourab/miniconda3/envs/ml/lib/python3.11/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: Found duplicate ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] files: {PosixPath('/home/sourab/miniconda3/envs/ml/lib/libcudart.so'), PosixPath('/home/sourab/miniconda3/envs/ml/lib/libcudart.so.11.0')}.. 
We'll flip a coin and try one of these, in order to fail forward.\n", + "Either way, this might cause trouble in the future:\n", + "If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.\n", + " warn(msg)\n" + ] + } + ], + "source": [ + "import argparse\n", + "import json\n", + "import logging\n", + "import math\n", + "import os\n", + "import random\n", + "from pathlib import Path\n", + "from tqdm import tqdm\n", + "\n", + "import datasets\n", + "from datasets import load_dataset, DatasetDict\n", + "\n", + "import evaluate\n", + "import torch\n", + "from torch import nn\n", + "from torch.utils.data import DataLoader\n", + "\n", + "import transformers\n", + "from transformers import AutoTokenizer, AutoModel, default_data_collator, SchedulerType, get_scheduler\n", + "from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry\n", + "from transformers.utils.versions import require_version\n", + "\n", + "from huggingface_hub import Repository, create_repo\n", + "\n", + "from accelerate import Accelerator\n", + "from accelerate.logging import get_logger\n", + "from accelerate.utils import set_seed\n", + "\n", + "from peft import PeftModel\n", + "\n", + "import hnswlib" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c939b4fd", + "metadata": {}, + "outputs": [], + "source": [ + "class AutoModelForSentenceEmbedding(nn.Module):\n", + " def __init__(self, model_name, tokenizer, normalize=True):\n", + " super(AutoModelForSentenceEmbedding, self).__init__()\n", + "\n", + " self.model = AutoModel.from_pretrained(model_name) # , quantizaton_config=BitsAndBytesConfig(load_in_8bit=True), device_map={\"\":0})\n", + " self.normalize = normalize\n", + " self.tokenizer = tokenizer\n", + "\n", + " def forward(self, **kwargs):\n", + " model_output = 
self.model(**kwargs)\n", + " embeddings = self.mean_pooling(model_output, kwargs[\"attention_mask\"])\n", + " if self.normalize:\n", + " embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)\n", + "\n", + " return embeddings\n", + "\n", + " def mean_pooling(self, model_output, attention_mask):\n", + " token_embeddings = model_output[0] # First element of model_output contains all token embeddings\n", + " input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n", + " return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)\n", + "\n", + " def __getattr__(self, name: str):\n", + " \"\"\"Forward missing attributes to the wrapped module.\"\"\"\n", + " try:\n", + " return super().__getattr__(name) # defer to nn.Module's logic\n", + " except AttributeError:\n", + " return getattr(self.model, name)\n", + "\n", + "\n", + "def get_cosing_embeddings(query_embs, product_embs):\n", + " return torch.sum(query_embs * product_embs, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8b5d9256", + "metadata": {}, + "outputs": [], + "source": [ + "model_name_or_path = \"intfloat/e5-large-v2\"\n", + "peft_model_id = \"smangrul/peft_lora_e5_semantic_search\"\n", + "dataset_name = \"smangrul/amazon_esci\"\n", + "max_length = 70\n", + "batch_size = 256" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f190e1ee", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset parquet (/raid/sourab/.cache/huggingface/datasets/smangrul___parquet/smangrul--amazon_esci-321288cabf0cc045/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "43b84641575e4ce6899a3e6f61d7e126", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00\n", + "\n", + "\n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexproduct_title
00RamPro 10\" All Purpose Utility Air Tires/Wheel...
11MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...
22NEIKO 20601A 14.5 inch Steel Tire Spoon Lever ...
332PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...
44(Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...
.........
476273476273Chanel No.5 Eau Premiere Spray 50ml/1.7oz
476274476274Steve Madden Designer 15 Inch Carry on Suitcas...
476275476275CHANEL Le Lift Creme Yeux, Black, 0.5 Ounce
476276476276Coco Mademoiselle by Chanel for Women - 3.4 oz...
476277476277Chânél No. 5 by Chânél Eau De Parfum Premiere ...
\n", + "

476278 rows × 2 columns

\n", + "" + ], + "text/plain": [ + " index product_title\n", + "0 0 RamPro 10\" All Purpose Utility Air Tires/Wheel...\n", + "1 1 MaxAuto 2-Pack 13x5.00-6 2PLY Turf Mower Tract...\n", + "2 2 NEIKO 20601A 14.5 inch Steel Tire Spoon Lever ...\n", + "3 3 2PK 13x5.00-6 13x5.00x6 13x5x6 13x5-6 2PLY Tur...\n", + "4 4 (Set of 2) 15x6.00-6 Husqvarna/Poulan Tire Whe...\n", + "... ... ...\n", + "476273 476273 Chanel No.5 Eau Premiere Spray 50ml/1.7oz\n", + "476274 476274 Steve Madden Designer 15 Inch Carry on Suitcas...\n", + "476275 476275 CHANEL Le Lift Creme Yeux, Black, 0.5 Ounce\n", + "476276 476276 Coco Mademoiselle by Chanel for Women - 3.4 oz...\n", + "476277 476277 Chânél No. 5 by Chânél Eau De Parfum Premiere ...\n", + "\n", + "[476278 rows x 2 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "product_dataset_for_indexing" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "85840ec6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexproduct_title
3471034710ROK 4-1/2 inch Diamond Saw Blade Set, Pack of 3
277590277590WSGG Medical Goggles, FDA registered, Safety Goggles, Fit Over Glasses, Anti-Fog, Anti-Splash (1 pack)
474000474000iJDMTOY 15W CREE High Power LED Angel Eye Bulbs Compatible With BMW 5 6 7 Series X3 X5 (E39 E60 E63 E65 E53), 7000K Xenon White Headlight Ring Marker Lights
1899718997USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More
208666208666AOGGY Compatible with MacBook Air 13 inch Case A1466/A1369 (2010-2017 Release) Glitter Fluorescent Color Plastic Hard Case, with Older Version MacBook Air 13 inch Keyboard Cover - Gold
326614326614CUTE STONE Little Kitchen Playset, Kitchen Toy Set with Realistic Sound &Light, Play Sink, Cooking Stove with Steam, Play Food and Kitchen Accessories, Great Kitchen Toys for Toddlers Kids
105637105637Milwaukee Electric Tool 2470-21 M12 Cordless Shear Kit, 12 V, Li-Ion
342392342392chouyatou Women's Short Sleeve/Strap Open Bust Bodysuit Shapewear Firm Control Body Shaper (X-Small, Nude Sleeve)
319970319970AMT 256 Hz Medical-Grade Tuning Fork Instrument with Fixed Weights, Non-Magnetic Aluminum Alloy (C 256)
416956416956Timberland HIKER-ROUND 54 BROWN
\n", + "
" + ], + "text/plain": [ + " index \\\n", + "34710 34710 \n", + "277590 277590 \n", + "474000 474000 \n", + "18997 18997 \n", + "208666 208666 \n", + "326614 326614 \n", + "105637 105637 \n", + "342392 342392 \n", + "319970 319970 \n", + "416956 416956 \n", + "\n", + " product_title \n", + "34710 ROK 4-1/2 inch Diamond Saw Blade Set, Pack of 3 \n", + "277590 WSGG Medical Goggles, FDA registered, Safety Goggles, Fit Over Glasses, Anti-Fog, Anti-Splash (1 pack) \n", + "474000 iJDMTOY 15W CREE High Power LED Angel Eye Bulbs Compatible With BMW 5 6 7 Series X3 X5 (E39 E60 E63 E65 E53), 7000K Xenon White Headlight Ring Marker Lights \n", + "18997 USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More \n", + "208666 AOGGY Compatible with MacBook Air 13 inch Case A1466/A1369 (2010-2017 Release) Glitter Fluorescent Color Plastic Hard Case, with Older Version MacBook Air 13 inch Keyboard Cover - Gold \n", + "326614 CUTE STONE Little Kitchen Playset, Kitchen Toy Set with Realistic Sound &Light, Play Sink, Cooking Stove with Steam, Play Food and Kitchen Accessories, Great Kitchen Toys for Toddlers Kids \n", + "105637 Milwaukee Electric Tool 2470-21 M12 Cordless Shear Kit, 12 V, Li-Ion \n", + "342392 chouyatou Women's Short Sleeve/Strap Open Bust Bodysuit Shapewear Firm Control Body Shaper (X-Small, Nude Sleeve) \n", + "319970 AMT 256 Hz Medical-Grade Tuning Fork Instrument with Fixed Weights, Non-Magnetic Aluminum Alloy (C 256) \n", + "416956 Timberland HIKER-ROUND 54 BROWN " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.set_option(\"max_colwidth\", 300)\n", + "product_dataset_for_indexing.sample(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "408b6e00", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { 
+ "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Running tokenizer on dataset: 0%| | 0/476278 [00:00 k\n", + "\n", + " # Query dataset, k - number of the closest elements (returns 2 numpy arrays)\n", + " labels, distances = search_index.knn_query(query_embeddings, k=k)\n", + "\n", + " return [\n", + " (ids_to_products_dict[label], (1 - distance))\n", + " for label, distance in zip(labels[0], distances[0])\n", + " if (1 - distance) >= threshold\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "1c47f12d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='NLP and ML books'\n", + "cosine_sim_score=0.92 product='Machine Learning: A Journey from Beginner to Advanced Including Deep Learning, Scikit-learn and Tensorflow'\n", + "cosine_sim_score=0.91 product='Mastering Machine Learning with scikit-learn'\n", + "cosine_sim_score=0.91 product='Hands-On Machine Learning with Scikit-Learn and TensorFlow: Concepts, Tools, and Techniques to Build Intelligent Systems'\n", + "cosine_sim_score=0.91 product='Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow: Concepts, Tools, and Techniques to Build Intelligent Systems'\n", + "cosine_sim_score=0.91 product='Practical Deep Learning: A Python-Based Introduction'\n", + "cosine_sim_score=0.9 product='Machine Learning: A Hands-On, Project-Based Introduction to Machine Learning for Absolute Beginners: Mastering Engineering ML Systems using Scikit-Learn and TensorFlow'\n", + "cosine_sim_score=0.9 product='Mastering Machine Learning with scikit-learn - Second Edition: Apply effective learning algorithms to real-world problems using scikit-learn'\n", + "cosine_sim_score=0.9 product='Mastering Machine Learning on AWS: Advanced machine learning in Python using SageMaker, Apache Spark, and TensorFlow'\n", + "cosine_sim_score=0.9 product='Machine Learning Algorithms: Naive Bayes'\n", + "cosine_sim_score=0.9 
product='Fundamentals of Machine Learning for Predictive Data Anayltics: Algorithms, Worked Examples, and Case Studies'\n" + ] + } + ], + "source": [ + "query = \"NLP and ML books\"\n", + "k = 10\n", + "query_embeddings = get_query_embeddings(query, model, tokenizer, device)\n", + "search_results = get_nearest_neighbours(k, product_search_index, query_embeddings, ids_to_products_dict, threshold=0.7)\n", + "\n", + "print(f\"{query=}\")\n", + "for product, cosine_sim_score in search_results:\n", + " print(f\"cosine_sim_score={round(cosine_sim_score,2)} {product=}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9e2dd2c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/feature_extraction/requirements.txt b/peft/examples/feature_extraction/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2bb3bc04de57a3b75cdb5f5f856c40daed0fcdf9 --- /dev/null +++ b/peft/examples/feature_extraction/requirements.txt @@ -0,0 +1,10 @@ +peft +accelerate +transformers +datasets==2.18.0 +evaluate +hnswlib +pandas +tqdm +huggingface_hub +wandb \ No newline at end of file diff --git a/peft/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py b/peft/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py new file mode 100644 index 0000000000000000000000000000000000000000..6773e909c329b2e14f2f6166d2494b21d0dee9d9 --- /dev/null +++ b/peft/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py @@ -0,0 +1,195 @@ +import os + +import torch +import torch.nn as nn +import transformers 
+from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +from peft import LoraConfig, get_peft_model + + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" # force to use CUDA GPU device 0 +os.environ["ZE_AFFINITY_MASK"] = "0" # force to use Intel XPU device 0 +# -*- coding: utf-8 -*- +"""Finetune-opt-bnb-peft.ipynb + +Automatically generated by Colaboratory. + +Original file is located at + https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o + +## Fine-tune large models using 🤗 `peft` adapters, `transformers` & `bitsandbytes` + +In this tutorial we will cover how we can fine-tune large language models using the very recent `peft` library and `bitsandbytes` for loading large models in 8-bit. +The fine-tuning method will rely on a recent method called "Low Rank Adapters" (LoRA), instead of fine-tuning the entire model you just have to fine-tune these adapters and load them properly inside the model. +After fine-tuning the model you can also share your adapters on the 🤗 Hub and load them very easily. Let's get started! + +### Install requirements + +First, run the cells below to install the requirements: +""" + + +"""### Model loading + +Here let's load the `opt-6.7b` model, its weights in half-precision (float16) are about 13GB on the Hub! If we load them in 8-bit we would require around 7GB of memory instead. 
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: Object exposing ``named_parameters()`` that yields ``(name, param)`` pairs, where each ``param``
            provides ``numel()`` and ``requires_grad`` (for example a ``torch.nn.Module``).

    The output is a single line with the trainable element count, the total element count and the trainable
    percentage. A model without any parameters is reported as ``0.0`` percent instead of raising
    ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_elements = param.numel()
        all_param += num_elements
        if param.requires_grad:
            trainable_params += num_elements

    # Guard the ratio: with zero parameters the division below would fail.
    trainable_percent = 100 * trainable_params / all_param if all_param else 0.0
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_percent}")
+trainer.train() + +# from huggingface_hub import notebook_login + +# notebook_login() + +# model.push_to_hub("ybelkada/opt-6.7b-lora", use_auth_token=True) + +"""## Load adapters from the Hub + +You can also directly load adapters from the Hub using the commands below: +""" + +# import torch +# from peft import PeftModel, PeftConfig +# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +# +# peft_model_id = "ybelkada/opt-6.7b-lora" +# config = PeftConfig.from_pretrained(peft_model_id) +# model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, quantization_config=BitsAndBytesConfig(load_in_8bit=True), device_map='auto') +# tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path) +# +## Load the Lora model +# model = PeftModel.from_pretrained(model, peft_model_id) +# +# """## Inference +# +# You can then directly use the trained model or the model that you have loaded from the 🤗 Hub for inference as you would do it usually in `transformers`. +# """ +# +batch = tokenizer("Two things are infinite: ", return_tensors="pt").to(model.device) + +model.config.use_cache = False # silence the warnings. Please re-enable for inference! 
+model.eval() + +with torch.amp.autocast(device_type=device_type): + output_tokens = model.generate(**batch, max_new_tokens=50) + +print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=True)) +# model.save('./test.pt') + +# """As you can see by fine-tuning for few steps we have almost recovered the quote from Albert Einstein that is present in the [training data](https://huggingface.co/datasets/Abirate/english_quotes).""" diff --git a/peft/examples/hra_dreambooth/README.md b/peft/examples/hra_dreambooth/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1c93b7f1c9f496cec330bcab04e593df6a71cbf2 --- /dev/null +++ b/peft/examples/hra_dreambooth/README.md @@ -0,0 +1,98 @@ + + +# DreamBooth fine-tuning with HRA + +This guide demonstrates how to use the Householder reflection adaptation (HRA) method to fine-tune Dreambooth with the `stabilityai/stable-diffusion-2-1` model. + +HRA provides a new perspective connecting LoRA to OFT and achieves encouraging performance in various downstream tasks. +HRA adapts a pre-trained model by multiplying each frozen weight matrix with a chain of r learnable Householder reflections (HRs). +HRA can be interpreted as either an OFT adapter or an adaptive LoRA. +Consequently, it harnesses the advantages of both strategies, reducing parameters and computation costs while penalizing the loss of pre-training knowledge. +For further details on HRA, please consult the [original HRA paper](https://huggingface.co/papers/2405.17484). + +In this guide we provide a Dreambooth fine-tuning script that is available in [PEFT's GitHub repo examples](https://github.com/huggingface/peft/tree/main/examples/hra_dreambooth). This implementation is adapted from [peft's boft_dreambooth](https://github.com/huggingface/peft/tree/main/examples/boft_dreambooth). + +You can try it out and fine-tune on your custom images. 
+ +## Set up your environment + +Start by cloning the PEFT repository: + +```bash +git clone --recursive https://github.com/huggingface/peft +``` + +Navigate to the directory containing the training scripts for fine-tuning Dreambooth with HRA: + +```bash +cd peft/examples/hra_dreambooth +``` + +Set up your environment: install PEFT, and all the required libraries. At the time of writing this guide we recommend installing PEFT from source. The following environment setup should work on A100 and H100: + +```bash +conda create --name peft python=3.10 +conda activate peft +conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=11.8 -c pytorch -c nvidia +conda install xformers -c xformers +pip install -r requirements.txt +pip install git+https://github.com/huggingface/peft +``` + +## Download the data + +The [dreambooth](https://github.com/google/dreambooth) dataset is cloned automatically into the following structure when you run the training script. + +``` +hra_dreambooth +├── data +│ └── dreambooth +│ └── dataset +│ ├── backpack +│ └── backpack_dog +│ ... +``` + +You can also put your custom images into `hra_dreambooth/data/dreambooth/dataset`. + +## Fine-tune Dreambooth with HRA + +```bash +class_idx=0 +bash ./train_dreambooth.sh $class_idx +``` + +where the `$class_idx` corresponds to different subjects ranging from 0 to 29. + +Launch the training script with `accelerate` and pass hyperparameters, as well as HRA-specific arguments to it such as: + +- `use_hra`: Enables HRA in the training script. +- `hra_r`: the number of HRs (i.e., r) across different layers, expressed in `int`. +As r increases, the number of trainable parameters increases, which generally leads to improved performance. +However, this also results in higher memory consumption and longer computation times. +Therefore, r is usually set to 8. +**Note**, please set r to an even number to avoid potential issues during initialization. 
+- `hra_apply_GS`: Applies Gram-Schmidt orthogonalization. Default is `false`. +- `hra_bias`: specify if the `bias` parameters should be trained. Can be `none`, `all` or `hra_only`. + +If you are running this script on Windows, you may need to set the `--num_dataloader_workers` to 0. + +To learn more about DreamBooth fine-tuning with prior-preserving loss, check out the [Diffusers documentation](https://huggingface.co/docs/diffusers/training/dreambooth#finetuning-with-priorpreserving-loss). + +## Generate images with the fine-tuned model + +To generate images with the fine-tuned model, simply run the jupyter notebook `dreambooth_inference.ipynb` for visualization with `jupyter notebook` under `./examples/hra_dreambooth`. diff --git a/peft/examples/hra_dreambooth/a_purple_qwe_backpack.png b/peft/examples/hra_dreambooth/a_purple_qwe_backpack.png new file mode 100644 index 0000000000000000000000000000000000000000..7ccda8db974ffe842d0f520df3e852648fde2eea --- /dev/null +++ b/peft/examples/hra_dreambooth/a_purple_qwe_backpack.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3854fc16938a9c0b77ff7172e9dbbbd03c2ff137f9e5c277d0716218f7638411 +size 477707 diff --git a/peft/examples/hra_dreambooth/dreambooth_inference.ipynb b/peft/examples/hra_dreambooth/dreambooth_inference.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..fb80b1cb2f8e4e7d5d37fddd148408c0c09e4e7b --- /dev/null +++ b/peft/examples/hra_dreambooth/dreambooth_inference.ipynb @@ -0,0 +1,222 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 19, + "id": "acab479f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from PIL import Image\n", + "\n", + "import torch\n", + "from accelerate.logging import get_logger\n", + "from diffusers import StableDiffusionPipeline\n", + "from diffusers.utils import check_min_version\n", + "\n", + "from peft import PeftModel\n", + "\n", + "\n", + "# Will error if the minimal version of diffusers 
def get_hra_sd_pipeline(
    ckpt_dir, base_model_name_or_path=None, epoch=None, dtype=torch.float32, device="cuda", adapter_name="default"
):
    """
    Build a Stable Diffusion pipeline and attach the adapter checkpoint stored under ``ckpt_dir``.

    Args:
        ckpt_dir: Root directory that contains ``unet/<epoch>/<adapter_name>`` and, optionally,
            ``text_encoder/<epoch>/<adapter_name>``.
        base_model_name_or_path: Base model handed to ``StableDiffusionPipeline.from_pretrained``. Required.
        epoch: Name of the checkpoint sub-directory (the training step/epoch that was saved). Required.
        dtype: ``torch`` dtype used to load the pipeline. For ``float16``/``bfloat16`` the UNet and the text encoder
            are cast to this dtype again after the adapter has been attached.
        device: Device the pipeline is moved to.
        adapter_name: Name under which the adapter is registered and the leaf directory it is read from.

    Returns:
        The pipeline with the adapter loaded, placed on ``device``.

    Raises:
        ValueError: If ``base_model_name_or_path`` or ``epoch`` is not given.
    """
    if base_model_name_or_path is None:
        raise ValueError("Please specify the base model name or path")
    # The previous default was the `int` type object itself, which silently produced a path such as
    # "unet/<class 'int'>"; fail early with a clear message instead.
    if epoch is None:
        raise ValueError("Please specify the epoch (training step) of the checkpoint to load")

    pipe = StableDiffusionPipeline.from_pretrained(
        base_model_name_or_path, torch_dtype=dtype, requires_safety_checker=False
    ).to(device)

    load_adapter(pipe, ckpt_dir, epoch, adapter_name)

    if dtype in (torch.float16, torch.bfloat16):
        # Cast to the dtype that was actually requested: `.half()` would force float16 even for bfloat16.
        # NOTE(review): presumably needed because freshly attached adapter weights may not match `dtype` - confirm.
        pipe.unet.to(dtype)
        pipe.text_encoder.to(dtype)

    pipe.to(device)
    return pipe


def load_adapter(pipe, ckpt_dir, epoch, adapter_name="default"):
    """
    Attach the adapter saved for ``epoch`` to the pipeline's UNet and, if a checkpoint exists, its text encoder.

    Args:
        pipe: Pipeline whose ``unet`` and ``text_encoder`` attributes are updated in place.
        ckpt_dir: Root directory of the saved checkpoints.
        epoch: Checkpoint sub-directory below ``unet/`` and ``text_encoder/``.
        adapter_name: Adapter name; also the leaf directory the weights are read from.
    """
    unet_sub_dir = os.path.join(ckpt_dir, f"unet/{epoch}", adapter_name)
    text_encoder_sub_dir = os.path.join(ckpt_dir, f"text_encoder/{epoch}", adapter_name)

    # A module that is already a PeftModel receives an additional adapter; otherwise it is wrapped first.
    if isinstance(pipe.unet, PeftModel):
        pipe.unet.load_adapter(unet_sub_dir, adapter_name=adapter_name)
    else:
        pipe.unet = PeftModel.from_pretrained(pipe.unet, unet_sub_dir, adapter_name=adapter_name)

    # The text encoder checkpoint is optional: it is only loaded when its directory exists.
    if os.path.exists(text_encoder_sub_dir):
        if isinstance(pipe.text_encoder, PeftModel):
            pipe.text_encoder.load_adapter(text_encoder_sub_dir, adapter_name=adapter_name)
        else:
            pipe.text_encoder = PeftModel.from_pretrained(
                pipe.text_encoder, text_encoder_sub_dir, adapter_name=adapter_name
            )


def set_adapter(pipe, adapter_name):
    """
    Activate ``adapter_name`` on the pipeline's UNet and on its text encoder when that is a ``PeftModel``.
    """
    pipe.unet.set_adapter(adapter_name)
    if isinstance(pipe.text_encoder, PeftModel):
        pipe.text_encoder.set_adapter(adapter_name)
"execute_result" + } + ], + "source": [ + "# This is an example.\n", + "example_image = Image.open(\"./a_purple_qwe_backpack.png\")\n", + "example_image" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "llama", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/hra_dreambooth/requirements.txt b/peft/examples/hra_dreambooth/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3ab91a6f01e839f36bbab8c65f2050544991910 --- /dev/null +++ b/peft/examples/hra_dreambooth/requirements.txt @@ -0,0 +1,12 @@ +transformers==4.55.0 +accelerate==1.9.0 +evaluate +tqdm +datasets==4.0.0 +diffusers==0.34.0 +Pillow +huggingface_hub +safetensors +ipykernel +ipywidgets +wandb==0.21.0 \ No newline at end of file diff --git a/peft/examples/hra_dreambooth/train_dreambooth.py b/peft/examples/hra_dreambooth/train_dreambooth.py new file mode 100644 index 0000000000000000000000000000000000000000..4ec885af5435c0f893498181d53eb0674e402a62 --- /dev/null +++ b/peft/examples/hra_dreambooth/train_dreambooth.py @@ -0,0 +1,621 @@ +#!/usr/bin/env python +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# The implementation is based on "Bridging The Gap between Low-rank and Orthogonal +# Adaptation via Householder Reflection Adaptation" (https://huggingface.co/papers/2405.17484). + +import hashlib +import itertools +import logging +import math +import os +from contextlib import nullcontext +from pathlib import Path + +import datasets +import diffusers +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import ProjectConfiguration, set_seed +from diffusers import ( + AutoencoderKL, + DDIMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + UNet2DConditionModel, +) +from diffusers.optimization import get_scheduler +from diffusers.utils import check_min_version +from diffusers.utils.import_utils import is_xformers_available +from huggingface_hub import Repository +from tqdm.auto import tqdm +from transformers import AutoTokenizer +from utils.args_loader import ( + get_full_repo_name, + import_model_class_from_model_name_or_path, + parse_args, +) +from utils.dataset import DreamBoothDataset, PromptDataset, collate_fn +from utils.tracemalloc import TorchTracemalloc, b2mb + +from peft import HRAConfig, get_peft_model + + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
def save_adaptor(accelerator, step, unet, text_encoder, args):
    """
    Save the UNet adapter for ``step`` and, when ``args.train_text_encoder`` is set, the text encoder adapter.

    Each model is unwrapped through ``accelerator`` and written with ``save_pretrained`` to
    ``<args.output_dir>/unet/<step>`` or ``<args.output_dir>/text_encoder/<step>``, using the state dict that
    ``accelerator.get_state_dict`` returns for the wrapped model.
    """

    def _save(component, wrapped_model):
        # Unwrap first, then hand the accelerator-gathered weights to `save_pretrained`.
        bare_model = accelerator.unwrap_model(wrapped_model)
        target_dir = os.path.join(args.output_dir, f"{component}/{step}")
        weights = accelerator.get_state_dict(wrapped_model)
        bare_model.save_pretrained(target_dir, state_dict=weights)

    _save("unet", unet)
    if not args.train_text_encoder:
        return
    _save("text_encoder", text_encoder)
+ if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1: + raise ValueError( + "Gradient accumulation is not supported when training the text encoder in distributed training. " + "Please set gradient_accumulation_steps to 1. This feature will be supported in the future." + ) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + global_seed = hash(args.run_name) % (2**32) + set_seed(global_seed) + + # Generate class images if prior preservation is enabled. 
+ if args.with_prior_preservation: + class_images_dir = Path(args.class_data_dir) + if not class_images_dir.exists(): + class_images_dir.mkdir(parents=True) + cur_class_images = len(list(class_images_dir.iterdir())) + + if cur_class_images < args.num_class_images: + torch_dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32 + if args.prior_generation_precision == "fp32": + torch_dtype = torch.float32 + elif args.prior_generation_precision == "fp16": + torch_dtype = torch.float16 + elif args.prior_generation_precision == "bf16": + torch_dtype = torch.bfloat16 + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + torch_dtype=torch_dtype, + safety_checker=None, + revision=args.revision, + ) + pipeline.set_progress_bar_config(disable=True) + + num_new_images = args.num_class_images - cur_class_images + logger.info(f"Number of class images to sample: {num_new_images}.") + + sample_dataset = PromptDataset(args.class_prompt, num_new_images) + sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) + + sample_dataloader = accelerator.prepare(sample_dataloader) + pipeline.to(accelerator.device) + + for example in tqdm( + sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process + ): + images = pipeline(example["prompt"]).images + + for i, image in enumerate(images): + hash_image = hashlib.sha1(image.tobytes()).hexdigest() + image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" + image.save(image_filename) + + del pipeline + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + if args.hub_model_id is None: + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) + else: + repo_name = args.hub_model_id + 
repo = Repository(args.output_dir, clone_from=repo_name) # noqa: F841 + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + # Load the tokenizer + if args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False) + elif args.pretrained_model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="tokenizer", + revision=args.revision, + use_fast=False, + ) + + # import correct text encoder class + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision) + + # Load scheduler and models + noise_scheduler = DDIMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + + text_encoder = text_encoder_cls.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision + ) + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision) + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision + ) + + if args.use_hra: + config = HRAConfig( + r=args.hra_r, + apply_GS=args.hra_apply_GS, + target_modules=UNET_TARGET_MODULES, + bias=args.hra_bias, + ) + unet = get_peft_model(unet, config, adapter_name=args.run_name) + unet.print_trainable_parameters() + + vae.requires_grad_(False) + unet.train() + + if args.train_text_encoder and args.use_hra: + config = HRAConfig( + r=args.hra_r, + apply_GS=args.hra_apply_GS, + target_modules=UNET_TARGET_MODULES, + bias=args.hra_bias, + ) + text_encoder = get_peft_model(text_encoder, config, adapter_name=args.run_name) + 
text_encoder.print_trainable_parameters() + text_encoder.train() + else: + text_encoder.requires_grad_(False) + + # For mixed precision training we cast the text_encoder and vae weights to half-precision + # as these models are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Move unet, vae and text_encoder to device and cast to weight_dtype + unet.to(accelerator.device, dtype=weight_dtype) + vae.to(accelerator.device, dtype=weight_dtype) + text_encoder.to(accelerator.device, dtype=weight_dtype) + + if args.enable_xformers_memory_efficient_attention: + if accelerator.device.type == "xpu": + logger.warning("XPU hasn't support xformers yet, ignore it.") + elif is_xformers_available(): + unet.enable_xformers_memory_efficient_attention() + else: + raise ValueError("xformers is not available. Make sure it is installed correctly") + + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + # below fails when using hra so commenting it out + if args.train_text_encoder and not args.use_hra: + text_encoder.gradient_checkpointing_enable() + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if args.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." 
+ ) + + optimizer_class = bnb.optim.AdamW8bit + else: + optimizer_class = torch.optim.AdamW + + # Optimizer creation + params_to_optimize = [param for param in unet.parameters() if param.requires_grad] + + if args.train_text_encoder: + params_to_optimize += [param for param in text_encoder.parameters() if param.requires_grad] + + optimizer = optimizer_class( + params_to_optimize, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + # Download the official dreambooth dataset from the official repository: https://github.com/google/dreambooth.git + data_path = os.path.join(os.getcwd(), "data", "dreambooth") + if not os.path.exists(data_path): + os.makedirs(os.path.join(os.getcwd(), "data"), exist_ok=True) + os.system(f"git clone https://github.com/google/dreambooth.git '{data_path}'") + + # Dataset and DataLoaders creation: + train_dataset = DreamBoothDataset( + instance_data_root=args.instance_data_dir, + instance_prompt=args.instance_prompt, + class_data_root=args.class_data_dir if args.with_prior_preservation else None, + class_prompt=args.class_prompt, + tokenizer=tokenizer, + size=args.resolution, + center_crop=args.center_crop, + ) + + train_dataloader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.train_batch_size, + shuffle=True, + collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation), + num_workers=args.num_dataloader_workers, + ) + + # Scheduler and math around the number of training steps. 
+ overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + num_cycles=args.lr_num_cycles, + power=args.lr_power, + ) + + # Prepare everything with our `accelerator`. + if args.train_text_encoder: + unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, text_encoder, optimizer, train_dataloader, lr_scheduler + ) + else: + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, optimizer, train_dataloader, lr_scheduler + ) + + # For mixed precision training we cast the text_encoder and vae weights to half-precision + # as these models are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Move vae and text_encoder to device and cast to weight_dtype + vae.to(accelerator.device, dtype=weight_dtype) + if not args.train_text_encoder: + text_encoder.to(accelerator.device, dtype=weight_dtype) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. 
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + if args.report_to == "wandb": + accelerator.init_trackers(args.wandb_project_name, config=vars(args), init_kwargs=wandb_init) + else: + accelerator.init_trackers(args.project_name, config=vars(args)) + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num batches each epoch = {len(train_dataloader)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + global_step = 0 + first_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint != "latest": + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + path = dirs[-1] if len(dirs) > 0 else None + accelerator.print(f"Resuming from checkpoint {path}") + accelerator.load_state(os.path.join(args.output_dir, path)) + global_step = int(path.split("-")[1]) + + resume_global_step = global_step * args.gradient_accumulation_steps + first_epoch = resume_global_step // num_update_steps_per_epoch + resume_step = resume_global_step % num_update_steps_per_epoch + + # Only show the progress bar once on each machine. 
+ progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process) + progress_bar.set_description("Steps") + + if args.train_text_encoder: + text_encoder.train() + + for epoch in range(first_epoch, args.num_train_epochs): + unet.train() + + with TorchTracemalloc() if not args.no_tracemalloc else nullcontext() as tracemalloc: + for step, batch in enumerate(train_dataloader): + # Skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: + if step % args.gradient_accumulation_steps == 0: + progress_bar.update(1) + if args.report_to == "wandb": + accelerator.print(progress_bar) + continue + + with accelerator.accumulate(unet): + # Convert images to latent space + latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample() + latents = latents * vae.config.scaling_factor + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device + ) + timesteps = timesteps.long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Get the text embedding for conditioning + encoder_hidden_states = text_encoder(batch["input_ids"])[0] + + # Predict the noise residual + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + # Get the target for loss depending on the prediction type + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + + if 
args.with_prior_preservation: + # Chunk the noise and model_pred into two parts and compute the loss on each part separately. + model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) + target, target_prior = torch.chunk(target, 2, dim=0) + + # Compute instance loss + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + + # Compute prior loss + prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") + + # Add the prior loss to the instance loss. + loss = loss + args.prior_loss_weight * prior_loss + else: + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + + accelerator.backward(loss) + + if accelerator.sync_gradients: + params_to_clip = ( + itertools.chain(unet.parameters(), text_encoder.parameters()) + if args.train_text_encoder + else unet.parameters() + ) + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + if args.report_to == "wandb": + accelerator.print(progress_bar) + global_step += 1 + + if global_step % args.checkpointing_steps == 0 and global_step != 0: + if accelerator.is_main_process: + save_adaptor(accelerator, global_step, unet, text_encoder, args) + + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + if ( + args.validation_prompt is not None + and (step + num_update_steps_per_epoch * epoch) % args.validation_steps == 0 + and global_step > 10 + ): + unet.eval() + + logger.info( + f"Running validation... \n Generating {len(validation_prompts)} images with prompt:" + f" {validation_prompts[0]}, ......" 
+ ) + # create pipeline + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + safety_checker=None, + revision=args.revision, + ) + # set `keep_fp32_wrapper` to True because we do not want to remove + # mixed precision hooks while we are still training + pipeline.unet = accelerator.unwrap_model(unet, keep_fp32_wrapper=True) + pipeline.text_encoder = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + # run inference + if args.seed is not None: + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) + else: + generator = None + + images = [] + val_img_dir = os.path.join( + args.output_dir, + f"validation/{global_step}", + args.run_name, + ) + os.makedirs(val_img_dir, exist_ok=True) + + for val_promot in validation_prompts: + image = pipeline(val_promot, num_inference_steps=50, generator=generator).images[0] + image.save(os.path.join(val_img_dir, f"{'_'.join(val_promot.split(' '))}.png"[1:])) + images.append(image) + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + import wandb + + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{i}: {validation_prompts[i]}") + for i, image in enumerate(images) + ] + } + ) + + del pipeline + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() + if global_step >= args.max_train_steps: + break + + # Printing the device memory usage details such as allocated memory, peak memory, and total memory usage + if not args.no_tracemalloc: + accelerator.print( + f"{accelerator.device.type.upper()} Memory before entering 
#!/usr/bin/env bash
# Fine-tune Stable Diffusion with HRA on a single DreamBooth subject.
#
# Usage: bash train_dreambooth.sh CLASS_IDX
#   CLASS_IDX  zero-based index into SUBJECT_NAMES / CLASS_TOKENs below.

# Abort on the first failing command, on unset variables and on pipeline errors.
set -euo pipefail

CLASS_IDX=${1:?"usage: $0 CLASS_IDX"}

# Define the UNIQUE_TOKEN, CLASS_TOKENs, and SUBJECT_NAMES
UNIQUE_TOKEN="qwe"

SUBJECT_NAMES=(
    "backpack" "backpack_dog" "bear_plushie" "berry_bowl" "can"
    "candle" "cat" "cat2" "clock" "colorful_sneaker"
    "dog" "dog2" "dog3" "dog5" "dog6"
    "dog7" "dog8" "duck_toy" "fancy_boot" "grey_sloth_plushie"
    "monster_toy" "pink_sunglasses" "poop_emoji" "rc_car" "red_cartoon"
    "robot_toy" "shiny_sneaker" "teapot" "vase" "wolf_plushie"
)

CLASS_TOKENs=(
    "backpack" "backpack" "stuffed animal" "bowl" "can"
    "candle" "cat" "cat" "clock" "sneaker"
    "dog" "dog" "dog" "dog" "dog"
    "dog" "dog" "toy" "boot" "stuffed animal"
    "toy" "glasses" "toy" "toy" "cartoon"
    "toy" "sneaker" "teapot" "vase" "stuffed animal"
)

# Without this check an invalid index silently produces empty subject/class
# names and the training run is launched against non-existent directories.
if ! [[ $CLASS_IDX =~ ^(0|[1-9][0-9]*)$ ]] || (( CLASS_IDX >= ${#SUBJECT_NAMES[@]} )); then
    echo "error: CLASS_IDX must be an integer in [0, $(( ${#SUBJECT_NAMES[@]} - 1 ))], got '${CLASS_IDX}'" >&2
    exit 1
fi

CLASS_TOKEN=${CLASS_TOKENs[$CLASS_IDX]}
SELECTED_SUBJECT=${SUBJECT_NAMES[$CLASS_IDX]}

# Scenes shared by every subject.
COMMON_SCENES=(
    "in the jungle"
    "in the snow"
    "on the beach"
    "on a cobblestone street"
    "on top of pink fabric"
    "on top of a wooden floor"
    "with a city in the background"
    "with a mountain in the background"
    "with a blue house in the background"
    "on top of a purple rug in a forest"
)

# Extra scenes for inanimate objects.
OBJECT_SCENES=(
    "with a wheat field in the background"
    "with a tree and autumn leaves in the background"
    "with the Eiffel Tower in the background"
    "floating on top of water"
    "floating in an ocean of milk"
    "on top of green grass with sunflowers around it"
    "on top of a mirror"
    "on top of the sidewalk in a crowded street"
    "on top of a dirt road"
    "on top of a white rug"
)

# Extra scenes for the remaining subjects (the cat and dog entries).
LIVE_SUBJECT_SCENES=(
    "wearing a red hat"
    "wearing a santa hat"
    "wearing a rainbow scarf"
    "wearing a black top hat and a monocle"
    "in a chef outfit"
    "in a firefighter outfit"
    "in a police outfit"
    "wearing pink glasses"
    "wearing a yellow shirt"
    "in a purple wizard outfit"
)

# Property prompts appended for every subject.
ATTRIBUTES=("red" "purple" "shiny" "wet" "cube shaped")

# Indices listed here are the inanimate-object subjects.
if [[ $CLASS_IDX =~ ^(0|1|2|3|4|5|8|9|17|18|19|20|21|22|23|24|25|26|27|28|29)$ ]]; then
    SCENES=("${COMMON_SCENES[@]}" "${OBJECT_SCENES[@]}")
else
    SCENES=("${COMMON_SCENES[@]}" "${LIVE_SUBJECT_SCENES[@]}")
fi

# PROMPT_LIST carries the unique token and a trailing period; prompt_test_list
# is the same prompt without either. prompt_test_list is not referenced again
# in this script; it is kept so the variable set matches the original.
PROMPT_LIST=()
prompt_test_list=()
for scene in "${SCENES[@]}"; do
    PROMPT_LIST+=("a ${UNIQUE_TOKEN} ${CLASS_TOKEN} ${scene}.")
    prompt_test_list+=("a ${CLASS_TOKEN} ${scene}")
done
for attribute in "${ATTRIBUTES[@]}"; do
    PROMPT_LIST+=("a ${attribute} ${UNIQUE_TOKEN} ${CLASS_TOKEN}.")
    prompt_test_list+=("a ${attribute} ${CLASS_TOKEN}")
done

# All prompts are joined into ONE space-separated string and passed as a single
# argument. NOTE(review): presumably the training script splits it back into
# individual prompts -- confirm against train_dreambooth.py.
VALIDATION_PROMPT="${PROMPT_LIST[*]}"
INSTANCE_PROMPT="a photo of ${UNIQUE_TOKEN} ${CLASS_TOKEN}"
CLASS_PROMPT="a photo of ${CLASS_TOKEN}"

export MODEL_NAME="stabilityai/stable-diffusion-2-1"

PEFT_TYPE="hra"
HRA_R=8

export PROJECT_NAME="dreambooth_${PEFT_TYPE}"
export RUN_NAME="${SELECTED_SUBJECT}_${PEFT_TYPE}_${HRA_R}"
export INSTANCE_DIR="./data/dreambooth/dataset/${SELECTED_SUBJECT}"
export CLASS_DIR="./data/class_data/${CLASS_TOKEN}"
export OUTPUT_DIR="./data/output/${PEFT_TYPE}"

# Every expansion is quoted: CLASS_TOKEN (and therefore CLASS_DIR and the
# prompts) can contain spaces, e.g. "stuffed animal".
accelerate launch train_dreambooth.py \
    --pretrained_model_name_or_path="$MODEL_NAME" \
    --instance_data_dir="$INSTANCE_DIR" \
    --class_data_dir="$CLASS_DIR" \
    --output_dir="$OUTPUT_DIR" \
    --project_name="$PROJECT_NAME" \
    --run_name="$RUN_NAME" \
    --with_prior_preservation \
    --prior_loss_weight=1.0 \
    --instance_prompt="$INSTANCE_PROMPT" \
    --validation_prompt="$VALIDATION_PROMPT" \
    --class_prompt="$CLASS_PROMPT" \
    --resolution=512 \
    --train_batch_size=1 \
    --num_dataloader_workers=2 \
    --lr_scheduler="constant" \
    --lr_warmup_steps=0 \
    --num_class_images=200 \
    --use_hra \
    --hra_r="$HRA_R" \
    --hra_bias="hra_only" \
    --learning_rate=5e-3 \
    --max_train_steps=510 \
    --checkpointing_steps=200 \
    --validation_steps=200 \
    --enable_xformers_memory_efficient_attention \
    --report_to="none"
def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: Optional[str]):
    """Return the text-encoder class named in a pretrained pipeline's config.

    Loads the ``text_encoder`` sub-folder config of the given model and maps the
    first entry of its ``architectures`` list to the matching model class.

    Args:
        pretrained_model_name_or_path: Model path or identifier passed to
            ``PretrainedConfig.from_pretrained``.
        revision: Model revision passed to ``PretrainedConfig.from_pretrained``.

    Returns:
        ``CLIPTextModel`` or ``RobertaSeriesModelWithTransformation``.

    Raises:
        ValueError: If the config declares no architecture, or one that is not
            handled here.
    """
    text_encoder_config = PretrainedConfig.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="text_encoder",
        revision=revision,
    )
    architectures = text_encoder_config.architectures
    if not architectures:
        # Fail with a readable message instead of a TypeError/IndexError on `[0]`.
        raise ValueError(
            f"The text encoder config of {pretrained_model_name_or_path} does not declare any architecture."
        )
    model_class = architectures[0]

    # The model classes are imported lazily so only the needed one is loaded.
    if model_class == "CLIPTextModel":
        from transformers import CLIPTextModel

        return CLIPTextModel
    elif model_class == "RobertaSeriesModelWithTransformation":
        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation

        return RobertaSeriesModelWithTransformation
    else:
        raise ValueError(f"{model_class} is not supported.")


def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
    """Build the ``<namespace>/<model_id>`` repository name.

    Args:
        model_id: Repository name without namespace.
        organization: Namespace to use. When ``None`` the name returned by
            ``whoami(token)`` is used instead.
        token: Hub token. When ``None`` it is read via ``HfFolder.get_token()``.

    Returns:
        The namespaced repository name as a string.
    """
    if token is None:
        token = HfFolder.get_token()
    if organization is None:
        username = whoami(token)["name"]
        return f"{username}/{model_id}"
    else:
        return f"{organization}/{model_id}"


def parse_args(input_args=None):
    """Parse the command-line options of the HRA DreamBooth training script.

    Args:
        input_args: Optional list of argument strings. When ``None`` the
            arguments are taken from ``sys.argv``.

    Returns:
        The populated ``argparse.Namespace``. ``local_rank`` is overridden by
        the ``LOCAL_RANK`` environment variable when that variable is set.

    Raises:
        ValueError: If ``--with_prior_preservation`` is given without
            ``--class_data_dir`` or without ``--class_prompt``.
    """
    parser = argparse.ArgumentParser(description="Simple example of a Dreambooth training script.")
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default=None,
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--revision",
        type=str,
        default=None,
        required=False,
        help="Revision of pretrained model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--tokenizer_name",
        type=str,
        default=None,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--instance_data_dir",
        type=str,
        default=None,
        required=True,
        help="A folder containing the training data of instance images.",
    )
    parser.add_argument(
        "--class_data_dir",
        type=str,
        default=None,
        required=False,
        help="A folder containing the training data of class images.",
    )
    parser.add_argument(
        "--instance_prompt",
        type=str,
        default=None,
        required=True,
        help="The prompt with identifier specifying the instance",
    )
    parser.add_argument(
        "--class_prompt",
        type=str,
        default=None,
        help="The prompt to specify images in the same class as provided instance images.",
    )
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
        action="store_true",
        help="Flag to add prior preservation loss.",
    )
    parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.")
    parser.add_argument(
        "--num_class_images",
        type=int,
        default=100,
        help=(
            "Minimal class images for prior preservation loss. If there are not enough images already present in"
            " class_data_dir, additional images will be sampled with class_prompt."
        ),
    )
    parser.add_argument(
        "--validation_prompt",
        nargs="+",
        help="A prompt that is used during validation to verify that the model is learning.",
    )
    parser.add_argument(
        "--num_validation_images",
        type=int,
        default=4,
        help="Number of images that should be generated during validation with `validation_prompt`.",
    )
    parser.add_argument(
        "--validation_steps",
        type=int,
        default=500,
        help=(
            "Run dreambooth validation every X steps. Dreambooth validation consists of running the prompt"
            " `args.validation_prompt` multiple times: `args.num_validation_images`."
        ),
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="text-inversion-model",
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
    parser.add_argument(
        "--resolution",
        type=int,
        default=512,
        help=(
            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
            " resolution"
        ),
    )
    parser.add_argument(
        "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution"
    )
    parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder")

    parser.add_argument(
        "--set_grads_to_none",
        action="store_true",
        help=(
            "Save more memory by setting grads to None instead of zero. Be aware, that this changes certain"
            " behaviors, so disable this argument if it causes any problems. More info:"
            " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html"
        ),
    )

    # hra args
    parser.add_argument("--use_hra", action="store_true", help="Whether to use HRA for parameter efficient tuning.")
    parser.add_argument("--hra_r", type=int, default=8, help="The rank of HRA across different layers.")
    parser.add_argument(
        "--hra_apply_GS", default=False, action="store_true", help="Whether to apply Gram-Schmidt orthogonalization."
    )
    parser.add_argument(
        "--hra_bias",
        type=str,
        default="none",
        # Reject typos at parse time; these are the values the help text lists.
        choices=["none", "all", "hra_only"],
        help="Bias type for HRA. Can be 'none', 'all' or 'hra_only', only used if use_hra is True.",
    )
    parser.add_argument(
        "--num_dataloader_workers", type=int, default=1, help="Num of workers for the training dataloader."
    )
    parser.add_argument(
        "--no_tracemalloc",
        default=False,
        action="store_true",
        help="Flag to stop memory allocation tracing during training. This could speed up training on Windows.",
    )

    parser.add_argument(
        "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
    )
    parser.add_argument(
        "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images."
    )
    parser.add_argument("--num_train_epochs", type=int, default=1)
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=None,
        help="Total number of training steps to perform.  If provided, overrides num_train_epochs.",
    )
    parser.add_argument(
        "--checkpointing_steps",
        type=int,
        default=500,
        help=(
            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
            " training using `--resume_from_checkpoint`."
        ),
    )
    parser.add_argument(
        "--resume_from_checkpoint",
        type=str,
        default=None,
        help=(
            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
        ),
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--gradient_checkpointing",
        action="store_true",
        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=5e-6,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--scale_lr",
        action="store_true",
        default=False,
        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
    )
    parser.add_argument(
        "--lr_scheduler",
        type=str,
        default="constant",
        help=(
            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
            ' "constant", "constant_with_warmup"]'
        ),
    )
    parser.add_argument(
        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
    )
    parser.add_argument(
        "--lr_num_cycles",
        type=int,
        default=1,
        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
    )
    parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.")
    parser.add_argument(
        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
    )
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
    parser.add_argument(
        "--hub_model_id",
        type=str,
        default=None,
        help="The name of the repository to keep in sync with the local `output_dir`.",
    )
    parser.add_argument(
        "--logging_dir",
        type=str,
        default="logs",
        help=(
            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
        ),
    )
    parser.add_argument(
        "--allow_tf32",
        action="store_true",
        help=(
            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
        ),
    )
    parser.add_argument(
        "--project_name",
        type=str,
        default=None,
        help=("The project name for log tracking"),
    )
    parser.add_argument(
        "--run_name",
        type=str,
        default=None,
        help=("The run name for log tracking"),
    )
    parser.add_argument(
        "--report_to",
        type=str,
        default="wandb",
        help=(
            'The integration to report the results and logs to. Supported platforms are `"wandb"`'
            ' (default), `"tensorboard"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
        ),
    )
    parser.add_argument(
        "--wandb_key",
        type=str,
        default=None,
        help=("If report to option is set to wandb, api-key for wandb used for login to wandb "),
    )
    parser.add_argument(
        "--wandb_project_name",
        type=str,
        default=None,
        help=("If report to option is set to wandb, project name in wandb for log tracking "),
    )
    parser.add_argument(
        "--wandb_run_name",
        type=str,
        default=None,
        # Previously a verbatim copy of the --wandb_project_name help text.
        help=("If report to option is set to wandb, run name in wandb for log tracking "),
    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16"],
        help=(
            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
            " 1.10 and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
        ),
    )
    parser.add_argument(
        "--prior_generation_precision",
        type=str,
        default=None,
        choices=["no", "fp32", "fp16", "bf16"],
        help=(
            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
            " 1.10 and an Nvidia Ampere GPU.  Default to fp16 if a GPU is available else fp32."
        ),
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument(
        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
    )

    if input_args is not None:
        args = parser.parse_args(input_args)
    else:
        args = parser.parse_args()

    # The LOCAL_RANK environment variable, when set, wins over --local_rank.
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank

    # Sanity checks
    if args.with_prior_preservation:
        if args.class_data_dir is None:
            raise ValueError("You must specify a data directory for class images.")
        if args.class_prompt is None:
            raise ValueError("You must specify prompt for class images.")
    else:
        # logger is not available yet, so fall back to `warnings`;
        # stacklevel=2 attributes the warning to the caller of parse_args.
        if args.class_data_dir is not None:
            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.", stacklevel=2)
        if args.class_prompt is not None:
            warnings.warn("You need not use --class_prompt without --with_prior_preservation.", stacklevel=2)

    return args
+ """ + + def __init__( + self, + instance_data_root, + instance_prompt, + tokenizer, + class_data_root=None, + class_prompt=None, + size=512, + center_crop=False, + ): + self.size = size + self.center_crop = center_crop + self.tokenizer = tokenizer + + self.instance_data_root = Path(instance_data_root) + if not self.instance_data_root.exists(): + raise ValueError("Instance images root doesn't exists.") + + self.instance_images_path = list(Path(instance_data_root).iterdir()) + self.num_instance_images = len(self.instance_images_path) + self.instance_prompt = instance_prompt + self._length = self.num_instance_images + + if class_data_root is not None: + self.class_data_root = Path(class_data_root) + self.class_data_root.mkdir(parents=True, exist_ok=True) + self.class_images_path = list(self.class_data_root.iterdir()) + self.num_class_images = len(self.class_images_path) + self._length = max(self.num_class_images, self.num_instance_images) + self.class_prompt = class_prompt + else: + self.class_data_root = None + + self.image_transforms = transforms.Compose( + [ + transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def __len__(self): + return self._length + + def __getitem__(self, index): + example = {} + instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + example["instance_images"] = self.image_transforms(instance_image) + example["instance_prompt_ids"] = self.tokenizer( + self.instance_prompt, + truncation=True, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids + + if self.class_data_root: + class_image = Image.open(self.class_images_path[index % self.num_class_images]) + if not class_image.mode == "RGB": + 
def collate_fn(examples, with_prior_preservation=False):
    """Collate DreamBooth samples into a single training batch.

    Args:
        examples: Sequence of per-sample dicts holding ``"instance_prompt_ids"`` and
            ``"instance_images"`` tensors, plus ``"class_prompt_ids"`` and
            ``"class_images"`` when prior preservation is enabled.
        with_prior_preservation: When true, the class entries of every sample are
            appended after all instance entries, so instance and class data go
            through the model in one forward pass instead of two.

    Returns:
        A dict with ``"input_ids"`` (prompt ids concatenated along dim 0) and
        ``"pixel_values"`` (images stacked into one contiguous float tensor).
    """
    # Instance data always comes first; class data is only added on request.
    prompt_keys = ["instance_prompt_ids"]
    image_keys = ["instance_images"]
    if with_prior_preservation:
        prompt_keys.append("class_prompt_ids")
        image_keys.append("class_images")

    # Key-major order: all instance entries, then all class entries.
    token_chunks = [sample[key] for key in prompt_keys for sample in examples]
    image_tensors = [sample[key] for key in image_keys for sample in examples]

    stacked_images = torch.stack(image_tensors).to(memory_format=torch.contiguous_format).float()

    return {
        "input_ids": torch.cat(token_chunks, dim=0),
        "pixel_values": stacked_images,
    }
class TorchTracemalloc:
    """Context manager that measures memory used by the enclosed block.

    On entry it clears the accelerator cache, resets the accelerator's peak-memory
    counter, records the currently allocated accelerator memory and the process
    RSS, and starts a background thread that keeps sampling RSS so the CPU peak
    is not missed.

    On exit it stops the sampler and stores the results on the instance:

    * ``begin`` / ``end`` / ``peak``: accelerator bytes allocated at entry, at
      exit, and the maximum reported by ``max_memory_allocated()``.
    * ``used`` / ``peaked``: ``end - begin`` and ``peak - begin`` passed through
      ``b2mb``.
    * ``cpu_begin`` / ``cpu_end`` / ``cpu_peak``: process RSS in bytes at entry,
      at exit, and the largest value sampled in between.
    * ``cpu_used`` / ``cpu_peaked``: the RSS deltas passed through ``b2mb``.

    Exceptions raised inside the ``with`` body are not suppressed.
    """

    def __enter__(self):
        # ``torch.accelerator`` is missing on older PyTorch releases, and
        # ``current_accelerator()`` returns ``None`` when no accelerator is
        # available; use the "cuda" fallback in both cases instead of failing
        # on ``None.type``.
        accelerator = torch.accelerator.current_accelerator() if hasattr(torch, "accelerator") else None
        self.device_type = accelerator.type if accelerator is not None else "cuda"
        self.device_module = getattr(torch, self.device_type, torch.cuda)

        gc.collect()
        self.device_module.empty_cache()
        self.device_module.reset_peak_memory_stats()  # reset the peak gauge to zero
        self.begin = self.device_module.memory_allocated()
        self.process = psutil.Process()

        self.cpu_begin = self.cpu_mem_used()
        self.peak_monitoring = True
        # Keep a handle on the sampler so ``__exit__`` can wait for it to finish.
        self._peak_monitor_thread = threading.Thread(target=self.peak_monitor_func, daemon=True)
        self._peak_monitor_thread.start()
        return self

    def cpu_mem_used(self):
        """Return the resident set size (RSS) of the current process, in bytes."""
        return self.process.memory_info().rss

    def peak_monitor_func(self):
        """Sample RSS in a tight loop until ``peak_monitoring`` is cleared.

        The loop always takes at least one sample before it checks the stop flag,
        so ``cpu_peak`` holds a real measurement once this function returns.
        """
        self.cpu_peak = -1

        while True:
            self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak)

            # can't sleep or will not catch the peak right (this comment is here on purpose)
            # time.sleep(0.001) # 1msec

            if not self.peak_monitoring:
                break

    def __exit__(self, *exc):
        # Stop the sampler and wait for it: without the join, ``cpu_peak`` could be
        # read before the thread took its first sample or while it is still writing.
        self.peak_monitoring = False
        self._peak_monitor_thread.join()

        gc.collect()
        self.device_module.empty_cache()
        self.end = self.device_module.memory_allocated()
        self.peak = self.device_module.max_memory_allocated()
        self.used = b2mb(self.end - self.begin)
        self.peaked = b2mb(self.peak - self.begin)

        self.cpu_end = self.cpu_mem_used()
        self.cpu_used = b2mb(self.cpu_end - self.cpu_begin)
        self.cpu_peaked = b2mb(self.cpu_peak - self.cpu_begin)
+ +## PoolFormer model from timm + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/peft/blob/main/examples/image_classification/image_classification_timm_peft_lora.ipynb) + +The notebook `image_classification_timm_peft_lora.ipynb` showcases fine-tuning an image classification model from the [timm](https://huggingface.co/docs/timm/index) library. Again, LoRA is used to reduce the number of trainable parameters to a fraction of the total. diff --git a/peft/examples/image_classification/image_classification_peft_lora.ipynb b/peft/examples/image_classification/image_classification_peft_lora.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..02e97d9bcead1053c7359b0fbe5c2f49a3c1e5e3 --- /dev/null +++ b/peft/examples/image_classification/image_classification_peft_lora.ipynb @@ -0,0 +1,14951 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "71GTxOD71mEn" + }, + "source": [ + "## Introduction\n", + "\n", + "In this notebook, we will learn how to use [LoRA](https://huggingface.co/papers/2106.09685) from 🤗 PEFT to fine-tune an image classification model by ONLY using **0.77%** of the original trainable parameters of the model. \n", + "\n", + "LoRA adds low-rank \"update matrices\" to certain blocks in the underlying model (in this case the attention blocks) and ONLY trains those matrices during fine-tuning. During inference, these update matrices are _merged_ with the original model parameters. For more details, check out the [original LoRA paper](https://huggingface.co/papers/2106.09685). \n", + "\n", + "Let's get started by installing the dependencies. 
\n", + "\n", + "__*Note that this notebook builds on top the [official image classification example notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb).*__" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0a_bETbqv4P7" + }, + "source": [ + "## Install dependencies\n", + "\n", + "Here we're installing `peft` from source to ensure we have access to all the bleeding edge features of `peft`. " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Z0_5BYt8hobv", + "outputId": "aafcbc39-b972-493a-8922-2141b1621926" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.3/6.3 MB\u001b[0m \u001b[31m53.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 KB\u001b[0m \u001b[31m24.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.4/81.4 KB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m462.8/462.8 KB\u001b[0m \u001b[31m46.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m190.3/190.3 KB\u001b[0m \u001b[31m23.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.6/7.6 MB\u001b[0m \u001b[31m102.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m213.0/213.0 KB\u001b[0m \u001b[31m25.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.0/132.0 KB\u001b[0m \u001b[31m15.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.3/76.3 MB\u001b[0m \u001b[31m23.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m140.6/140.6 KB\u001b[0m \u001b[31m20.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for peft (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "!pip install transformers accelerate evaluate datasets git+https://github.com/huggingface/peft -q" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y8dSVHoIv7HC" + }, + "source": [ + "## Authentication\n", + "\n", + "We will share our fine-tuned model at the end of training. So, to do that we just authenticate using our 🤗 token. This token is available from [here](https://huggingface.co/settings/tokens). If you don't have a 🤗 account already, we highly encourage you to do so; it's free!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 359, + "referenced_widgets": [ + "5d2f5fb454bc4c16b520e4e96381758f", + "dfd2baceac524fe29c0f4a8443b60a71", + "90d8e83a6af54184a82e0b81ae7054b9", + "1f96ca356b6f41b59275abe93df33f43", + "eef81e9bea0c4f5d85e7efa8ebe0463a", + "cab6d36980c0423fb75299c09c33facc", + "dd38a658218d42d7b051c66de4d4180a", + "f34be236ef9c42448ecf2957160990f7", + "38deee504dab482983a8b8f340472282", + "b2688e34899a449e8d1f6ddb5a66bb85", + "dd4edb4de5e14dfbbee418dba0bb3573", + "516c6d75bc654d62b95ac235ce84c59c", + "14c23f636609458ca4493854826c1a8e", + "c778798c234d45b5a4ae2f250e3706f9", + "d5c5396ea2f54ff0aeb9be58b59c253b", + "15bd2dcdbf4b4e74b9db09bdb8822e61", + "ecf73dd75420460399bfd04d8cd81f90" + ] + }, + "id": "31Zv6rFYr37d", + "outputId": "6476ebcf-6d71-4b7d-ee38-dc4f8e8d024e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Token is valid.\n", + "Your token has been saved in your configured git credential helpers (store).\n", + "Your token has been saved to /root/.cache/huggingface/token\n", + "Login successful\n" + ] + } + ], + "source": [ + "from huggingface_hub import notebook_login\n", + "\n", + "notebook_login()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AX7aJaIKjbCF" + }, + "source": [ + "## Check the library versions" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ejkn8GBzh_DB", + "outputId": "777afbdf-e026-43d8-8efa-80bb958d0ca3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "===================================BUG REPORT===================================\n", + "Welcome to bitsandbytes. 
For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n", + "================================================================================\n" + ] + } + ], + "source": [ + "import transformers\n", + "import accelerate\n", + "import peft" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "A833xxo3ir28", + "outputId": "da71ef1c-b6d7-43e2-a78b-23556785ef02" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Transformers version: 4.26.0\n", + "Accelerate version: 0.16.0\n", + "PEFT version: 0.1.0.dev0\n" + ] + } + ], + "source": [ + "print(f\"Transformers version: {transformers.__version__}\")\n", + "print(f\"Accelerate version: {accelerate.__version__}\")\n", + "print(f\"PEFT version: {peft.__version__}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Po1Ve9u5v_Ul" + }, + "source": [ + "## Select a model checkpoint to fine-tune" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "vhvCQpP-isJr" + }, + "outputs": [], + "source": [ + "model_checkpoint = \"google/vit-base-patch16-224-in21k\" # pre-trained model from which to fine-tune" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UKN3rMAsjgEz" + }, + "source": [ + "## Load a dataset\n", + "\n", + "We're only loading the first 5000 instances from the training set of the [Food-101 dataset](https://huggingface.co/datasets/food101) to keep this example runtime short. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 379, + "referenced_widgets": [ + "61b957d3b51643f78a921979072fe3b6", + "d7136a7b3d0040d580508fc665b9fb00", + "5ee5e11191fc46dd92d4c2f1a7d6d9da", + "3587d42fa09b4fcdb365956a9bb07c77", + "c1ed0b68884c4d4291cd67c0e685ef18", + "9102cc38ee9942ac91dc66eda069ddcb", + "416c65eedcea4a6ea69dae317de79bca", + "128677e1b5b14e63b06b0f81c9cc4df0", + "22da54e68b1d48f9b3ba55ac1ca56873", + "16c7db587b8e475fa3aa9677385b092a", + "23c608994006427caca7975e0d81271f", + "71f7296ec9be4d9abe1af581722b40fe", + "b98e53eefc1944f193169c4f7a72b799", + "1d4a5a5b7d1645a8bf8133935e173082", + "d29e3b9102f14f3385e47ae6e27d1ab1", + "1e3e374b08964a689cfaac9c826f207b", + "b377e94780fb4e1db3b9678717e04fc1", + "86103f87819b440b8464f4460f50375e", + "c3178221dc074657bc0e585c4cfe326d", + "3c6842e0158b4dcc9b93eddfe3279d2a", + "fc612aaed5644b84959a1958b0240dda", + "36c8300bcbb84627a03b94f0eea86ce9", + "f0b0cad40fbd461ca7bdcdbb5f442f57", + "76cf84387a7c43608ad018188eef4114", + "68ef0c8550ee4c00aa8b284d48572610", + "58e7f5c36d8b4836a868ce89838f1896", + "9b216287b8694bcc9960a356adf15504", + "4d653faaedcd497d863bbf2c429ce925", + "d10f2e9c25f2417f9728aa8e43acf677", + "7a35d0ddc2da4dd69068214b87bcdd7f", + "9aac38a8c8694c67a34b2fced0e1a706", + "9d1dff20634a403fb8829469d74301aa", + "b2e64f35be2d4fa3bc95c769b78e1dd1", + "fdf282b234fe4a1a8ab452ac04511b7d", + "59792e1ee7074f998d5d4494c09061c6", + "cd5b2433cc404ac7b1bb35c6a55f6874", + "7c1b6f271fff4d60be39d291c73bfb75", + "074f38bd3a9d49719188e8860fb1b5d3", + "3ff84efe0edc491c884898424be4ae71", + "5a79a196fd7b49128a9647347f85b364", + "851fb5ac25db4bb287a6dbe948278eec", + "471d44c8e49e42b89302ef53ab0eb316", + "4089323832d04dc2a40e238b5fa256cc", + "2d867c65533e482a96db93bc5a09b8cb", + "849ac914c3cc49d29d619dd4f532d74c", + "4cb3d75f80434b48beb6aa4b07c86dfe", + "ff39519704b64e68b69ec06aea02791e", + "e0f2599ed04c424f896e503630034e84", 
+ "1674d568877048368c842c21ffaac811", + "0957e36be17c43fd89462c5d5ddcec1b", + "dffe636233c84dcd9d75f34baf40fa1d", + "dcefc9ba538e4da2b75f9372a4c5b5bf", + "77df794cb4e4491e80ee20bbd2801a89", + "7e243f4a30c645b080e688fb706b4548", + "db6b68a237cf4e93ae6383448b773e47", + "c580d3a6e99e48fab09b3ce799711802", + "4afba780d0f244548a7f28db15b41dc9", + "4e3d482feec9485590d277dfc1d0b3d3", + "23436ea247dd43d8829ca143a49637c5", + "9609eaf0792345b2ab457cb7188ee14a", + "1839e4ed1d3c4975b34c3c050052693f", + "1b31bfb0ef4c404698eb2205414170af", + "6350637718344d65a757d2919de8c1ab", + "42a16474e41343b2a7d46e5930b41b89", + "ce16ac2b3ff244e6bd7dd58daa9f4f7f", + "d25f3ebb577749d89e2e6d2a72f6ca5f", + "13279e67c4d847e4846e2d34e8aac589", + "d7d43177c750412cb1522eb08c01d2d9", + "70b04a3579a5446f94acd422c70ac50a", + "43940212a87d410c82cc9cd15f38a97e", + "19fd7c60287b43bbb6e0b12c25b4b375", + "9013fd35e17f44bfb7a068833adaf167", + "a849dcc9c7f742d49c874597d8c693c5", + "039dd9a4b99e433088a0acd8ba7b519b", + "17235d013b7c4cee996d0bbc1cc6c70c", + "3db63ba25e7349a785244c367d53813e", + "4748461200ae4af883577e2fbb8cb686", + "19ec79f5a5174aa3b26861a9662951d3", + "57c15c64c2374f06a1e0a36bab953ef7", + "06cf9f29b929412a8092044e25861f1c", + "c2032e5054ac4604832957cb6e2e69ca", + "d1ff50e1b871429a85df8cf10e73ffb1", + "10c4f5677d1c4af8b3370b7fb1255065", + "603dd1541db345879295edc16ace2b0c", + "375ac7a15cea4ce3aa484a806cc82717", + "6b6459f123ef4f24a550cd9ec3c9f809", + "e3047557ae7f40e2aecccf1afad36f3f", + "4fc212af0c9b45ebbe334e3dd7f11b59", + "fee4fba960ac41ed97984467da41f319", + "ee103846621b4c0e8e1266599b99f6ee", + "dcd1c1f4fc014c4aa9ebdaf3c533a061", + "a29d758fb7f147c7ad1108f140caf23a", + "cb52fa97c659430a8bd71dcd76245a7f", + "e7144551e74b46529b00a61f580a183d", + "9b1bfa11ee3746c38155c4505abfaa86", + "26520bc6555d41d9951ea0219dc4b5d7", + "60472b5a360f43e89e39d641dabba57b", + "aa9b6ac2785c4a5abd1189edd60698eb", + "cfab815edc1f42898b656c0f4a3b366b", + "c5718d031b9942f4b8bf331a8543db29", + 
"35d862a4f00c4493920da3e2eb92b043", + "16b464f168d844cba5eb0c91ab4fb91c", + "af5231ecf6e2489b80cdcd435b5e3451", + "62a0f83cf75d4c59a0601c5ad3a817a7", + "b48f685dc91540f38690f39eace724d5", + "ce4b6a4b6fec4ceb907fa436ff940bd2", + "28f82c8fc9cf46c7858132a77e45834b", + "ce18faf7b68140a3a8247330b356e05b", + "af6a4a054a5d451b9fe256bf60a09c21", + "afb1f0681bce47e1ba718900d0430f34" + ] + }, + "id": "rI0d2_liitUr", + "outputId": "4ae986eb-6cbb-4d9f-bb99-1ffbb05ee835" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "61b957d3b51643f78a921979072fe3b6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading builder script: 0%| | 0.00/6.21k [00:00\n", + " \n", + " \n", + " [45/45 04:44, Epoch 5/5]\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EpochTraining LossValidation LossAccuracy
1No log0.5068710.896000
22.1627000.1891410.946000
30.3451000.1447590.960000
40.2116000.1508860.958000
50.1711000.1497510.958000

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "***** Running Evaluation *****\n", + " Num examples = 500\n", + " Batch size = 128\n", + "Saving model checkpoint to vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-9\n", + "Configuration saved in vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-9/config.json\n", + "Model weights saved in vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-9/pytorch_model.bin\n", + "Image processor saved in vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-9/preprocessor_config.json\n", + "Image processor saved in vit-base-patch16-224-in21k-finetuned-lora-food101/preprocessor_config.json\n", + "***** Running Evaluation *****\n", + " Num examples = 500\n", + " Batch size = 128\n", + "Saving model checkpoint to vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-18\n", + "Configuration saved in vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-18/config.json\n", + "Model weights saved in vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-18/pytorch_model.bin\n", + "Image processor saved in vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-18/preprocessor_config.json\n", + "Image processor saved in vit-base-patch16-224-in21k-finetuned-lora-food101/preprocessor_config.json\n", + "***** Running Evaluation *****\n", + " Num examples = 500\n", + " Batch size = 128\n", + "Saving model checkpoint to vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-27\n", + "Configuration saved in vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-27/config.json\n", + "Model weights saved in vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-27/pytorch_model.bin\n", + "Image processor saved in vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-27/preprocessor_config.json\n", + "Image processor saved in 
vit-base-patch16-224-in21k-finetuned-lora-food101/preprocessor_config.json\n", + "***** Running Evaluation *****\n", + " Num examples = 500\n", + " Batch size = 128\n", + "Saving model checkpoint to vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-36\n", + "Configuration saved in vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-36/config.json\n", + "Model weights saved in vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-36/pytorch_model.bin\n", + "Image processor saved in vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-36/preprocessor_config.json\n", + "Image processor saved in vit-base-patch16-224-in21k-finetuned-lora-food101/preprocessor_config.json\n", + "***** Running Evaluation *****\n", + " Num examples = 500\n", + " Batch size = 128\n", + "Saving model checkpoint to vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-45\n", + "Configuration saved in vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-45/config.json\n", + "Model weights saved in vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-45/pytorch_model.bin\n", + "Image processor saved in vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-45/preprocessor_config.json\n", + "Image processor saved in vit-base-patch16-224-in21k-finetuned-lora-food101/preprocessor_config.json\n", + "\n", + "\n", + "Training completed. 
Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "Loading best model from vit-base-patch16-224-in21k-finetuned-lora-food101/checkpoint-27 (score: 0.96).\n" + ] + } + ], + "source": [ + "trainer = Trainer(\n", + " lora_model,\n", + " args,\n", + " train_dataset=train_ds,\n", + " eval_dataset=val_ds,\n", + " processing_class=image_processor,\n", + " compute_metrics=compute_metrics,\n", + " data_collator=collate_fn,\n", + ")\n", + "train_results = trainer.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b2NENHxHCejv" + }, + "source": [ + "In just a few minutes, we have a fine-tuned model with 96% validation accuracy. Also, note that we used a very small subset of the training dataset which is definitely impacting the results. " + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 198 + }, + "id": "_MAd2906jQKG", + "outputId": "7b825531-4e8d-4666-d6fc-eaa1accf3bb6" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "***** Running Evaluation *****\n", + " Num examples = 500\n", + " Batch size = 128\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [4/4 01:48]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "{'eval_loss': 0.14475855231285095,\n", + " 'eval_accuracy': 0.96,\n", + " 'eval_runtime': 3.5725,\n", + " 'eval_samples_per_second': 139.958,\n", + " 'eval_steps_per_second': 1.12,\n", + " 'epoch': 5.0}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trainer.evaluate(val_ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qo_scDEyAQER" + }, + "source": [ + "## Sharing your model and inference \n", + "\n", + "Once the fine-tuning is done, we can share the LoRA parameters with the community like so: " + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 172, + "referenced_widgets": [ + "980d2f61332f414d9888b76e78774be4", + "cbc0ba8e49a740fcae7b94fe7edb8107", + "b3f07ef160a7425880ffe362008d4400", + "49ed3330fa9645eda8b5aed0fd7cbafe", + "f96302d0c2d849c5b5a0206b65e461ab", + "fcc7ad16a0b14d96acd9be9e03ac6af9", + "1a08961f063346ccae206a863ab7df6b", + "1150e391e753424da7d65bda10463da4", + "e6e36d744e1244aeb7eb0c4ce392372d", + "b2ad992db5a045668fa55c7393ec7870", + "b5fad0f3f2d543ecaed726e52d2d86bb", + "e83fd078f467406da0baf26e18b39e89", + "9b4b67731a7a4bc59be132b53c24eae8", + "e33243b001274d02a25f5940ba41ecf6", + "06e4c619e366427a8ff4c358196ecd12", + "bacd429b42d843299cb75224db3afb1e", + "c53429e699e64b3d8895a355bbd947a6", + "2bfd04824f2e4fd6844dd38e46dbbbdf", + "40a5a50aeca24f0d8990da97971004d1", + "be71e6438e0e49759d2f72feec520cae", + "1902fbc4c1da4ffe90f0947e58eb5d48", + "12a94351242444d7b0d23b8accc1824a" + ] + }, + "id": "TyQvIcnFzLIV", + "outputId": "7ac2819e-080e-4940-9755-15c32832d9a6" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Uploading the following files to 
sayakpaul/vit-base-patch16-224-in21k-finetuned-lora-food101: adapter_config.json,adapter_model.bin\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "980d2f61332f414d9888b76e78774be4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Upload 1 LFS files: 0%| | 0/1 [00:00" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from PIL import Image\n", + "import requests\n", + "\n", + "url = \"https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/beignets.jpeg\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + "image" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dFuqZgmCW4cu" + }, + "source": [ + "We first instantiate an `image_processor` from the underlying model repo. " + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dN4x_pj-VQx8", + "outputId": "b4fddd49-2f10-48e2-de31-0cefd20d405d" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--sayakpaul--vit-base-patch16-224-in21k-finetuned-lora-food101/snapshots/fa2503cc7d91e0dd69728c1dc66ed80d7bd3289b/preprocessor_config.json\n", + "Image processor ViTImageProcessor {\n", + " \"do_normalize\": true,\n", + " \"do_rescale\": true,\n", + " \"do_resize\": true,\n", + " \"image_mean\": [\n", + " 0.5,\n", + " 0.5,\n", + " 0.5\n", + " ],\n", + " \"image_processor_type\": \"ViTImageProcessor\",\n", + " \"image_std\": [\n", + " 0.5,\n", + " 0.5,\n", + " 0.5\n", + " ],\n", + " \"resample\": 2,\n", + " \"rescale_factor\": 0.00392156862745098,\n", + " \"size\": {\n", + " \"height\": 224,\n", + " \"width\": 224\n", + " }\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "image_processor = AutoImageProcessor.from_pretrained(repo_name)" + 
] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Dc0rCwC5XAaL" + }, + "source": [ + "We then prepare the sample for inference." + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "57C6tcdnVYu1", + "outputId": "f164dd91-8679-482e-fb10-2f530d4ea9f4" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 3, 224, 224])\n" + ] + } + ], + "source": [ + "# prepare image for the model\n", + "encoding = image_processor(image.convert(\"RGB\"), return_tensors=\"pt\")\n", + "print(encoding.pixel_values.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Pn4T1GyTXC47" + }, + "source": [ + "And run inference!" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YyQznW5WViMc", + "outputId": "d1b7a77c-68b3-4f6b-a945-4b32e85baabe" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predicted class: beignets\n" + ] + } + ], + "source": [ + "import torch\n", + "\n", + "# forward pass\n", + "with torch.no_grad():\n", + " outputs = inference_model(**encoding)\n", + " logits = outputs.logits\n", + "\n", + "predicted_class_idx = logits.argmax(-1).item()\n", + "print(\"Predicted class:\", inference_model.config.id2label[predicted_class_idx])" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [ + "0a_bETbqv4P7", + "Y8dSVHoIv7HC", + "qo_scDEyAQER" + ], + "machine_shape": "hm", + "provenance": [] + }, + "gpuClass": "premium", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.8.2" + }, + "vscode": { + "interpreter": { + "hash": "62ba1781de76fc6672ab4d41176558d38a2895b3007f2161f5f79f77fdcaf8cf" + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "02a0b01d31a34a1c924786037fecba09": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "02ac19466e24404a92e769ed60604881": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "039dd9a4b99e433088a0acd8ba7b519b": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "041f73c9a038411aa6d59cf8a93f6d47": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_de92f68231294aefb249f400475bc9a4", + "placeholder": "​", + "style": "IPY_MODEL_f981fb4aae504045aa10889dceeb6cac", + "value": " 357/357 [01:17<?, ?B/s]" + } + }, + "0465571b25714ecda9dfe6ff1a495a87": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": 
"LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "06cf9f29b929412a8092044e25861f1c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_375ac7a15cea4ce3aa484a806cc82717", + "max": 489429, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6b6459f123ef4f24a550cd9ec3c9f809", + "value": 489429 + } + }, + "06e4c619e366427a8ff4c358196ecd12": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + 
"state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1902fbc4c1da4ffe90f0947e58eb5d48", + "placeholder": "​", + "style": "IPY_MODEL_12a94351242444d7b0d23b8accc1824a", + "value": " 2.69M/2.69M [00:00<00:00, 2.27MB/s]" + } + }, + "070a734e268045098977db14c6565777": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ad2068dd9c2040f6ae44bc873fa7b6e7", + "placeholder": "​", + "style": "IPY_MODEL_9434b43de9954c83a4311432bdd68376", + "value": " 330M/330M [00:39<00:00, 11.7MB/s]" + } + }, + "070fdaf418de43a3a5ee0592e8aca103": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": 
null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "074f38bd3a9d49719188e8860fb1b5d3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "07f5e653fe6740e8a71fb9de101884f3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", 
+ "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0957e36be17c43fd89462c5d5ddcec1b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "098770d5a36540dea54d27d7fa9bcd56": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5da11a8c37ae41458ea4491ccdfb4db8", + "IPY_MODEL_23ec20ee5f0e48be9470415810cd0b4b", + 
"IPY_MODEL_e88c3ad56ef24e4d8281898b08ff6f4b" + ], + "layout": "IPY_MODEL_c5670295387a4c199571a2a21a6b69dc" + } + }, + "0a0e75829d6c4031bc917ac2044d9e47": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0b82dbc29d514f4e9e012fd755948e52": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_aa68207e72b0467cb9a4354dc231db2f", + "placeholder": "​", + "style": "IPY_MODEL_60e6952873524186aad05661a00bd240", + "value": 
"Download file runs/Feb07_02-50-30_319afa680fd7/events.out.tfevents.1675738403.319afa680fd7.10047.2: 100%" + } + }, + "0ba38362cf8647c08b0beb21a2c39442": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0bc5c81047994f5b976a927b8ed47cbc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0cd35b1092064f42908ce4123b79a8af": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + 
"state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0d75dd458e3448a58ea4e19c28e787c0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "10c4f5677d1c4af8b3370b7fb1255065": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, 
+ "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1150e391e753424da7d65bda10463da4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "128677e1b5b14e63b06b0f81c9cc4df0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "12a94351242444d7b0d23b8accc1824a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "13279e67c4d847e4846e2d34e8aac589": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d7d43177c750412cb1522eb08c01d2d9", + "IPY_MODEL_70b04a3579a5446f94acd422c70ac50a", + "IPY_MODEL_43940212a87d410c82cc9cd15f38a97e" + ], + "layout": "IPY_MODEL_19fd7c60287b43bbb6e0b12c25b4b375" + } + }, + "136a56e1f70c431bae0a3ac01751a814": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "138198ec50a9494889319d6c94da92bd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "148c9912cec5473bb6f8533add143cd3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "14c23f636609458ca4493854826c1a8e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "15bd2dcdbf4b4e74b9db09bdb8822e61": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1639075b181f4945ac32af116b22d1d7": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cf024daa51f74777b98028df10dbc9c5", + "placeholder": "​", + "style": "IPY_MODEL_eaf2c76a172d4da6846c6face18a3b58", + "value": " 9.99k/9.99k [01:17<?, ?B/s]" + } + }, + "16620105b32f434eb77b0df56ed49e45": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_edb0d1ba5e114af9b6705969f58ece7b", + "placeholder": "​", + "style": "IPY_MODEL_1954a636239b40169659e2ae8ef3b127", + "value": " 9.99k/9.99k [01:17<00:00, 119B/s]" + } + }, + "1674d568877048368c842c21ffaac811": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": 
null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "16b464f168d844cba5eb0c91ab4fb91c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_28f82c8fc9cf46c7858132a77e45834b", + "max": 25250, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ce18faf7b68140a3a8247330b356e05b", + "value": 25250 + } + }, + "16c7db587b8e475fa3aa9677385b092a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "17235d013b7c4cee996d0bbc1cc6c70c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "1839e4ed1d3c4975b34c3c050052693f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1902fbc4c1da4ffe90f0947e58eb5d48": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "194dd0bcc350480c9ddd3e4ef17efc3a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + 
"1954a636239b40169659e2ae8ef3b127": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "19ec79f5a5174aa3b26861a9662951d3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_57c15c64c2374f06a1e0a36bab953ef7", + "IPY_MODEL_06cf9f29b929412a8092044e25861f1c", + "IPY_MODEL_c2032e5054ac4604832957cb6e2e69ca" + ], + "layout": "IPY_MODEL_d1ff50e1b871429a85df8cf10e73ffb1" + } + }, + "19fd7c60287b43bbb6e0b12c25b4b375": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + 
"justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1a08961f063346ccae206a863ab7df6b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1a4ab138be9940f081514b914fdc4623": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8b9f5bca0898404b91032befbd019fa3", + "placeholder": "​", + "style": "IPY_MODEL_e4694cffcb574863a255e9022c8ddf5d", + "value": "Downloading (…)lve/main/config.json: 100%" + } + }, + "1b31bfb0ef4c404698eb2205414170af": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + 
"description_width": "" + } + }, + "1c208beced884b9291c5bcb7b4f71680": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_63c07a01593f467f9c0e7c5e283d58ae", + "placeholder": "​", + "style": "IPY_MODEL_972f831792cd4e89af109462dd5b9210", + "value": " 9.99k/9.99k [01:17<?, ?B/s]" + } + }, + "1d4a5a5b7d1645a8bf8133935e173082": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c3178221dc074657bc0e585c4cfe326d", + "max": 5560, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3c6842e0158b4dcc9b93eddfe3279d2a", + "value": 5560 + } + }, + "1e3e374b08964a689cfaac9c826f207b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": 
null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1f8c65025b63466192897a32a92182e9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1f96ca356b6f41b59275abe93df33f43": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "CheckboxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "Add token as git credential?", + "description_tooltip": null, + "disabled": false, + "indent": true, + "layout": "IPY_MODEL_516c6d75bc654d62b95ac235ce84c59c", + "style": "IPY_MODEL_14c23f636609458ca4493854826c1a8e", + "value": true + } + }, + "1fdc59cbb8724c618ce6e586e2c9723f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "21c75049df804ac4ac7bc6349a639056": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5fd1cd8bf125446a96b9438fbbe52710", + "max": 357, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_eb864284052c46b28b93fc79bfed740f", + "value": 357 + } + }, + "223a13f77e2e49a09660890eb4213b30": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_47cf3db935ba4e109843b03a9577c184", + "max": 10227, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_be1ec4b9b8964810867b0e00bcc4868f", + "value": 10227 + } + }, + "22da54e68b1d48f9b3ba55ac1ca56873": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "23436ea247dd43d8829ca143a49637c5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ce16ac2b3ff244e6bd7dd58daa9f4f7f", + "placeholder": "​", + "style": "IPY_MODEL_d25f3ebb577749d89e2e6d2a72f6ca5f", + "value": " 2/2 [00:01<00:00, 1.95it/s]" + } + }, + "236638d673934823828ee57face78184": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_920293e203f14b45b61233e1bb6f1214", + "placeholder": "​", + "style": "IPY_MODEL_a1981bfcdb6d401e9a521e18b511cf9d", + "value": " 502/502 [00:00<00:00, 27.9kB/s]" + } + }, + "23c608994006427caca7975e0d81271f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "23ec20ee5f0e48be9470415810cd0b4b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d7c394bc6a3249e9b3fcbae2ebd25eb7", + "max": 5777, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_fea27a80cd2f4b4dba84ecdfefd2722c", + "value": 5777 + } + }, + "245c5418ca084fb6bc0b027576a1f789": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_431174d906f640baa17842fdb3a8714b", + "placeholder": "​", + "style": "IPY_MODEL_638b918aaacc4c4782b9e16ca66549e8", + "value": "Clean file runs/Feb07_03-56-51_319afa680fd7/events.out.tfevents.1675742272.319afa680fd7.27769.0: 100%" + } + }, + "24b3737dc76c4d4f9ba2603c653a3ce2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_397dc640630841d7845bf5a8739ce5eb", + "placeholder": "​", + "style": "IPY_MODEL_07f5e653fe6740e8a71fb9de101884f3", + "value": " 5.64k/5.64k [01:17<00:00, 61.5B/s]" + } + }, + "260fdda06c214ed499f69fac4077d476": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { 
+ "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "26520bc6555d41d9951ea0219dc4b5d7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + 
"margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "28f82c8fc9cf46c7858132a77e45834b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "29de968ad50543418c6865fdf003a568": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": 
null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2ab85fcc8de042d0bdb9ca79b8e404a4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_77deda3ef342432f9b0f684a9b32e248", + "IPY_MODEL_f3991aaad13a4c50a7809483b7907b7b", + "IPY_MODEL_eca3b1f4ad76430483a221470e592c13" + ], + "layout": "IPY_MODEL_a1d5bc95f1e24e3293414e08aa5c8bd5" + } + }, + "2ab99fb38f8d4bef85d9833bb628fa00": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + 
"bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2b8bc04ac3104592bf950e349c034c2d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5e51957908eb48489357a7c3924ec5c7", + "max": 3579, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_41320a22032c4884affc456f7c6db1c1", + "value": 3579 + } + }, + "2bfd04824f2e4fd6844dd38e46dbbbdf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2c86eb6c67f44af590937d0f1db09333": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2d13b401dcf94089a4a78a62f05bdce3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_070fdaf418de43a3a5ee0592e8aca103", + "placeholder": "​", + "style": "IPY_MODEL_62818f9421694139bbe1d9ad6e822b10", + "value": "Download file runs/Feb07_02-43-38_319afa680fd7/1675737843.2328734/events.out.tfevents.1675737843.319afa680fd7.7189.1: 100%" + } + }, + "2d867c65533e482a96db93bc5a09b8cb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2d8a53b2a2bf42b9aa9cb2d9978ccee2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2e0bb2dcd85640d7b85d80469ea9f9f3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_136a56e1f70c431bae0a3ac01751a814", + "max": 10230, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_7a3daf19ee744c7b8baeb028db05009a", + "value": 10230 + } + }, + "2e24b7250ee04fbb810e5d6ade107c51": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2e31d27cc694434aa869896041c72bee": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0d75dd458e3448a58ea4e19c28e787c0", + "placeholder": "​", + "style": "IPY_MODEL_0bc5c81047994f5b976a927b8ed47cbc", + "value": " 330M/330M [01:17<00:00, 671kB/s]" + } + }, + "2eae62f1cc46449dba93f5eda0cb3f1c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, 
+ "top": null, + "visibility": null, + "width": null + } + }, + "323eb0d9dade4c4fa3a9ad2b973dcbe1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_484f7a55438c47f59365242c4753edba", + "IPY_MODEL_c5665d0bc652405c8754474871baab06", + "IPY_MODEL_82f96ac9299a4841a550ad3daa0099d0" + ], + "layout": "IPY_MODEL_f031aaf7fbc648a7b8a2e5faf37df14d" + } + }, + "3587d42fa09b4fcdb365956a9bb07c77": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_16c7db587b8e475fa3aa9677385b092a", + "placeholder": "​", + "style": "IPY_MODEL_23c608994006427caca7975e0d81271f", + "value": " 6.21k/6.21k [00:00<00:00, 412kB/s]" + } + }, + "35d862a4f00c4493920da3e2eb92b043": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b48f685dc91540f38690f39eace724d5", + "placeholder": "​", + "style": 
"IPY_MODEL_ce4b6a4b6fec4ceb907fa436ff940bd2", + "value": "Generating validation split: 100%" + } + }, + "36c8300bcbb84627a03b94f0eea86ce9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "375ac7a15cea4ce3aa484a806cc82717": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "38deee504dab482983a8b8f340472282": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "38f30da546444f8199673003d0a92dda": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "397dc640630841d7845bf5a8739ce5eb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, 
+ "3a71257db7bc408d8e4d1fbcaf1dff93": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3c0cacee5997480cbedc0e9d59a62544": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3c33964c8d804600ab5a26d0717c508d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3c6842e0158b4dcc9b93eddfe3279d2a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "3db63ba25e7349a785244c367d53813e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3efccb526dec44bf9801ac13dcc1068d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + 
"bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3f188d6d34774154afc297b13a3eb9e8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"3f6394cb0ea242f28c4ba6b3b2d37e9f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3ff84efe0edc491c884898424be4ae71": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": 
null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4089323832d04dc2a40e238b5fa256cc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "40a5a50aeca24f0d8990da97971004d1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "41320a22032c4884affc456f7c6db1c1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "416c65eedcea4a6ea69dae317de79bca": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "42a16474e41343b2a7d46e5930b41b89": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "431174d906f640baa17842fdb3a8714b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "43940212a87d410c82cc9cd15f38a97e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3db63ba25e7349a785244c367d53813e", + "placeholder": "​", + "style": "IPY_MODEL_4748461200ae4af883577e2fbb8cb686", + "value": " 1.47M/1.47M [00:00<00:00, 1.21MB/s]" + } + }, + "4533e8ce655649cba93553c8a2b17f37": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "471d44c8e49e42b89302ef53ab0eb316": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4748461200ae4af883577e2fbb8cb686": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "47659b15eb284f06bf9735ca2e425646": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "47cf3db935ba4e109843b03a9577c184": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + 
"min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4825c09098e1446a9ed3b653b77894f4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_148c9912cec5473bb6f8533add143cd3", + "placeholder": "​", + "style": "IPY_MODEL_92dfb889fd22439bb7b5fd31e4991c93", + "value": "Clean file runs/Feb07_03-56-51_319afa680fd7/1675742273.001745/events.out.tfevents.1675742273.319afa680fd7.27769.1: 100%" + } + }, + "483b46ed1e8148498d54e4d6f4c0ca8d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": 
null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "484f7a55438c47f59365242c4753edba": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f22598cf4ade4427a1b437fd45aabcc4", + "placeholder": "​", + "style": "IPY_MODEL_0cd35b1092064f42908ce4123b79a8af", + "value": "Downloading (…)"adapter_model.bin";: 100%" + } + }, + "49ed3330fa9645eda8b5aed0fd7cbafe": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b2ad992db5a045668fa55c7393ec7870", + "placeholder": "​", + "style": "IPY_MODEL_b5fad0f3f2d543ecaed726e52d2d86bb", + "value": " 1/1 [00:00<00:00, 1.24it/s]" + } + }, + "4a118fa87e424664a2d2ed7c7f58f3fd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4a3ff00e64b548ce89355778907e48c9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": 
null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4a44332ff1224a19a5f1c18e2b827759": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4afba780d0f244548a7f28db15b41dc9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1839e4ed1d3c4975b34c3c050052693f", + "placeholder": "​", + "style": 
"IPY_MODEL_1b31bfb0ef4c404698eb2205414170af", + "value": "Downloading data files: 100%" + } + }, + "4bb8b2d7000f464ba3ff18ce03fcfef4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4cb3d75f80434b48beb6aa4b07c86dfe": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0957e36be17c43fd89462c5d5ddcec1b", + "placeholder": "​", + "style": "IPY_MODEL_dffe636233c84dcd9d75f34baf40fa1d", + "value": "Computing checksums: 
100%" + } + }, + "4d1f6114d4034f758bf8cc35485e0056": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0ba38362cf8647c08b0beb21a2c39442", + "placeholder": "​", + "style": "IPY_MODEL_65a0aed816c84164a6ee6a41d300fad0", + "value": "Download file runs/Feb07_02-43-38_319afa680fd7/events.out.tfevents.1675737843.319afa680fd7.7189.0: 100%" + } + }, + "4d653faaedcd497d863bbf2c429ce925": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + 
} + }, + "4e3d482feec9485590d277dfc1d0b3d3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6350637718344d65a757d2919de8c1ab", + "max": 2, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_42a16474e41343b2a7d46e5930b41b89", + "value": 2 + } + }, + "4ee1fde44dcf49eda97e1a05173e5bb1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"4fc212af0c9b45ebbe334e3dd7f11b59": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "516c6d75bc654d62b95ac235ce84c59c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "52d00532eeee40aa91e8a5c2a10e50a7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7a65e650113e476a8cb66caa92973dd3", + "placeholder": "​", + "style": "IPY_MODEL_d0c95a20c2664c149886b72fa665d3cf", + "value": "Clean file runs/Feb07_02-50-30_319afa680fd7/events.out.tfevents.1675738403.319afa680fd7.10047.2: 100%" + } + }, + "5634fd283a9e45d9a55c02ca1b7c784c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4a44332ff1224a19a5f1c18e2b827759", + "placeholder": "​", + "style": "IPY_MODEL_7e1ac6f28fb340d3bde1e7b4893bb0aa", + "value": " 346M/346M [00:02<00:00, 202MB/s]" + } + }, + "57c15c64c2374f06a1e0a36bab953ef7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_10c4f5677d1c4af8b3370b7fb1255065", + "placeholder": "​", + "style": "IPY_MODEL_603dd1541db345879295edc16ace2b0c", + "value": "Downloading data: 100%" + } + }, + "58e7f5c36d8b4836a868ce89838f1896": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9d1dff20634a403fb8829469d74301aa", + "placeholder": "​", + "style": "IPY_MODEL_b2e64f35be2d4fa3bc95c769b78e1dd1", + "value": " 10.3k/10.3k [00:00<00:00, 770kB/s]" + } + }, + "59792e1ee7074f998d5d4494c09061c6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3ff84efe0edc491c884898424be4ae71", + "placeholder": "​", + "style": "IPY_MODEL_5a79a196fd7b49128a9647347f85b364", + "value": "Downloading data: 100%" + } + }, + "5a79a196fd7b49128a9647347f85b364": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5b7be0df4db54866a3b6ef9204ba5a89": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_cde9d5cbadf14a5abe294dba0fa5bd2d", + "IPY_MODEL_835db77232e74cc18a6b5db2ace40bfd", + "IPY_MODEL_6099227eddde44009582b9f24fc96150" + ], + "layout": "IPY_MODEL_9eb912a195f3461297b7143cb1b04678" + } + }, + "5d2f5fb454bc4c16b520e4e96381758f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_dfd2baceac524fe29c0f4a8443b60a71", + "IPY_MODEL_90d8e83a6af54184a82e0b81ae7054b9", + "IPY_MODEL_1f96ca356b6f41b59275abe93df33f43", + "IPY_MODEL_eef81e9bea0c4f5d85e7efa8ebe0463a", + "IPY_MODEL_cab6d36980c0423fb75299c09c33facc" + ], + "layout": "IPY_MODEL_dd38a658218d42d7b051c66de4d4180a" + } + }, + "5da11a8c37ae41458ea4491ccdfb4db8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6dac696d99a44ea399a1bd5e18f08428", + "placeholder": "​", + "style": "IPY_MODEL_b3a8eebed60f4ecab7d508c976e2e56b", + "value": "Download file runs/Feb07_03-56-51_319afa680fd7/1675742273.001745/events.out.tfevents.1675742273.319afa680fd7.27769.1: 100%" + } + }, + "5dc4129160514a479ffb2f0564aee071": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e4074e524a19455fab810ec454fe8bf1", + "max": 10230, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d67dc70cfc9246f79a59261a69b28b41", + "value": 10230 + } + }, + "5e51957908eb48489357a7c3924ec5c7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5ee5e11191fc46dd92d4c2f1a7d6d9da": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", 
+ "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_128677e1b5b14e63b06b0f81c9cc4df0", + "max": 6208, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_22da54e68b1d48f9b3ba55ac1ca56873", + "value": 6208 + } + }, + "5fd1cd8bf125446a96b9438fbbe52710": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "603dd1541db345879295edc16ace2b0c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "60472b5a360f43e89e39d641dabba57b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "6099227eddde44009582b9f24fc96150": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_af7ee2bb7ccc4c00838a2c6b937e4e8b", + "placeholder": "​", + "style": "IPY_MODEL_84c281446c5b424090a5eecbd733b050", + "value": " 9.99k/9.99k [01:17<00:00, 119B/s]" + } + }, + "60e6952873524186aad05661a00bd240": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "61b957d3b51643f78a921979072fe3b6": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d7136a7b3d0040d580508fc665b9fb00", + "IPY_MODEL_5ee5e11191fc46dd92d4c2f1a7d6d9da", + "IPY_MODEL_3587d42fa09b4fcdb365956a9bb07c77" + ], + "layout": "IPY_MODEL_c1ed0b68884c4d4291cd67c0e685ef18" + } + }, + "62818f9421694139bbe1d9ad6e822b10": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "62a0f83cf75d4c59a0601c5ad3a817a7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": 
null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": "hidden", + "width": null + } + }, + "6350637718344d65a757d2919de8c1ab": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "638b918aaacc4c4782b9e16ca66549e8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "StyleView", + "description_width": "" + } + }, + "63b53da916fe479e8cd495eff8d16df8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "63c07a01593f467f9c0e7c5e283d58ae": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "65a0aed816c84164a6ee6a41d300fad0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "68ef0c8550ee4c00aa8b284d48572610": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7a35d0ddc2da4dd69068214b87bcdd7f", + "max": 10337, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9aac38a8c8694c67a34b2fced0e1a706", + "value": 10337 + } + }, + "6932d2462135413cbc293964eb1c8317": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3c0cacee5997480cbedc0e9d59a62544", + "max": 5777, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_fee3db0deefb410db4c572efd95575bf", + "value": 5777 + } + }, + "6940a405215c4e2caadbe209c677bde0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": 
null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "69dcd4770fbc428cb56498b6577e237e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "6b6459f123ef4f24a550cd9ec3c9f809": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "6dac696d99a44ea399a1bd5e18f08428": { + "model_module": "@jupyter-widgets/base", 
+ "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6e4983016e4f465b85ab7a472d0e986e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": 
null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6f19448725b84be4bacc3b699cd065a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3c33964c8d804600ab5a26d0717c508d", + "placeholder": "​", + "style": "IPY_MODEL_f5041033ddf94f459ed8d1747f6b2d6e", + "value": "Downloading (…)"pytorch_model.bin";: 100%" + } + }, + "6f1a325b02f54352a0b412d7f4420bbb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": 
null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "70af909af7de4161b4a72a8e15d116f3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "70b04a3579a5446f94acd422c70ac50a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": 
"1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_039dd9a4b99e433088a0acd8ba7b519b", + "max": 1468812, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_17235d013b7c4cee996d0bbc1cc6c70c", + "value": 1468812 + } + }, + "71f7296ec9be4d9abe1af581722b40fe": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_b98e53eefc1944f193169c4f7a72b799", + "IPY_MODEL_1d4a5a5b7d1645a8bf8133935e173082", + "IPY_MODEL_d29e3b9102f14f3385e47ae6e27d1ab1" + ], + "layout": "IPY_MODEL_1e3e374b08964a689cfaac9c826f207b" + } + }, + "7541b2304cc5466cb2369c0025d2d243": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, 
+ "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "75b76841f06249a0a77c7e38b85a14c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e6ce3e626b1744c7ba3da26d1fde5fa5", + "placeholder": "​", + "style": "IPY_MODEL_8e1237963bb5479f93318c5cdc6a8593", + "value": "Download file runs/Feb07_03-56-51_319afa680fd7/events.out.tfevents.1675742272.319afa680fd7.27769.0: 100%" + } + }, + "7654b707840e4afb9bab8218418fd096": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "76cf84387a7c43608ad018188eef4114": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_4d653faaedcd497d863bbf2c429ce925", + "placeholder": "​", + "style": "IPY_MODEL_d10f2e9c25f2417f9728aa8e43acf677", + "value": "Downloading readme: 100%" + } + }, + "77deda3ef342432f9b0f684a9b32e248": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b48fcbe51098482aad8798670111d60d", + "placeholder": "​", + "style": "IPY_MODEL_7654b707840e4afb9bab8218418fd096", + "value": "Clean file runs/Feb07_02-43-38_319afa680fd7/1675737843.2328734/events.out.tfevents.1675737843.319afa680fd7.7189.1: 100%" + } + }, + "77df794cb4e4491e80ee20bbd2801a89": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "77f7230186b14c628f5094f9fd8d82da": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_855a0f70b9ac489a86b53792e119329a", + "IPY_MODEL_9005e9db560d4e89880bdd18403ef9e1", + 
"IPY_MODEL_070a734e268045098977db14c6565777" + ], + "layout": "IPY_MODEL_91ec8a3f10804d629cdfd47c61411c91" + } + }, + "79351db5d2e1468e9b91d7bd2274612e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2d8a53b2a2bf42b9aa9cb2d9978ccee2", + "max": 10824, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_69dcd4770fbc428cb56498b6577e237e", + "value": 10824 + } + }, + "793ebaa3acc6482bb135ca0ca864be4d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7a35d0ddc2da4dd69068214b87bcdd7f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, 
+ "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7a3daf19ee744c7b8baeb028db05009a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7a65e650113e476a8cb66caa92973dd3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + 
"max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7b8f0cbb552447549aed602f937fcfb4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7bd32cf88c154303a76759d674795856": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + 
"box_style": "", + "children": [ + "IPY_MODEL_a907de6474cf45cb91b3f2efc40821b9", + "IPY_MODEL_5dc4129160514a479ffb2f0564aee071", + "IPY_MODEL_16620105b32f434eb77b0df56ed49e45" + ], + "layout": "IPY_MODEL_7b8f0cbb552447549aed602f937fcfb4" + } + }, + "7c038ffcc1dc4e3fbfed17d94327353a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7c1b6f271fff4d60be39d291c73bfb75": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_4089323832d04dc2a40e238b5fa256cc", + "placeholder": "​", + "style": "IPY_MODEL_2d867c65533e482a96db93bc5a09b8cb", + "value": " 5.00G/5.00G [03:32<00:00, 24.6MB/s]" + } + }, + "7d5831ee2a1c4e649f5508631d64e7cc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2eae62f1cc46449dba93f5eda0cb3f1c", + "placeholder": "​", + "style": "IPY_MODEL_e8026bcb0e2c4b14bc6c84537c8c4ae9", + "value": " 5.64k/5.64k [01:17<?, ?B/s]" + } + }, + "7e1ac6f28fb340d3bde1e7b4893bb0aa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7e243f4a30c645b080e688fb706b4548": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + 
"grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "828a652d92724ba4888d924846a79374": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4bb8b2d7000f464ba3ff18ce03fcfef4", + "max": 4203, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_793ebaa3acc6482bb135ca0ca864be4d", + "value": 4203 + } + }, + "82f96ac9299a4841a550ad3daa0099d0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3a71257db7bc408d8e4d1fbcaf1dff93", + "placeholder": "​", + "style": "IPY_MODEL_ccd34ccf2c864c609a0b4fcee7327b31", + "value": " 2.69M/2.69M [00:00<00:00, 13.0MB/s]" + } + }, + 
"835db77232e74cc18a6b5db2ace40bfd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d2469e1f1daf4d4cb0faf35ce90f6445", + "max": 10227, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e667b14a3c0e41c6a16c4be453f10378", + "value": 10227 + } + }, + "8378c214cd044bfb97c452d811df748f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b6a1b7db4afe44c792907f6377cde35c", + "placeholder": "​", + "style": "IPY_MODEL_dca7d0a0d2aa479083d81a54489d3717", + "value": "Downloading builder script: 100%" + } + }, + "83d6fbf463264c71a4ec8775e26c7c38": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "849ac914c3cc49d29d619dd4f532d74c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4cb3d75f80434b48beb6aa4b07c86dfe", + "IPY_MODEL_ff39519704b64e68b69ec06aea02791e", + "IPY_MODEL_e0f2599ed04c424f896e503630034e84" + ], + "layout": "IPY_MODEL_1674d568877048368c842c21ffaac811" + } + }, + "84c281446c5b424090a5eecbd733b050": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "851fb5ac25db4bb287a6dbe948278eec": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "852b01d8592b4d8aa2c4297d6cf75f78": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "855a0f70b9ac489a86b53792e119329a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", 
+ "description_tooltip": null, + "layout": "IPY_MODEL_0465571b25714ecda9dfe6ff1a495a87", + "placeholder": "​", + "style": "IPY_MODEL_dc078f0db3e54199bef0c11ee5e6297e", + "value": "Clean file pytorch_model.bin: 100%" + } + }, + "86103f87819b440b8464f4460f50375e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "864b6bb42f0b46a2a7bcd0d8cbac3837": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c1af5e6c4259480eac652f6c6269ff5f", + "max": 345636463, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_194dd0bcc350480c9ddd3e4ef17efc3a", + "value": 345636463 + } + }, + "86fca0e29e7a4dc8b2234134014958f8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8378c214cd044bfb97c452d811df748f", + "IPY_MODEL_828a652d92724ba4888d924846a79374", + 
"IPY_MODEL_cfd59ddfe85f4585865df8df47fd491f" + ], + "layout": "IPY_MODEL_94f39a2f3baa4bb2bffd1b99e8a31367" + } + }, + "87581c98cd174bb684ec259066d047ea": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "87aad727ec964c9d97346ac02ed0caae": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": 
null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8886c333aa104900a3bb4a1904756661": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8934f66530644f0882e292bfd5458b0f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8a0a77b9ebd74caabb8f8a764c289a5c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + 
"align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8b7713310a814991aec7929fa715ec7c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + 
"top": null, + "visibility": null, + "width": null + } + }, + "8b9f5bca0898404b91032befbd019fa3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8ba66e043f8a4975bd77ecd343401260": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1f8c65025b63466192897a32a92182e9", + "placeholder": "​", + "style": "IPY_MODEL_ffc13c11355b46bb9cafcb17f3e1535e", + "value": "Download file 
runs/Feb07_02-50-30_319afa680fd7/events.out.tfevents.1675738246.319afa680fd7.10047.0: 100%" + } + }, + "8bf8a843d65142bbad81de74aa8573f6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8da58936a6e64529af9a3e3f314e49cb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8e1237963bb5479f93318c5cdc6a8593": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8ecde04d15ab47f9b78d561615ca567d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9005e9db560d4e89880bdd18403ef9e1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": 
"ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c3f7788abe754cb3bfbee3fadda54916", + "max": 345949677, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_fe93399cc15f4f29b6a37f6a65cf8c9b", + "value": 345949677 + } + }, + "9013fd35e17f44bfb7a068833adaf167": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "90d8e83a6af54184a82e0b81ae7054b9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "PasswordModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "PasswordModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "PasswordView", + 
"continuous_update": true, + "description": "Token:", + "description_tooltip": null, + "disabled": false, + "layout": "IPY_MODEL_b2688e34899a449e8d1f6ddb5a66bb85", + "placeholder": "​", + "style": "IPY_MODEL_dd4edb4de5e14dfbbee418dba0bb3573", + "value": "" + } + }, + "9102cc38ee9942ac91dc66eda069ddcb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9123141f7c164d458a21e54fc579fa66": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + 
"91ec8a3f10804d629cdfd47c61411c91": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "91f6edc592394a0bad250e68d3c22017": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "920293e203f14b45b61233e1bb6f1214": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "923cf8641a7946f69ff41fb88b2b86f8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9d14ba8675fb4c689dd821ce7794abb6", + "IPY_MODEL_6932d2462135413cbc293964eb1c8317", + "IPY_MODEL_7d5831ee2a1c4e649f5508631d64e7cc" + ], + "layout": "IPY_MODEL_70af909af7de4161b4a72a8e15d116f3" + } + }, + "9281c5aec5b84411a05e4762125388d9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "92dfb889fd22439bb7b5fd31e4991c93": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "93788683ef8e4c71bc1c0b3b9cc7219c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a2671f512e404f64bfa3f376449f6947", + "placeholder": "​", + "style": "IPY_MODEL_a61a30ebaac846c1b7a03c6a93127aad", + 
"value": " 357/357 [01:17<?, ?B/s]" + } + }, + "9397ebc3ad2e4141a1405bb1bd0aa315": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6f19448725b84be4bacc3b699cd065a9", + "IPY_MODEL_864b6bb42f0b46a2a7bcd0d8cbac3837", + "IPY_MODEL_5634fd283a9e45d9a55c02ca1b7c784c" + ], + "layout": "IPY_MODEL_87aad727ec964c9d97346ac02ed0caae" + } + }, + "93c81a011c0a435aa90a3f4f1d549510": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bcbb4d8ce16b473eae2ad03f1bea2520", + "max": 160, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2c86eb6c67f44af590937d0f1db09333", + "value": 160 + } + }, + "9434b43de9954c83a4311432bdd68376": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "94f39a2f3baa4bb2bffd1b99e8a31367": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "95449e7030324f99b148bbaedc15c155": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_be1ae63f3e804e23abe7739e9f4577fb", + "IPY_MODEL_d5b95aa9cab446f88d61e9f4a25a8e2f", + "IPY_MODEL_2e31d27cc694434aa869896041c72bee" + ], + "layout": "IPY_MODEL_4a3ff00e64b548ce89355778907e48c9" + } + }, + "9609eaf0792345b2ab457cb7188ee14a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + 
"model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "972f831792cd4e89af109462dd5b9210": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "980d2f61332f414d9888b76e78774be4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + 
"_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_cbc0ba8e49a740fcae7b94fe7edb8107", + "IPY_MODEL_b3f07ef160a7425880ffe362008d4400", + "IPY_MODEL_49ed3330fa9645eda8b5aed0fd7cbafe" + ], + "layout": "IPY_MODEL_f96302d0c2d849c5b5a0206b65e461ab" + } + }, + "98d32ec7fbf54effadb886bc4ec6ce79": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9a5e108d8b5a41ae95a619bfc6c8f3a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9aac38a8c8694c67a34b2fced0e1a706": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9b1bfa11ee3746c38155c4505abfaa86": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9b216287b8694bcc9960a356adf15504": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9b4b67731a7a4bc59be132b53c24eae8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c53429e699e64b3d8895a355bbd947a6", + "placeholder": "​", 
+ "style": "IPY_MODEL_2bfd04824f2e4fd6844dd38e46dbbbdf", + "value": "adapter_model.bin: 100%" + } + }, + "9d14ba8675fb4c689dd821ce7794abb6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d04c1c4d04fc4928b4a2a0e860f996e0", + "placeholder": "​", + "style": "IPY_MODEL_98d32ec7fbf54effadb886bc4ec6ce79", + "value": "Download file runs/Feb07_02-50-30_319afa680fd7/1675738246.1183074/events.out.tfevents.1675738246.319afa680fd7.10047.1: 100%" + } + }, + "9d1b9ac29dcc41e08ada578916f20a3c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ef7c7fe37c8d459da6d20f4ccbea3fb8", + "IPY_MODEL_93c81a011c0a435aa90a3f4f1d549510", + "IPY_MODEL_da87efdf06d74b0aba268320ba7882f9" + ], + "layout": "IPY_MODEL_0a0e75829d6c4031bc917ac2044d9e47" + } + }, + "9d1dff20634a403fb8829469d74301aa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": 
null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9eb912a195f3461297b7143cb1b04678": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "9fb3579ca9714141a7857a513c379f03": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a0929e66406644dbb09bbdc9c58d488d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a100435005a34d428b9ae615f49bb1a1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a1981bfcdb6d401e9a521e18b511cf9d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a1d5bc95f1e24e3293414e08aa5c8bd5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + 
"flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a1df731c5c5f4f9cafa19323a750ebea": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_260fdda06c214ed499f69fac4077d476", + "placeholder": "​", + "style": "IPY_MODEL_aed3b4e6110442398c25d37456b78b5d", + "value": "Clean file runs/Feb07_02-50-30_319afa680fd7/1675738246.1183074/events.out.tfevents.1675738246.319afa680fd7.10047.1: 100%" + } + }, + "a2671f512e404f64bfa3f376449f6947": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + 
"display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a29d758fb7f147c7ad1108f140caf23a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_aa9b6ac2785c4a5abd1189edd60698eb", + "placeholder": "​", + "style": "IPY_MODEL_cfab815edc1f42898b656c0f4a3b366b", + "value": " 75750/75750 [00:59<00:00, 1687.83 examples/s]" + } + }, + "a2b51be9304342e39431b82957eb4b25": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a450c318d99a477c9f7341458ad4bc8d": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4533e8ce655649cba93553c8a2b17f37", + "placeholder": "​", + "style": "IPY_MODEL_63b53da916fe479e8cd495eff8d16df8", + "value": " 10.6k/10.6k [01:17<00:00, 127B/s]" + } + }, + "a61a30ebaac846c1b7a03c6a93127aad": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a849dcc9c7f742d49c874597d8c693c5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a907de6474cf45cb91b3f2efc40821b9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_3f6394cb0ea242f28c4ba6b3b2d37e9f", + "placeholder": "​", + "style": "IPY_MODEL_02a0b01d31a34a1c924786037fecba09", + "value": "Clean file runs/Feb07_02-50-30_319afa680fd7/events.out.tfevents.1675738246.319afa680fd7.10047.0: 100%" + } + }, + "aa68207e72b0467cb9a4354dc231db2f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "aa9b6ac2785c4a5abd1189edd60698eb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + 
"align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "acae77f181ed43a1b29412c575435a7f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e54b7fc2f9b94118ab97f2736862f77d", + "placeholder": "​", + "style": "IPY_MODEL_dfe97442852c4338843c65333b25623d", + "value": " 5.64k/5.64k [01:17<00:00, 61.5B/s]" + } + }, + "ad2068dd9c2040f6ae44bc873fa7b6e7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, 
+ "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ad6adbe84ac940ffbf89017a269a3e75": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "adb09cebab13484a8d75a338eaba7b0c": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9fb3579ca9714141a7857a513c379f03", + "max": 3579, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_edc0742a08a445a594139200c7f03c60", + "value": 3579 + } + }, + "aed3b4e6110442398c25d37456b78b5d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "af1a42626ba7452189fbb5987b159b9c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d4aa1670fdab463bb0a0e6fe104988bc", + "max": 357, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8934f66530644f0882e292bfd5458b0f", + "value": 357 + } + }, + "af5231ecf6e2489b80cdcd435b5e3451": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": 
"1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_af6a4a054a5d451b9fe256bf60a09c21", + "placeholder": "​", + "style": "IPY_MODEL_afb1f0681bce47e1ba718900d0430f34", + "value": " 25250/25250 [00:42<00:00, 617.60 examples/s]" + } + }, + "af6a4a054a5d451b9fe256bf60a09c21": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "af7ee2bb7ccc4c00838a2c6b937e4e8b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "afb1f0681bce47e1ba718900d0430f34": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b1b6922df40c4af69b00b4e85db770c4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": 
"", + "description_tooltip": null, + "layout": "IPY_MODEL_f6a9243d46cb4c0fbdf3f80f7074f6c5", + "placeholder": "​", + "style": "IPY_MODEL_a2b51be9304342e39431b82957eb4b25", + "value": "Clean file training_args.bin: 100%" + } + }, + "b21331417d084aba80f919b71933bc2c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b2688e34899a449e8d1f6ddb5a66bb85": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b2ad992db5a045668fa55c7393ec7870": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b2e64f35be2d4fa3bc95c769b78e1dd1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b377e94780fb4e1db3b9678717e04fc1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b3a8eebed60f4ecab7d508c976e2e56b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b3f07ef160a7425880ffe362008d4400": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1150e391e753424da7d65bda10463da4", + "max": 1, + "min": 0, + 
"orientation": "horizontal", + "style": "IPY_MODEL_e6e36d744e1244aeb7eb0c4ce392372d", + "value": 1 + } + }, + "b48f685dc91540f38690f39eace724d5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b48fcbe51098482aad8798670111d60d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b5fad0f3f2d543ecaed726e52d2d86bb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b6a1b7db4afe44c792907f6377cde35c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + 
"max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b8e180259fd94096884f7e48a53b0fce": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b98e53eefc1944f193169c4f7a72b799": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + 
"description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b377e94780fb4e1db3b9678717e04fc1", + "placeholder": "​", + "style": "IPY_MODEL_86103f87819b440b8464f4460f50375e", + "value": "Downloading metadata: 100%" + } + }, + "bacd429b42d843299cb75224db3afb1e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bb453686ce9f4342aaae9a9fb3500d2c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1a4ab138be9940f081514b914fdc4623", + 
"IPY_MODEL_cc59f6643acb4054ad6df56e90d3d2a8", + "IPY_MODEL_236638d673934823828ee57face78184" + ], + "layout": "IPY_MODEL_29de968ad50543418c6865fdf003a568" + } + }, + "bc71a433928e4870b56a3d81e35e6351": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bcbb4d8ce16b473eae2ad03f1bea2520": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bcef9cf2b00c46878f07c48875f7d194": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + 
"model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bd9e7cb0f25445739ebcdff0d3112052": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_52d00532eeee40aa91e8a5c2a10e50a7", + "IPY_MODEL_21c75049df804ac4ac7bc6349a639056", + "IPY_MODEL_041f73c9a038411aa6d59cf8a93f6d47" + ], + "layout": "IPY_MODEL_b8e180259fd94096884f7e48a53b0fce" + } + }, + "bda7e5662f2e4fa292752efd4947c5f6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4825c09098e1446a9ed3b653b77894f4", + "IPY_MODEL_f544720498e44c49add78550b46edb3a", + "IPY_MODEL_24b3737dc76c4d4f9ba2603c653a3ce2" + ], + "layout": "IPY_MODEL_2ab99fb38f8d4bef85d9833bb628fa00" + } + }, + "be1ae63f3e804e23abe7739e9f4577fb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d947ec84b16c4781959427b610328ab9", + "placeholder": "​", + "style": "IPY_MODEL_9123141f7c164d458a21e54fc579fa66", + "value": "Download file pytorch_model.bin: 100%" + } + }, + "be1ec4b9b8964810867b0e00bcc4868f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "be29ee88a7ec489b8320f7306d78931d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a1df731c5c5f4f9cafa19323a750ebea", + "IPY_MODEL_bf2e140f54d74df09663d3fcf1660d0c", + "IPY_MODEL_acae77f181ed43a1b29412c575435a7f" + ], + "layout": "IPY_MODEL_87581c98cd174bb684ec259066d047ea" + } + }, + "be71e6438e0e49759d2f72feec520cae": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "bf2e140f54d74df09663d3fcf1660d0c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8da58936a6e64529af9a3e3f314e49cb", + "max": 5777, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9a5e108d8b5a41ae95a619bfc6c8f3a9", + "value": 5777 + } + }, + "c1af5e6c4259480eac652f6c6269ff5f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + 
"align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c1ed0b68884c4d4291cd67c0e685ef18": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + 
"top": null, + "visibility": null, + "width": null + } + }, + "c2032e5054ac4604832957cb6e2e69ca": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e3047557ae7f40e2aecccf1afad36f3f", + "placeholder": "​", + "style": "IPY_MODEL_4fc212af0c9b45ebbe334e3dd7f11b59", + "value": " 489k/489k [00:00<00:00, 2.24MB/s]" + } + }, + "c2f4b407a47f4d958986035188c6ece8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_75b76841f06249a0a77c7e38b85a14c8", + "IPY_MODEL_79351db5d2e1468e9b91d7bd2274612e", + "IPY_MODEL_c6ce0e9bdd90400f9cf2debf9165758c" + ], + "layout": "IPY_MODEL_138198ec50a9494889319d6c94da92bd" + } + }, + "c3178221dc074657bc0e585c4cfe326d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + 
"grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c3f7788abe754cb3bfbee3fadda54916": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c53429e699e64b3d8895a355bbd947a6": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c5665d0bc652405c8754474871baab06": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8b7713310a814991aec7929fa715ec7c", + "max": 2688263, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_fcc8254622324f8ea965e12e4d4966cd", + "value": 2688263 + } + }, + "c5670295387a4c199571a2a21a6b69dc": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c5718d031b9942f4b8bf331a8543db29": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_35d862a4f00c4493920da3e2eb92b043", + "IPY_MODEL_16b464f168d844cba5eb0c91ab4fb91c", + "IPY_MODEL_af5231ecf6e2489b80cdcd435b5e3451" + ], + "layout": "IPY_MODEL_62a0f83cf75d4c59a0601c5ad3a817a7" + } + }, + "c580d3a6e99e48fab09b3ce799711802": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4afba780d0f244548a7f28db15b41dc9", + "IPY_MODEL_4e3d482feec9485590d277dfc1d0b3d3", + "IPY_MODEL_23436ea247dd43d8829ca143a49637c5" + ], + "layout": "IPY_MODEL_9609eaf0792345b2ab457cb7188ee14a" + } + }, + "c6ce0e9bdd90400f9cf2debf9165758c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cb3ab56aa43e4b94b978764caa6057a7", + "placeholder": "​", + "style": "IPY_MODEL_2e24b7250ee04fbb810e5d6ade107c51", + "value": " 10.6k/10.6k [01:17<?, ?B/s]" + } + }, + "c778798c234d45b5a4ae2f250e3706f9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c8267c689fb14afc9a8eb3ecb6f4fd4c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fc9d0c314ca14826993fe1f24b070b5d", + "placeholder": "​", + "style": "IPY_MODEL_bc71a433928e4870b56a3d81e35e6351", + "value": "Download file training_args.bin: 100%" + } + }, + "c910ae80ec1a4718915e9a861215f27c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_245c5418ca084fb6bc0b027576a1f789", + "IPY_MODEL_d0bc0e6038eb46dbbc5f5593d4c285ca", + "IPY_MODEL_a450c318d99a477c9f7341458ad4bc8d" + ], + "layout": "IPY_MODEL_02ac19466e24404a92e769ed60604881" + } + }, + "cab6d36980c0423fb75299c09c33facc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + 
"state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_15bd2dcdbf4b4e74b9db09bdb8822e61", + "placeholder": "​", + "style": "IPY_MODEL_ecf73dd75420460399bfd04d8cd81f90", + "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. " + } + }, + "cb3ab56aa43e4b94b978764caa6057a7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cb52fa97c659430a8bd71dcd76245a7f": { + "model_module": "@jupyter-widgets/base", + 
"model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": "hidden", + "width": null + } + }, + "cbc0ba8e49a740fcae7b94fe7edb8107": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fcc7ad16a0b14d96acd9be9e03ac6af9", + "placeholder": "​", + "style": "IPY_MODEL_1a08961f063346ccae206a863ab7df6b", + "value": "Upload 1 LFS files: 100%" + } + }, + "cc59f6643acb4054ad6df56e90d3d2a8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + 
"state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8a0a77b9ebd74caabb8f8a764c289a5c", + "max": 502, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d8ac6df8420a423eb048b4db04c8925c", + "value": 502 + } + }, + "ccd34ccf2c864c609a0b4fcee7327b31": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cd5b2433cc404ac7b1bb35c6a55f6874": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_851fb5ac25db4bb287a6dbe948278eec", + "max": 4996278331, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_471d44c8e49e42b89302ef53ab0eb316", + "value": 4996278331 + } + }, + "cde9d5cbadf14a5abe294dba0fa5bd2d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6f1a325b02f54352a0b412d7f4420bbb", + "placeholder": "​", + "style": "IPY_MODEL_e8c2cfdaf0eb413189d93924eae757c7", + "value": "Clean file runs/Feb07_02-43-38_319afa680fd7/events.out.tfevents.1675737843.319afa680fd7.7189.0: 100%" + } + }, + "ce16ac2b3ff244e6bd7dd58daa9f4f7f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ce18faf7b68140a3a8247330b356e05b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ce4b6a4b6fec4ceb907fa436ff940bd2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cf024daa51f74777b98028df10dbc9c5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + 
}, + "cf815c0979644cd6ad2c681fa96c0648": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6940a405215c4e2caadbe209c677bde0", + "placeholder": "​", + "style": "IPY_MODEL_b21331417d084aba80f919b71933bc2c", + "value": " 3.50k/3.50k [01:17<00:00, 33.0B/s]" + } + }, + "cfab815edc1f42898b656c0f4a3b366b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cfd59ddfe85f4585865df8df47fd491f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bcef9cf2b00c46878f07c48875f7d194", + "placeholder": "​", + "style": "IPY_MODEL_47659b15eb284f06bf9735ca2e425646", + "value": " 4.20k/4.20k [00:00<00:00, 321kB/s]" + } + }, + "d04c1c4d04fc4928b4a2a0e860f996e0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d0bc0e6038eb46dbbc5f5593d4c285ca": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7c038ffcc1dc4e3fbfed17d94327353a", + "max": 10824, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_91f6edc592394a0bad250e68d3c22017", + "value": 10824 + } + }, + "d0c95a20c2664c149886b72fa665d3cf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d10f2e9c25f2417f9728aa8e43acf677": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d1ff50e1b871429a85df8cf10e73ffb1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, 
+ "d2469e1f1daf4d4cb0faf35ce90f6445": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d25f3ebb577749d89e2e6d2a72f6ca5f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d29e3b9102f14f3385e47ae6e27d1ab1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fc612aaed5644b84959a1958b0240dda", + "placeholder": "​", + "style": "IPY_MODEL_36c8300bcbb84627a03b94f0eea86ce9", + "value": " 5.56k/5.56k [00:00<00:00, 399kB/s]" + } + }, + "d4aa1670fdab463bb0a0e6fe104988bc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d5b95aa9cab446f88d61e9f4a25a8e2f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_83d6fbf463264c71a4ec8775e26c7c38", + "max": 345949677, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f02443fbda394fefa162f4ff5b2d2ce7", + "value": 345949677 + } + }, + "d5c5396ea2f54ff0aeb9be58b59c253b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "d67dc70cfc9246f79a59261a69b28b41": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d68194cf7d264df7820f27eb4d070de2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d9329cb3c1704691b6a36c293bcbf41b", + "placeholder": "​", + "style": "IPY_MODEL_dfa468dd89174d97bcaabbda0ed8e117", + "value": " 3.50k/3.50k [01:17<?, 
?B/s]" + } + }, + "d7136a7b3d0040d580508fc665b9fb00": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9102cc38ee9942ac91dc66eda069ddcb", + "placeholder": "​", + "style": "IPY_MODEL_416c65eedcea4a6ea69dae317de79bca", + "value": "Downloading builder script: 100%" + } + }, + "d7c394bc6a3249e9b3fcbae2ebd25eb7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d7d43177c750412cb1522eb08c01d2d9": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9013fd35e17f44bfb7a068833adaf167", + "placeholder": "​", + "style": "IPY_MODEL_a849dcc9c7f742d49c874597d8c693c5", + "value": "Downloading data: 100%" + } + }, + "d8ac6df8420a423eb048b4db04c8925c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d9329cb3c1704691b6a36c293bcbf41b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": 
null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d947ec84b16c4781959427b610328ab9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d9c15769da2b49e4b67d43d95be30cd5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + 
"_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4d1f6114d4034f758bf8cc35485e0056", + "IPY_MODEL_223a13f77e2e49a09660890eb4213b30", + "IPY_MODEL_1639075b181f4945ac32af116b22d1d7" + ], + "layout": "IPY_MODEL_ad6adbe84ac940ffbf89017a269a3e75" + } + }, + "da13543779034424aaf6f5c4a96f0457": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2d13b401dcf94089a4a78a62f05bdce3", + "IPY_MODEL_dbc00727fa1c4e00aacf627c04527649", + "IPY_MODEL_e0d98c36e5d242b2905adf8167ac348a" + ], + "layout": "IPY_MODEL_483b46ed1e8148498d54e4d6f4c0ca8d" + } + }, + "da87efdf06d74b0aba268320ba7882f9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a100435005a34d428b9ae615f49bb1a1", + "placeholder": "​", + "style": "IPY_MODEL_8886c333aa104900a3bb4a1904756661", + "value": " 160/160 [00:00<00:00, 8.36kB/s]" + } + }, + "db6b68a237cf4e93ae6383448b773e47": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dbaa70ad4f1d4496a670601fe447116d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dbc00727fa1c4e00aacf627c04527649": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3efccb526dec44bf9801ac13dcc1068d", + "max": 5773, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8ecde04d15ab47f9b78d561615ca567d", + "value": 5773 + } + }, + "dc078f0db3e54199bef0c11ee5e6297e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dca7d0a0d2aa479083d81a54489d3717": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dcd1c1f4fc014c4aa9ebdaf3c533a061": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_26520bc6555d41d9951ea0219dc4b5d7", + "max": 75750, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_60472b5a360f43e89e39d641dabba57b", + "value": 75750 + } + }, + "dcefc9ba538e4da2b75f9372a4c5b5bf": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dd38a658218d42d7b051c66de4d4180a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": "center", + "align_self": null, + "border": null, + "bottom": null, + "display": "flex", + "flex": null, + "flex_flow": "column", + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "50%" + } + }, + "dd4edb4de5e14dfbbee418dba0bb3573": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + 
"ddf88cbfaaef4a55babf480816db7d28": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "de92f68231294aefb249f400475bc9a4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": 
null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dfa468dd89174d97bcaabbda0ed8e117": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dfd2baceac524fe29c0f4a8443b60a71": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f34be236ef9c42448ecf2957160990f7", + "placeholder": "​", + "style": "IPY_MODEL_38deee504dab482983a8b8f340472282", + "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" + } + }, + "dfe97442852c4338843c65333b25623d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dffe636233c84dcd9d75f34baf40fa1d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e0d98c36e5d242b2905adf8167ac348a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ddf88cbfaaef4a55babf480816db7d28", + "placeholder": "​", + "style": "IPY_MODEL_38f30da546444f8199673003d0a92dda", + "value": " 5.64k/5.64k [01:17<?, ?B/s]" + } + }, + "e0f2599ed04c424f896e503630034e84": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7e243f4a30c645b080e688fb706b4548", + "placeholder": "​", + "style": "IPY_MODEL_db6b68a237cf4e93ae6383448b773e47", + "value": " 1/1 [00:14<00:00, 14.25s/it]" + } + }, + "e302923a9df24e5fa8fff79c203ead9f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e3047557ae7f40e2aecccf1afad36f3f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, 
+ "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e33243b001274d02a25f5940ba41ecf6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_40a5a50aeca24f0d8990da97971004d1", + "max": 2688263, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_be71e6438e0e49759d2f72feec520cae", + "value": 2688263 + } + }, + "e4074e524a19455fab810ec454fe8bf1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": 
null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e4694cffcb574863a255e9022c8ddf5d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e4a4122ff32a41a1917459709224fc6a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8ba66e043f8a4975bd77ecd343401260", + "IPY_MODEL_2e0bb2dcd85640d7b85d80469ea9f9f3", + "IPY_MODEL_1c208beced884b9291c5bcb7b4f71680" + ], + "layout": "IPY_MODEL_3f188d6d34774154afc297b13a3eb9e8" + } + }, + "e54b7fc2f9b94118ab97f2736862f77d": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e667b14a3c0e41c6a16c4be453f10378": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e6ce3e626b1744c7ba3da26d1fde5fa5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e6e36d744e1244aeb7eb0c4ce392372d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e7144551e74b46529b00a61f580a183d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, 
+ "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e8026bcb0e2c4b14bc6c84537c8c4ae9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e83fd078f467406da0baf26e18b39e89": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9b4b67731a7a4bc59be132b53c24eae8", + "IPY_MODEL_e33243b001274d02a25f5940ba41ecf6", + "IPY_MODEL_06e4c619e366427a8ff4c358196ecd12" + ], + "layout": "IPY_MODEL_bacd429b42d843299cb75224db3afb1e" + } + }, + "e88c3ad56ef24e4d8281898b08ff6f4b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1fdc59cbb8724c618ce6e586e2c9723f", + "placeholder": "​", + "style": "IPY_MODEL_dbaa70ad4f1d4496a670601fe447116d", + "value": " 5.64k/5.64k [01:17<?, ?B/s]" + } + }, + "e8c2cfdaf0eb413189d93924eae757c7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ea2217bba8574c7890a411f27da0c147": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_b1b6922df40c4af69b00b4e85db770c4", + "IPY_MODEL_2b8bc04ac3104592bf950e349c034c2d", + "IPY_MODEL_cf815c0979644cd6ad2c681fa96c0648" + ], + "layout": "IPY_MODEL_7541b2304cc5466cb2369c0025d2d243" + } + }, + "eaf2c76a172d4da6846c6face18a3b58": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "eb864284052c46b28b93fc79bfed740f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "eca3b1f4ad76430483a221470e592c13": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e302923a9df24e5fa8fff79c203ead9f", + "placeholder": "​", + "style": "IPY_MODEL_f7b9abca32ec42edad5ec6e52882f732", + "value": " 5.64k/5.64k [01:17<00:00, 61.4B/s]" + } + }, + "ecf73dd75420460399bfd04d8cd81f90": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "edb0d1ba5e114af9b6705969f58ece7b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "edc0742a08a445a594139200c7f03c60": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ee103846621b4c0e8e1266599b99f6ee": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_e7144551e74b46529b00a61f580a183d", + "placeholder": "​", + "style": "IPY_MODEL_9b1bfa11ee3746c38155c4505abfaa86", + "value": "Generating train split: 100%" + } + }, + "eef81e9bea0c4f5d85e7efa8ebe0463a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + "description": "Login", + "disabled": false, + "icon": "", + "layout": "IPY_MODEL_c778798c234d45b5a4ae2f250e3706f9", + "style": "IPY_MODEL_d5c5396ea2f54ff0aeb9be58b59c253b", + "tooltip": "" + } + }, + "ef7c7fe37c8d459da6d20f4ccbea3fb8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4ee1fde44dcf49eda97e1a05173e5bb1", + "placeholder": "​", + "style": "IPY_MODEL_a0929e66406644dbb09bbdc9c58d488d", + "value": "Downloading (…)rocessor_config.json: 100%" + } + }, + "f02443fbda394fefa162f4ff5b2d2ce7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + 
} + }, + "f031aaf7fbc648a7b8a2e5faf37df14d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f0b0cad40fbd461ca7bdcdbb5f442f57": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_76cf84387a7c43608ad018188eef4114", + "IPY_MODEL_68ef0c8550ee4c00aa8b284d48572610", + "IPY_MODEL_58e7f5c36d8b4836a868ce89838f1896" + ], + "layout": "IPY_MODEL_9b216287b8694bcc9960a356adf15504" + } + }, + "f22598cf4ade4427a1b437fd45aabcc4": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f34be236ef9c42448ecf2957160990f7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f3991aaad13a4c50a7809483b7907b7b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4a118fa87e424664a2d2ed7c7f58f3fd", + "max": 5773, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_852b01d8592b4d8aa2c4297d6cf75f78", + "value": 5773 + } + }, + "f5041033ddf94f459ed8d1747f6b2d6e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f544720498e44c49add78550b46edb3a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9281c5aec5b84411a05e4762125388d9", + "max": 5777, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f5b5d6ace35a4a82bfcf2549b93c8558", + "value": 5777 + } + }, + "f5b5d6ace35a4a82bfcf2549b93c8558": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f6a9243d46cb4c0fbdf3f80f7074f6c5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + 
"overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f7b9abca32ec42edad5ec6e52882f732": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f96302d0c2d849c5b5a0206b65e461ab": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f981fb4aae504045aa10889dceeb6cac": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + 
"state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fc612aaed5644b84959a1958b0240dda": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fc9d0c314ca14826993fe1f24b070b5d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, 
+ "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fcc7ad16a0b14d96acd9be9e03ac6af9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "fcc8254622324f8ea965e12e4d4966cd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "fd9df81594724b88b54b4e3e1b19370a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c8267c689fb14afc9a8eb3ecb6f4fd4c", + "IPY_MODEL_adb09cebab13484a8d75a338eaba7b0c", + "IPY_MODEL_d68194cf7d264df7820f27eb4d070de2" + ], + "layout": "IPY_MODEL_8bf8a843d65142bbad81de74aa8573f6" + } + }, + "fdb3673fdbf24468a9965f13196b78ed": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0b82dbc29d514f4e9e012fd755948e52", + "IPY_MODEL_af1a42626ba7452189fbb5987b159b9c", + "IPY_MODEL_93788683ef8e4c71bc1c0b3b9cc7219c" + ], + "layout": "IPY_MODEL_6e4983016e4f465b85ab7a472d0e986e" + } + }, + "fdf282b234fe4a1a8ab452ac04511b7d": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_59792e1ee7074f998d5d4494c09061c6", + "IPY_MODEL_cd5b2433cc404ac7b1bb35c6a55f6874", + "IPY_MODEL_7c1b6f271fff4d60be39d291c73bfb75" + ], + "layout": "IPY_MODEL_074f38bd3a9d49719188e8860fb1b5d3" + } + }, + "fe93399cc15f4f29b6a37f6a65cf8c9b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "fea27a80cd2f4b4dba84ecdfefd2722c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "fee3db0deefb410db4c572efd95575bf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": 
null, + "description_width": "" + } + }, + "fee4fba960ac41ed97984467da41f319": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ee103846621b4c0e8e1266599b99f6ee", + "IPY_MODEL_dcd1c1f4fc014c4aa9ebdaf3c533a061", + "IPY_MODEL_a29d758fb7f147c7ad1108f140caf23a" + ], + "layout": "IPY_MODEL_cb52fa97c659430a8bd71dcd76245a7f" + } + }, + "ff39519704b64e68b69ec06aea02791e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dcefc9ba538e4da2b75f9372a4c5b5bf", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_77df794cb4e4491e80ee20bbd2801a89", + "value": 1 + } + }, + "ffc13c11355b46bb9cafcb17f3e1535e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git 
a/peft/examples/image_classification/image_classification_timm_peft_lora.ipynb b/peft/examples/image_classification/image_classification_timm_peft_lora.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..1e2f2225055ef8d0542d66b77d089806ae9015cd --- /dev/null +++ b/peft/examples/image_classification/image_classification_timm_peft_lora.ipynb @@ -0,0 +1,744 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4ef57047", + "metadata": {}, + "source": [ + "# Using PEFT with timm" + ] + }, + { + "cell_type": "markdown", + "id": "80561acc", + "metadata": {}, + "source": [ + "`peft` allows us to train any model with LoRA as long as the layer type is supported. Since `Conv2D` is one of the supported layer types, it makes sense to test it on image models.\n", + "\n", + "In this short notebook, we will demonstrate this with an image classification task using [`timm`](https://huggingface.co/docs/timm/index)." + ] + }, + { + "cell_type": "markdown", + "id": "aa26c285", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "markdown", + "id": "552b9040", + "metadata": {}, + "source": [ + "Make sure that you have the latest version of `peft` installed. 
To ensure that, run this in your Python environment:\n", + " \n", + " python -m pip install --upgrade peft\n", + " \n", + "Also, ensure that `timm` is installed:\n", + "\n", + " python -m pip install --upgrade timm" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e600b7d5", + "metadata": {}, + "outputs": [], + "source": [ + "import timm\n", + "import torch\n", + "from PIL import Image\n", + "from timm.data import resolve_data_config\n", + "from timm.data.transforms_factory import create_transform" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "73a2ae54", + "metadata": {}, + "outputs": [], + "source": [ + "import peft\n", + "from datasets import load_dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "82c628fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.manual_seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "701ab69c", + "metadata": {}, + "source": [ + "## Loading the pre-trained base model" + ] + }, + { + "cell_type": "markdown", + "id": "20bff51a", + "metadata": {}, + "source": [ + "We use a small pretrained `timm` model, `PoolFormer`. Find more info on its [model card](https://huggingface.co/timm/poolformer_m36.sail_in1k)." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "495cb3d6", + "metadata": {}, + "outputs": [], + "source": [ + "model_id_timm = \"timm/poolformer_m36.sail_in1k\"" + ] + }, + { + "cell_type": "markdown", + "id": "2dc06f9b", + "metadata": {}, + "source": [ + "We tell `timm` that we deal with 3 classes, to ensure that the classification layer has the correct size." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "090564bc", + "metadata": {}, + "outputs": [], + "source": [ + "model = timm.create_model(model_id_timm, pretrained=True, num_classes=3)" + ] + }, + { + "cell_type": "markdown", + "id": "beca5794", + "metadata": {}, + "source": [ + "These are the transformation steps necessary to process the image." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9df2e113", + "metadata": {}, + "outputs": [], + "source": [ + "transform = create_transform(**resolve_data_config(model.pretrained_cfg, model=model))" + ] + }, + { + "cell_type": "markdown", + "id": "3f809dfa", + "metadata": {}, + "source": [ + "## Data" + ] + }, + { + "cell_type": "markdown", + "id": "a398fe22", + "metadata": {}, + "source": [ + "For this exercise, we use the \"beans\" dataset. More details on the dataset can be found on [its datasets page](https://huggingface.co/datasets/beans). For our purposes, what's important is that we have image inputs and the target we're trying to predict is one of three classes for each image."
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0fddc704", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset beans (/home/vinh/.cache/huggingface/datasets/beans/default/0.0.0/90c755fb6db1c0ccdad02e897a37969dbf070bed3755d4391e269ff70642d791)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "05592574da474b81ab736d6babb5e19d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3 [00:00" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_train[0][\"image\"]" + ] + }, + { + "cell_type": "markdown", + "id": "880ea6c4", + "metadata": {}, + "source": [ + "We define a small processing function which is responsible for loading and transforming the images, as well as extracting the labels." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "142df842", + "metadata": {}, + "outputs": [], + "source": [ + "def process(batch):\n", + " x = torch.cat([transform(img).unsqueeze(0) for img in batch[\"image\"]])\n", + " y = torch.tensor(batch[\"labels\"])\n", + " return {\"x\": x, \"y\": y}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9744257b", + "metadata": {}, + "outputs": [], + "source": [ + "ds_train.set_transform(process)\n", + "ds_valid.set_transform(process)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "282374be", + "metadata": {}, + "outputs": [], + "source": [ + "train_loader = torch.utils.data.DataLoader(ds_train, batch_size=32)\n", + "valid_loader = torch.utils.data.DataLoader(ds_valid, batch_size=32)" + ] + }, + { + "cell_type": "markdown", + "id": "5dcd3329", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "markdown", + "id": "969bc374", + "metadata": {}, + "source": [ + "This is just a function that performs the train loop, nothing 
fancy happening." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b9fc9588", + "metadata": {}, + "outputs": [], + "source": [ + "def train(model, optimizer, criterion, train_dataloader, valid_dataloader, epochs):\n", + " for epoch in range(epochs):\n", + " model.train()\n", + " train_loss = 0\n", + " for batch in train_dataloader:\n", + " xb, yb = batch[\"x\"], batch[\"y\"]\n", + " xb, yb = xb.to(device), yb.to(device)\n", + " outputs = model(xb)\n", + " lsm = torch.nn.functional.log_softmax(outputs, dim=-1)\n", + " loss = criterion(lsm, yb)\n", + " train_loss += loss.detach().float()\n", + " loss.backward()\n", + " optimizer.step()\n", + " optimizer.zero_grad()\n", + "\n", + " model.eval()\n", + " valid_loss = 0\n", + " correct = 0\n", + " n_total = 0\n", + " for batch in valid_dataloader:\n", + " xb, yb = batch[\"x\"], batch[\"y\"]\n", + " xb, yb = xb.to(device), yb.to(device)\n", + " with torch.no_grad():\n", + " outputs = model(xb)\n", + " lsm = torch.nn.functional.log_softmax(outputs, dim=-1)\n", + " loss = criterion(lsm, yb)\n", + " valid_loss += loss.detach().float()\n", + " correct += (outputs.argmax(-1) == yb).sum().item()\n", + " n_total += len(yb)\n", + "\n", + " train_loss_total = (train_loss / len(train_dataloader)).item()\n", + " valid_loss_total = (valid_loss / len(valid_dataloader)).item()\n", + " valid_acc_total = correct / n_total\n", + " print(f\"{epoch=:<2} {train_loss_total=:.4f} {valid_loss_total=:.4f} {valid_acc_total=:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "3fd58357", + "metadata": {}, + "source": [ + "### Selecting which layers to fine-tune with LoRA" + ] + }, + { + "cell_type": "markdown", + "id": "7987321c", + "metadata": {}, + "source": [ + "Let's take a look at the layers of our model. 
We only print the first 30, since there are quite a few:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "55a7be4d", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('', timm.models.metaformer.MetaFormer),\n", + " ('stem', timm.models.metaformer.Stem),\n", + " ('stem.conv', torch.nn.modules.conv.Conv2d),\n", + " ('stem.norm', torch.nn.modules.linear.Identity),\n", + " ('stages', torch.nn.modules.container.Sequential),\n", + " ('stages.0', timm.models.metaformer.MetaFormerStage),\n", + " ('stages.0.downsample', torch.nn.modules.linear.Identity),\n", + " ('stages.0.blocks', torch.nn.modules.container.Sequential),\n", + " ('stages.0.blocks.0', timm.models.metaformer.MetaFormerBlock),\n", + " ('stages.0.blocks.0.norm1', timm.layers.norm.GroupNorm1),\n", + " ('stages.0.blocks.0.token_mixer', timm.models.metaformer.Pooling),\n", + " ('stages.0.blocks.0.token_mixer.pool', torch.nn.modules.pooling.AvgPool2d),\n", + " ('stages.0.blocks.0.drop_path1', torch.nn.modules.linear.Identity),\n", + " ('stages.0.blocks.0.layer_scale1', timm.models.metaformer.Scale),\n", + " ('stages.0.blocks.0.res_scale1', torch.nn.modules.linear.Identity),\n", + " ('stages.0.blocks.0.norm2', timm.layers.norm.GroupNorm1),\n", + " ('stages.0.blocks.0.mlp', timm.layers.mlp.Mlp),\n", + " ('stages.0.blocks.0.mlp.fc1', torch.nn.modules.conv.Conv2d),\n", + " ('stages.0.blocks.0.mlp.act', torch.nn.modules.activation.GELU),\n", + " ('stages.0.blocks.0.mlp.drop1', torch.nn.modules.dropout.Dropout),\n", + " ('stages.0.blocks.0.mlp.norm', torch.nn.modules.linear.Identity),\n", + " ('stages.0.blocks.0.mlp.fc2', torch.nn.modules.conv.Conv2d),\n", + " ('stages.0.blocks.0.mlp.drop2', torch.nn.modules.dropout.Dropout),\n", + " ('stages.0.blocks.0.drop_path2', torch.nn.modules.linear.Identity),\n", + " ('stages.0.blocks.0.layer_scale2', timm.models.metaformer.Scale),\n", + " ('stages.0.blocks.0.res_scale2', torch.nn.modules.linear.Identity),\n", + " 
('stages.0.blocks.1', timm.models.metaformer.MetaFormerBlock),\n", + " ('stages.0.blocks.1.norm1', timm.layers.norm.GroupNorm1),\n", + " ('stages.0.blocks.1.token_mixer', timm.models.metaformer.Pooling),\n", + " ('stages.0.blocks.1.token_mixer.pool', torch.nn.modules.pooling.AvgPool2d)]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[(n, type(m)) for n, m in model.named_modules()][:30]" + ] + }, + { + "cell_type": "markdown", + "id": "09af9349", + "metadata": {}, + "source": [ + "Most of these layers are not good targets for LoRA, but we see a couple that should interest us. Their names are `'stages.0.blocks.0.mlp.fc1'`, etc. With a bit of regex, we can match them easily.\n", + "\n", + "Also, we should inspect the name of the classification layer, since we want to train that one too!" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "8b98d9ef", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('head.global_pool.flatten', torch.nn.modules.linear.Identity),\n", + " ('head.norm', timm.layers.norm.LayerNorm2d),\n", + " ('head.flatten', torch.nn.modules.flatten.Flatten),\n", + " ('head.drop', torch.nn.modules.linear.Identity),\n", + " ('head.fc', torch.nn.modules.linear.Linear)]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[(n, type(m)) for n, m in model.named_modules()][-5:]" + ] + }, + { + "cell_type": "markdown", + "id": "00e75b78", + "metadata": {}, + "source": [ + " config = peft.LoraConfig(\n", + " r=8,\n", + " target_modules=r\".*\\.mlp\\.fc\\d|head\\.fc\",\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "23814d70", + "metadata": {}, + "source": [ + "Okay, this gives us all the information we need to fine-tune this model. With a bit of regex, we match the convolutional layers that should be targeted for LoRA. 
We also want to train the classification layer `'head.fc'` (without LoRA), so we add it to the `modules_to_save`." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "81029587", + "metadata": {}, + "outputs": [], + "source": [ + "config = peft.LoraConfig(r=8, target_modules=r\".*\\.mlp\\.fc\\d\", modules_to_save=[\"head.fc\"])" + ] + }, + { + "cell_type": "markdown", + "id": "e05876bc", + "metadata": {}, + "source": [ + "Finally, let's create the `peft` model, the optimizer and criterion, and we can get started. As shown below, less than 2% of the model's total parameters are updated thanks to `peft`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8cc5c5db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 1,064,454 || all params: 56,467,974 || trainable%: 1.88505789139876\n" + ] + } + ], + "source": [ + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "peft_model = peft.get_peft_model(model, config).to(device)\n", + "optimizer = torch.optim.Adam(peft_model.parameters(), lr=2e-4)\n", + "criterion = torch.nn.CrossEntropyLoss()\n", + "peft_model.print_trainable_parameters()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "9e557e42", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=0 train_loss_total=1.2999 valid_loss_total=1.0624 valid_acc_total=0.4436\n", + "epoch=1 train_loss_total=1.0200 valid_loss_total=0.8906 valid_acc_total=0.7594\n", + "epoch=2 train_loss_total=0.8874 valid_loss_total=0.6894 valid_acc_total=0.8045\n", + "epoch=3 train_loss_total=0.7440 valid_loss_total=0.4797 valid_acc_total=0.8045\n", + "epoch=4 train_loss_total=0.6025 valid_loss_total=0.3419 valid_acc_total=0.8120\n", + "epoch=5 train_loss_total=0.4820 valid_loss_total=0.2589 valid_acc_total=0.8421\n", + "epoch=6 train_loss_total=0.3567 
valid_loss_total=0.2101 valid_acc_total=0.8722\n", + "epoch=7 train_loss_total=0.2835 valid_loss_total=0.1385 valid_acc_total=0.9098\n", + "epoch=8 train_loss_total=0.1815 valid_loss_total=0.1108 valid_acc_total=0.9474\n", + "epoch=9 train_loss_total=0.1341 valid_loss_total=0.0785 valid_acc_total=0.9699\n", + "CPU times: user 4min 3s, sys: 36.3 s, total: 4min 40s\n", + "Wall time: 3min 32s\n" + ] + } + ], + "source": [ + "%time train(peft_model, optimizer, criterion, train_loader, valid_dataloader=valid_loader, epochs=10)" + ] + }, + { + "cell_type": "markdown", + "id": "94162859", + "metadata": {}, + "source": [ + "We get an accuracy of ~0.97, despite only training a tiny amount of parameters. That's a really nice result." + ] + }, + { + "cell_type": "markdown", + "id": "9c16bad8", + "metadata": {}, + "source": [ + "## Sharing the model through Hugging Face Hub" + ] + }, + { + "cell_type": "markdown", + "id": "2e1e16c7", + "metadata": {}, + "source": [ + "### Pushing the model to Hugging Face Hub" + ] + }, + { + "cell_type": "markdown", + "id": "ec596b3b", + "metadata": {}, + "source": [ + "If we want to share the fine-tuned weights with the world, we can upload them to Hugging Face Hub like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "b583579d", + "metadata": {}, + "outputs": [], + "source": [ + "user = \"BenjaminB\" # put your user name here\n", + "model_name = \"peft-lora-with-timm-model\"\n", + "model_id = f\"{user}/{model_name}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "f1db67e4", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "aed1f9c3fa334be1b5f208efe5ba27e6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Upload 1 LFS files: 0%| | 0/1 [00:00\n", + " \n", + " \n", + " [255/255 06:13, Epoch 1/1]\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + "
EpochTraining LossValidation Loss
1No log0.017228

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Saving model checkpoint to temp/checkpoint-100\n", + "Trainer.model is not a `PreTrainedModel`, only saving its state dict.\n", + "/usr/local/lib/python3.8/dist-packages/bitsandbytes/autograd/_functions.py:298: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "Saving model checkpoint to temp/checkpoint-200\n", + "Trainer.model is not a `PreTrainedModel`, only saving its state dict.\n", + "/usr/local/lib/python3.8/dist-packages/bitsandbytes/autograd/_functions.py:298: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "***** Running Evaluation *****\n", + " Num examples = 227\n", + " Batch size = 8\n", + "\n", + "\n", + "Training completed. 
Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=255, training_loss=0.2569344015682445, metrics={'train_runtime': 377.3565, 'train_samples_per_second': 5.398, 'train_steps_per_second': 0.676, 'total_flos': 1181084919791616.0, 'train_loss': 0.2569344015682445, 'epoch': 1.0})" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "id": "r98VtofiGXtO", + "metadata": { + "id": "r98VtofiGXtO" + }, + "source": [ + "## Qualitatively test our model" + ] + }, + { + "cell_type": "markdown", + "id": "NIm7z3UNzGPP", + "metadata": { + "id": "NIm7z3UNzGPP" + }, + "source": [ + "Let's have a quick qualitative evaluation of the model, by taking a sample from the dataset that corresponds to a positive label. Run your generation similarly as you were running your model from `transformers`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c95d6173", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "c95d6173", + "outputId": "ed03a1dc-597a-4053-99d6-eca2cc6da253" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Generate config GenerationConfig {\n", + " \"_from_model_config\": true,\n", + " \"decoder_start_token_id\": 0,\n", + " \"eos_token_id\": 1,\n", + " \"pad_token_id\": 0,\n", + " \"transformers_version\": \"4.27.0.dev0\",\n", + " \"use_cache\": false\n", + "}\n", + "\n", + "/usr/local/lib/python3.8/dist-packages/bitsandbytes/autograd/_functions.py:298: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "/usr/local/lib/python3.8/dist-packages/transformers/generation/utils.py:1374: UserWarning: You are calling 
.generate() with the `input_ids` being on a device type different than your model's device. `input_ids` is on cpu, whereas the model is on cuda. You may experience unexpected behaviors or slower generation. Please make sure that you have put `input_ids` to the correct device by calling for example input_ids = input_ids.to('cuda') before running `.generate()`.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "input sentence: In January-September 2009 , the Group 's net interest income increased to EUR 112.4 mn from EUR 74.3 mn in January-September 2008 .\n", + " output prediction: ['positive']\n" + ] + } + ], + "source": [ + "model.eval()\n", + "input_text = \"In January-September 2009 , the Group 's net interest income increased to EUR 112.4 mn from EUR 74.3 mn in January-September 2008 .\"\n", + "inputs = tokenizer(input_text, return_tensors=\"pt\").to(model.device)\n", + "\n", + "outputs = model.generate(input_ids=inputs[\"input_ids\"], max_new_tokens=10)\n", + "\n", + "print(\"input sentence: \", input_text)\n", + "print(\" output prediction: \", tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))" + ] + }, + { + "cell_type": "markdown", + "id": "9QqBlwzoGZ3f", + "metadata": { + "id": "9QqBlwzoGZ3f" + }, + "source": [ + "## Share your adapters on 🤗 Hub" + ] + }, + { + "cell_type": "markdown", + "id": "NT-C8SjcKqUx", + "metadata": { + "id": "NT-C8SjcKqUx" + }, + "source": [ + "Once you have trained your adapter, you can easily share it on the Hub using the method `push_to_hub` . 
Note that only the adapter weights and config will be pushed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcbfa1f9", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 359, + "referenced_widgets": [ + "5bb29f3102954b06bec825f6b3a7aaa7", + "90516032070a40979181d1d27db10c4f", + "4b7dc0fb222b4e2a9bb2ef2501e9fd30", + "06069855ef82484f9985e4619095dbe8", + "1ece69c53e37413caad8db70d9160ad5", + "7ce90db727ea47cc9344176858a2225b", + "64f2b70b63cd4e7eb9e22ac2de5589c9", + "57dea1b3e04142bb91868a474774d86a", + "8ac43334e0ad4a78acda3b876fead058", + "04da98e400514cf2847d172916cd0081", + "4dbe49547fe94010ad5a30818cfc35bc", + "99091ca45c1b4809ba0a1b01af85f528", + "06ec124c3dac4fe6b152fb812d20d86d", + "7561c47a97444666816422a0418e1675", + "45ab5d7049e34dfd8a067643ae887a31", + "434e308cac5847f0bee431c7dbb4c04a", + "dd993a4a7cdf40448098544c95468a10" + ] + }, + "id": "bcbfa1f9", + "outputId": "91ef770e-9fc4-4eb2-b02b-24e635101f97" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Token is valid.\n", + "Your token has been saved in your configured git credential helpers (store).\n", + "Your token has been saved to /root/.cache/huggingface/token\n", + "Login successful\n" + ] + } + ], + "source": [ + "from huggingface_hub import notebook_login\n", + "\n", + "notebook_login()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "rFKJ4vHNGkJw", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 152, + "referenced_widgets": [ + "00f7d043cb184d69b828c204dac2c0ab", + "1e409cd3d3a04b558d989d63f0b3b5f7", + "d48cbb38078b456fab1634bec5b0a1ba", + "9030744dbca9427ba8a036a76b5c8bf7", + "53dd4444c0e14e16a912532898b32d92", + "5287ac638c22412ab91c55f3316c9b63", + "a43ddb478f044f17adbcfae841ec2114", + "45141234ce584f208a9d301faadf75d2", + "f62ffbdc24734b999f36058d9edca81f", + "972ddebd536d4685bfc3c7c13e5bd8be", + "64156e2c54b44fb9aec661d9b57da962", + 
"050de732f51f4af8bb41ab3cad0090a4", + "7960ed3beb2a429ba2aca1c6ed032f64", + "726a2eedc7434210bc5aa4d0a772b313", + "07bf5d621cf944258aaf13954669df56", + "93a0896ca66b4111bc4cabe6e1278440", + "cfc78731f7d543ce8529cc254d92ddf5", + "eacf8e9ed6e847faae2b8ecab283ddc4", + "bef1971d92e6479696e3f9a27a757b8a", + "821f2f296acb40ae9bb40fc3faf4103d", + "11efd993475a4f2aabe7df605bab04dd", + "406e4d8561f64d2a94d93a606d02d7d3" + ] + }, + "id": "rFKJ4vHNGkJw", + "outputId": "07425379-64ad-47e8-ba8f-8d9dc26252b6" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Uploading the following files to ybelkada/flan-t5-large-lora: adapter_model.bin,adapter_config.json\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "00f7d043cb184d69b828c204dac2c0ab", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Upload 1 LFS files: 0%| | 0/1 [00:00Pro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. 
" + } + }, + "7cfda0921e5a4f378e90e057447f3b3d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_10aa4e3aca57438ea7af97b60208ac81", + "placeholder": "​", + "style": "IPY_MODEL_f14657da8e1e4298a96e3885eb4eee93", + "value": "Downloading builder script: 100%" + } + }, + "7fb7e3e2c75d4d03a98e581d4ead0f00": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_43d7a9b421be430286b5eb8441d6d465", + "max": 2201, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ebc26228160046c48279d71770c928d8", + "value": 2201 + } + }, + "8035fd17e29a48d7b415c531607216a6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "808a41f78a7c4ae0b6aafee59c6234ae": { + "model_module": "@jupyter-widgets/base", + "model_module_version": 
"1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "808cea6c94264f0c9990d6dbcf538419": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": 
null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "81d2f0953e104fc1ad57295819b6b689": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bcda86e43607436583f1fbfee08a9786", + "placeholder": "​", + "style": "IPY_MODEL_55cf3bcee7c745948b39eea5f65fc62b", + "value": " 3.13G/3.13G [00:18<00:00, 182MB/s]" + } + }, + "821f2f296acb40ae9bb40fc3faf4103d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8252a05cb70b46ec8b0480062ea1cb71": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_8669a890db6c456cbc3ada28976be30b", + "placeholder": "​", + "style": "IPY_MODEL_9e5afa2048c74754816b34a34171fcb0", + "value": "Downloading (…)lve/main/config.json: 100%" + } + }, + "82849bb4d5da452e87a18ca749ce5d7b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2b4d68606bdf4758b812f5a8057af595", + "placeholder": "​", + "style": "IPY_MODEL_f9620e01cd6749f88b722a42ff68c502", + "value": "Downloading (…)"spiece.model";: 100%" + } + }, + "83f196eb5d9549cda4d48008fa7b1386": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "85ae7ed1ec244a89aeb9f4552c2c9462": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + 
"grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "85e5c1a9b7ac4e6e884213a636d0aaa1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8669a890db6c456cbc3ada28976be30b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + 
"model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "87ce7c58b18146f3ac73970d7f8079ac": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + 
"justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8a2bd1b4d9ba47ef9e77048e3d2d1e83": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8ac43334e0ad4a78acda3b876fead058": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8b41d2e9f7424dc898446e7f428dc757": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8d35a041dbfb4747aea427e76890551a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_74efd6bfe71e4dc599a7fc76574ff154", + "IPY_MODEL_1ed1bfefa6534085869130ea533ff4b1", + "IPY_MODEL_fd08c4fbe5d84dd893d87a5e2f2d082d" + ], + "layout": 
"IPY_MODEL_87ce7c58b18146f3ac73970d7f8079ac" + } + }, + "8d79b7d0c3cb4f8d99fb20941c35856f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8ef94158b0584f0eb55582bf8b6594c6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8f339c9070f046dab46ebc35c1cc2dba": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d618ec6be7d14a239b3bc74172616bf2", + "max": 13677, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3c8349539946412a93a51d9087306ea4", + "value": 13677 + } + }, + "8fceec1018574003884e082b2a5c23bf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9030744dbca9427ba8a036a76b5c8bf7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_972ddebd536d4685bfc3c7c13e5bd8be", + "placeholder": "​", + "style": "IPY_MODEL_64156e2c54b44fb9aec661d9b57da962", + "value": " 1/1 [00:02<00:00, 2.11s/it]" + } + }, + "90516032070a40979181d1d27db10c4f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_57dea1b3e04142bb91868a474774d86a", + "placeholder": "​", + "style": "IPY_MODEL_8ac43334e0ad4a78acda3b876fead058", + "value": "


Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" + } + }, + "924e6a8308fc47af929aca1987a12f09": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "93a0896ca66b4111bc4cabe6e1278440": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "93fe5a8fafbc44b496309d1a8da77ac5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "94437f56e5a44fa3bb08c9d798b2eaeb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0f091d25adf34ade835b094eb5b952a3", + "IPY_MODEL_9d599c2a4d9f4f2db1e4b3183c18eb94", + "IPY_MODEL_aa91ad725da147bc8cab70f931d82672" + ], + "layout": "IPY_MODEL_fb6877c376e0430296b2746513f60931" + } + }, + "945ac449c2e84fd6b5a7805b017343f2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "96e2d208830f48cd821be7e59643c93e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2fe0a2fa22a0498da983ec38150216e6", + "placeholder": "​", + "style": "IPY_MODEL_f3784e85cef34bdba64b611a1f5883e4", + "value": " 792k/792k [00:00<00:00, 7.01MB/s]" + } + }, + "96e4e44a789a46ce8239b260bf6e3dc8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "972ddebd536d4685bfc3c7c13e5bd8be": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "978bbbf33d304588af971d22bb2a3690": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "99091ca45c1b4809ba0a1b01af85f528": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9d0500a0f5f74be39e5edfbbcd7a64fc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9d599c2a4d9f4f2db1e4b3183c18eb94": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": 
"success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bf7a49e0e4a64df6b1b1c66e5e73c3a6", + "max": 8862, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d27ff6c2869242b98564b0e03d68b413", + "value": 8862 + } + }, + "9d68afcb8e26420cb91ea1eb872c80c4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a72073cfb8b4422a98ca581c4e5d18b8", + "placeholder": "​", + "style": "IPY_MODEL_3a5712c976b04af0975804b34344dfcf", + "value": "100%" + } + }, + "9e5afa2048c74754816b34a34171fcb0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a0186b2194df4a0a9cd1ac49054d68da": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "a10078c15aae4ec6a849f1b58c6b1cc2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_223848818aff4af1ab5d5e14271408e3", + "placeholder": "​", + "style": "IPY_MODEL_4b4b31109a9746e88ffa9b47bab00e53", + "value": " 1/1 [00:00<00:00, 12.10ba/s]" + } + }, + "a38c0fedf90a4f3cbb4680b5f85bbf2f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a3bb3f44c1754082a4f5169431c5b760": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a43ddb478f044f17adbcfae841ec2114": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a4857f97132a41acbe4535b03cd8d94a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + 
"bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a4d6de73a37148bf9303a273d13cd091": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"a72073cfb8b4422a98ca581c4e5d18b8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a9effc13b52044a5bc0d6a2a1088396f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "aa2c51ad05c14a02a13e5c047779fc05": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2ce6779fa5904471945fa5738510af64", + "IPY_MODEL_bdff3b35dcdf49e5ba2c5c2498773cb7", + "IPY_MODEL_6994741f3113493b9d5bba278b8732f5" + ], + "layout": "IPY_MODEL_808a41f78a7c4ae0b6aafee59c6234ae" + } + }, + "aa91ad725da147bc8cab70f931d82672": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8a2bd1b4d9ba47ef9e77048e3d2d1e83", + "placeholder": "​", + "style": "IPY_MODEL_dbd908538859410f9c20536fe5acb328", + "value": " 8.86k/8.86k [00:00<00:00, 381kB/s]" + } + }, + "aaa1477cfabb4767b755e902d3b99e61": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a38c0fedf90a4f3cbb4680b5f85bbf2f", + "max": 6036, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_c6a28dcd88c1487ab17aef6946ada876", + "value": 6036 + } + }, + "b560fc36ee8f424f9590e04a042046fe": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f2a36b126c1b41848e61b0c581ff8c4b", + "placeholder": "​", + "style": "IPY_MODEL_6a40d6535f9e4b5b9c9283a1cd67687a", + "value": "Downloading (…)okenizer_config.json: 100%" + } + }, + "b718fba0f1514025a0ca22e7f780a2fc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0271e1cc4e2d43c69d4959e46eddec9a", + "IPY_MODEL_601fb3752e134641b28da908d4e7b65a", + "IPY_MODEL_e002207d6982491cbef196f25fc891f8" + ], + "layout": "IPY_MODEL_a4857f97132a41acbe4535b03cd8d94a" + } + }, + "b77afc7c1f184de0970feb2df8ac5285": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5bfebc75ec424c6cb41b33d210d28d2b", + "max": 662, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ef2fa44d0105457c9aed3812633dd329", + "value": 662 + } + }, + "b8087054f46c44cab9bd62fa23fbf9de": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5192bc282c4847cb9df8365fc22a6cc2", + "placeholder": "​", + "style": "IPY_MODEL_608e9f7a14054573b9bd07f0f74b6345", + "value": " 2.20k/2.20k [00:00<00:00, 84.9kB/s]" + } + }, + "b8944b7027d449b4a7fc752978f463b1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_390b88f67b84451999b0845483905144", + "placeholder": "​", + "style": "IPY_MODEL_be9c243b74d944eb82ca1fe4ada6721d", + "value": " 1303/2264 [00:00<00:00, 4889.47 examples/s]" + } + }, + "b8bb0aed01d04e8dad560df1b051e1e4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_93fe5a8fafbc44b496309d1a8da77ac5", + "placeholder": "​", + "style": "IPY_MODEL_a9effc13b52044a5bc0d6a2a1088396f", + "value": " 3/3 [00:00<00:00, 17.02ba/s]" + } + }, + "baf53867f52046c182a2b1755f02e136": { + 
"model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": "hidden", + "width": null + } + }, + "bc24304c057d4b5898e832818de55caa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": 
null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bca79be79b6d4a68b148255bba86ea96": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_74e88bd01bf14e0e9f772f993c92eb77", + "placeholder": "​", + "style": "IPY_MODEL_401bd48c5b2d48eb86a1499912ee2b44", + "value": " 682k/682k [00:00<00:00, 6.99MB/s]" + } + }, + "bcda86e43607436583f1fbfee08a9786": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": 
null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bdfb4a04e48246a4b0890f52d6dd424b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bdff3b35dcdf49e5ba2c5c2498773cb7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_df8fbdbe9bc341e3a39a7bda99b70be2", + "max": 147, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_179a912bbd1e454eba503782b675efa8", + "value": 147 + } + }, + "be11f6865f6c41b5a57b2b7f4a85e14c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_85e5c1a9b7ac4e6e884213a636d0aaa1", + "placeholder": "​", + "style": "IPY_MODEL_216e5237b31944cbab006d9761ade0a1", + "value": "Downloading data: 100%" + } + }, + "be9c243b74d944eb82ca1fe4ada6721d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bef1971d92e6479696e3f9a27a757b8a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + 
"bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bf7a49e0e4a64df6b1b1c66e5e73c3a6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"c0deb08457be4a3ebb3947e33f7ce1df": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cba58e0b316b439ab035b917a40c630c", + "placeholder": "​", + "style": "IPY_MODEL_de6718209a7a42b0809e97fcd97e09ed", + "value": "100%" + } + }, + "c149c1c53e9d44008a86944ef8c261c5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c1757a5b684f4496a4b0e3db544bf44b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c31cffaa6934407399856235a2f3af54": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a4d6de73a37148bf9303a273d13cd091", + "max": 681890, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f79eeece093f4b0e9de6dbc346a3fa19", + "value": 681890 + } + }, + "c6a28dcd88c1487ab17aef6946ada876": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c708031a279e4e55ac7833e6697f93bd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c7771479ae4e4efab744fad6da586fd3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": 
null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cba58e0b316b439ab035b917a40c630c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cce112d791dd4b748908756e785ab555": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_be11f6865f6c41b5a57b2b7f4a85e14c", + "IPY_MODEL_c31cffaa6934407399856235a2f3af54", + "IPY_MODEL_bca79be79b6d4a68b148255bba86ea96" + ], + "layout": "IPY_MODEL_5093700dd3a14cc1a283d18a4a0e17a7" + } + }, + "ce0213e9d6aa45c5a9ac9954fbe15f62": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7cfda0921e5a4f378e90e057447f3b3d", + "IPY_MODEL_aaa1477cfabb4767b755e902d3b99e61", + "IPY_MODEL_7b6bddd4ca51495dbc2fceba7c50706f" + ], + "layout": "IPY_MODEL_6317d49813234f5b9103b249cf648c2c" + } + }, + "cfc78731f7d543ce8529cc254d92ddf5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": 
null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cfc7aa04c11d408c9c12cdbd9cff4bb5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d27ff6c2869242b98564b0e03d68b413": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d48cbb38078b456fab1634bec5b0a1ba": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_45141234ce584f208a9d301faadf75d2", + "max": 1, + "min": 0, + "orientation": 
"horizontal", + "style": "IPY_MODEL_f62ffbdc24734b999f36058d9edca81f", + "value": 1 + } + }, + "d5b57d3c74d14e5d80d1ef634c103a40": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d618ec6be7d14a239b3bc74172616bf2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": 
null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d72e8b3419f240f2bdce253cce9d24e3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d7d9e2e2090d4226ad89e5ba9cec33df": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5cc620a232bf4d418c3fc882f4c1cd0c", + "IPY_MODEL_f742450a607c4ed0bff98ac9b7685d40", + "IPY_MODEL_e87b05e685b040f7a99450bfbab72433" + ], + "layout": "IPY_MODEL_e1c8e6f843604161bbb6cbd269488469" + } + }, + "da99eed13d524b8fb95dbc563eb2d044": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dbd908538859410f9c20536fe5acb328": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dbdb787728184aa1a6906f96c5e6f929": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e3be963920c84c7fbe7e0bc61b8e778d", + 
"IPY_MODEL_1275c5a5c88b435a897f88a19c54a0a5", + "IPY_MODEL_81d2f0953e104fc1ad57295819b6b689" + ], + "layout": "IPY_MODEL_61ab054f49884b1fadf529a39ccc37dc" + } + }, + "dd53a486f7b5403a81e2be89cbbda719": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dd993a4a7cdf40448098544c95468a10": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "de6718209a7a42b0809e97fcd97e09ed": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "df8fbdbe9bc341e3a39a7bda99b70be2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e002207d6982491cbef196f25fc891f8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + 
"_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1d11cb45c5cb472aa86722e4dbb8c085", + "placeholder": "​", + "style": "IPY_MODEL_34142a8e97594931b316970911679e55", + "value": " 3/3 [00:00<00:00, 2.89ba/s]" + } + }, + "e1769695dffd4ebeb79a63ff4812fa9e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f66e179caa8b4393bed19a0488821c47", + "placeholder": "​", + "style": "IPY_MODEL_cfc7aa04c11d408c9c12cdbd9cff4bb5", + "value": " 2.54k/2.54k [00:00<00:00, 95.8kB/s]" + } + }, + "e1c8e6f843604161bbb6cbd269488469": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e24115bb662c428e89c2c4421915e632": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e293930c8e2c4eadbda53005e21ec450": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + 
"flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e3be963920c84c7fbe7e0bc61b8e778d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_924e6a8308fc47af929aca1987a12f09", + "placeholder": "​", + "style": "IPY_MODEL_c1757a5b684f4496a4b0e3db544bf44b", + "value": "Downloading (…)"pytorch_model.bin";: 100%" + } + }, + "e87b05e685b040f7a99450bfbab72433": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_06cea508d7504b228f6cebc66742d200", + "placeholder": "​", + "style": 
"IPY_MODEL_9d0500a0f5f74be39e5edfbbcd7a64fc", + "value": " 2.42M/2.42M [00:00<00:00, 4.03MB/s]" + } + }, + "eacf8e9ed6e847faae2b8ecab283ddc4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ebc26228160046c48279d71770c928d8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ec73524ed7f14ea0b67f07d72eada173": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": 
null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ef2fa44d0105457c9aed3812633dd329": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f00b73eb32374c33882c1bfc49822e44": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f061a6deaa73484aa04f219bba6a4329": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f14657da8e1e4298a96e3885eb4eee93": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f2a36b126c1b41848e61b0c581ff8c4b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f2dc5e8a31c348358aca916274899e8b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2bfb7c240e154769a0d58a3ceaa20212", + "IPY_MODEL_8f339c9070f046dab46ebc35c1cc2dba", + "IPY_MODEL_34db70b6e6ec475699fd23a2d6c3a973" + ], + "layout": "IPY_MODEL_e24115bb662c428e89c2c4421915e632" + } + }, + "f3784e85cef34bdba64b611a1f5883e4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f48454eadbfb4953b719bdf44555c90e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3c5affff513341b29e6a2c1c90bfe334", + "IPY_MODEL_0bbeca449a814d95bec438a9141b2b6b", + "IPY_MODEL_a10078c15aae4ec6a849f1b58c6b1cc2" + ], + 
"layout": "IPY_MODEL_06e8fd84d6224e5096088d66aad71961" + } + }, + "f4ff06e2c48d4e58abe64cb7f41dd886": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4f57cfa7cb3b4199babf82dc9d93b074", + "placeholder": "​", + "style": "IPY_MODEL_053de11f995247f6b851909a6a8dfc16", + "value": "Downloading (…)cial_tokens_map.json: 100%" + } + }, + "f62ffbdc24734b999f36058d9edca81f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f66e179caa8b4393bed19a0488821c47": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f742450a607c4ed0bff98ac9b7685d40": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e293930c8e2c4eadbda53005e21ec450", + "max": 2424064, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_945ac449c2e84fd6b5a7805b017343f2", + "value": 2424064 + } + }, + "f79eeece093f4b0e9de6dbc346a3fa19": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f9620e01cd6749f88b722a42ff68c502": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", 
+ "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fb6877c376e0430296b2746513f60931": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fd08c4fbe5d84dd893d87a5e2f2d082d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6f30253108fb4dce9c3de029457ef6f1", + "placeholder": "​", + "style": 
"IPY_MODEL_1fa2a7e3ff3c4c99ab95e96a28624846", + "value": " 1/1 [00:00<00:00, 22.88ba/s]" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/int8_training/Finetune_opt_bnb_peft.ipynb b/peft/examples/int8_training/Finetune_opt_bnb_peft.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..99ab2a30e763d8fe6dce3ce9637bb629a73f10b9 --- /dev/null +++ b/peft/examples/int8_training/Finetune_opt_bnb_peft.ipynb @@ -0,0 +1,9276 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "WE5GJ6s7y0Xo" + }, + "source": [ + "## Fine-tune large models using 🤗 `peft` adapters, `transformers` & `bitsandbytes`\n", + "\n", + "In this tutorial we will cover how we can fine-tune large language models using the very recent `peft` library and `bitsandbytes` for loading large models in 8-bit.\n", + "The fine-tuning method will rely on a recent method called \"Low Rank Adapters\" (LoRA), instead of fine-tuning the entire model you just have to fine-tune these adapters and load them properly inside the model. \n", + "After fine-tuning the model you can also share your adapters on the 🤗 Hub and load them very easily. Let's get started!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TfBzP8gWzkpv" + }, + "source": [ + "### Install requirements\n", + "\n", + "First, run the cells below to install the requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "otj46qRbtpnd", + "outputId": "2aa109f6-3f4e-4887-a16e-336f51e7cc9a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.3/76.3 MB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m462.8/462.8 KB\u001b[0m \u001b[31m25.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 KB\u001b[0m \u001b[31m25.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m190.3/190.3 KB\u001b[0m \u001b[31m23.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m213.0/213.0 KB\u001b[0m \u001b[31m26.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.0/132.0 KB\u001b[0m \u001b[31m18.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m140.6/140.6 KB\u001b[0m \u001b[31m20.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Installing build dependencies ... 
\u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.6/7.6 MB\u001b[0m \u001b[31m72.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for transformers (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Building wheel for peft (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "!pip install -q datasets==3.6.0 accelerate\n", + "!pip install -q git+https://github.com/bitsandbytes-foundation/bitsandbytes.git\n", + "!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FOtwYRI3zzXI" + }, + "source": [ + "### Model loading\n", + "\n", + "Here let's load the `opt-6.7b` model, its weights in half-precision (float16) are about 13GB on the Hub! If we load them in 8-bit we would require around 7GB of memory instead." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 408, + "referenced_widgets": [ + "d4de260ffd8a440eb87eb900fc1bb1d3", + "8602b545a9f8474dbb3cc178ac0b8e60", + "b46919912ee54f6f9f2ce9080be1c61a", + "50374e3ab81c4626a182e61fc03b94ce", + "2144bc2897dc40b29f060e30ace12275", + "949ca70002ca4472bbc21fea4d7ac745", + "49943c9dadca43a584b3f354ba45280c", + "6123e53fb26b41f0af9a3a3348ae1afd", + "285ef943d540400ab827c462945a259c", + "95727290446244ccb9626f4594949675", + "61b54aa6c9e94ee1bc45c15a9e3f7917", + "fc2d5ffe254d425b939252ec46ec27cc", + "f65af2e868244edeb0cc9402534874a8", + "e466054f08004bbcabb24e400cb3c7fc", + "6ea40800dfd849e3b106bae71fc53ae3", + "722a01f42b7d4c38836a4546ecb38108", + "1bd5179cdb474b65aa06eca3520ad37b", + "04d367124a3b419ab1fa1dfd4f9004c3", + "4ea667f48b9f4e1f9da7c5a0d3025b85", + "9b96c63630654773acd38b8b88371f28", + "cb69ae47666a4603a07a8778e2ae7d6e", + "8f5e9f2d11d54fd2a08dcbff9f6da05c", + "c6f712eadc4d49019b2bd355968cc155", + "5fd979c05fd34311af877ce1a988ead8", + "179165ff4c0e4586aaf3a40b8502f428", + "cad1d8326e474a4f9ee13db9005121dd", + "b049ac0b3d2e44ba9b1fc1cb2dd3de62", + "921c268ecd724b6a8869dae8f81d558a", + "4cf85a85c0764f15b5134c81b360e910", + "5936d9eb59e147eea6482006decfe0ee", + "fad61c68edc84c8197afeafded84280d", + "451309d8b3ca4cd9b6f538640477039e", + "c7621e14aa16421d9758321e433b92e4", + "5aa74b9b30614172b07f88873cf89471", + "cf39fb025e3d4635a5695135a56d9f64", + "deaf9732f33446a3be015d2ec16aba76", + "bdfd856dd8ee4ae09205ad9d1b9cc806", + "7a49f7a55b054b6d829f290a1a426a7a", + "5b695486a0ac442b8b6a8ef2ebf4e57a", + "03be7cb91cba4afab795aab7aa242ee1", + "8f7890f54d514f80b1eb17905cf0f964", + "ae660d51267544a89f0ad199cc12b6fa", + "d5bb8d7359274c8e9cc79df563175137", + "98d5a80ec50c46c18f9ee991e2982115", + "e73e5388182040a8937ccf1748171a87", + "f797f7b8b13f4b3cb871522d34498631", + "41f47c864e094cacb1c550d37ddfe80e", + "104d982b444947fe8e4fbb2c2f082616", + 
"2cf8581583c641fb95d1e16aff7d4cd1", + "11808e9097424dd0b22a5af6c77813f3", + "dc08d237860e4788a8ceeff4518c2612", + "7bbf2c40a4ae46fe9e5885db08975263", + "f00cbc4a89f5492787ec489da65ff70b", + "8a7cd2194113493a841b00d034b5f1ba", + "e9673363e85448d494ac9ac5d7ba0efb", + "a994beafbf3f4c20880a7bbe3898db36", + "7769c261781f4f5483c7e9d58c1a5573", + "4a713a8fe16f4e81aa841c69711fa136", + "08d036904ecf47ed88e129ba6e2b285c", + "9f1182fdddab43b59ee98bc701965a17", + "c2302f535b114ad780bfa440445c2e28", + "8fe032f285ba4858b8efd68119c217b0", + "4ca695fca3d140c6ab4e1e1d34df807b", + "b77ceaf55dc04810963cdd01126478f6", + "0ccdefcb25e14d229b2634ffae4a6d3d", + "11d6b952503c4824b36b66e228f87599", + "1e9391f6c89c4d08859ef3413edb19be", + "039bbda2402f469eb21ba7ec7ea589a6", + "5da6eef8fb0048219159f38a68727b64", + "c006e62ee6d04831b2b89273ef04a8ac", + "af0634c2539a43989902daef47776901", + "62fea00ef0364af287e6097b964d00c4", + "9e5ef73f8b244845a6b9002fa5c35d15", + "5cb09c84e1e144e0a92580fb5e1ce2fe", + "0f561c1660744251a8f710b69d434c87", + "9e5f870ef80242f5af09fad70f84ea62", + "6f173ca73dd545deb22b8cd0470d925f", + "4e6d5943bc374b388b93ed115e44b6a5", + "cbe2a6ea41834e95a27d6f02c3c0eeec", + "7beb5f4efefc4593abac253df74d1405", + "370cefbaeebf41f582b7507ad493055f", + "e3df9dda16e244aa9b61d54f9e21ef40", + "23d2ad64a17041b7a006dad1e041e0a1", + "c1fd6a1234274a44b838a09f3f5380c6", + "81bb51d088374394becd9a45ec3b17d4", + "9e762779e5434bb7afcc295b61c2f4e4", + "f539b7a4665449de9eac209a20629969", + "32d528db79ad4f6f836ab2e0df5ac426", + "1ca7684b79c5438fa06b047bd2b3283f", + "a07688185bff4c4b8cbed3af3b4cf802", + "0272f1d9f93f4dd788363a8409cdfd69", + "27b41d23d2c64127ba3ae8464958f855", + "2b54032c0d8e4a2897aed1ac1c79af14", + "ed8fa1048e814f2fa3666899fc42e55a", + "97daf559100c44ac983562fea93c5fac", + "72e511b775604d899ff5b3fa2ebe9fc4", + "da946f86590447d2ab98b9da468fa66b", + "54fe79d5c7254117a2209927a7248dd4", + "1d122e4eaad54e06961288484f31e18b", + "d46b5725c35142a89617e46c0e8d3679", + 
"c5493c23fd5542738ffd1ff5f09a6a67", + "a1a80d3460984c2496ada5a634875934", + "598c5584ffba4f26815c4e87bb1595c4", + "fe93f25323604447be0bb1d24a0c2c59", + "00c2e2d3ee8b45818ba84da12c6b11e2", + "6ccea64c2e614a9fbdcc2f716cecaea0", + "63fc9a9eebca4f2db2ed8a385fc5e204", + "9a4860dfeac944db85e6e532599bc1cb", + "3b946e1bbab24629b98307275fbe7cbb", + "d9d36f8ff5f747bf90fbc8a7d35a6664" + ] + }, + "id": "cg3fiQOvmI3Q", + "outputId": "135a7675-6a4d-4786-b5dc-34cb867f40c7" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bee2f575b3e64c30b2f3afa137802406", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/2 [00:00\n", + " \n", + " \n", + " [153/200 26:04 < 08:06, 0.10 it/s, Epoch 0.97/2]\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining Loss
12.364400
22.200400
32.302300
42.184700
51.878700
62.307200
72.193800
82.446200
92.458900
102.020000
111.941200
121.931000
132.055900
141.975100
152.015100
162.095600
171.768300
182.155700
192.402300
202.124600
212.314900
221.908500
232.078800
241.941900
251.879800
261.927500
271.371400
281.977600
292.055000
301.915800
311.958100
322.195900
332.001000
342.025000
351.576900
361.879800
371.821600
381.727800
391.995700
401.698600
412.129300
422.025800
431.696500
441.984700
452.051100
462.054400
471.765600
482.063100
491.746900
501.873000
512.391300
522.494100
532.072300
541.808000
551.911900
562.168100
572.166100
581.921500
591.856000
601.652800
611.605000
622.032500
631.822100
641.623600
651.923200
662.053200
672.114300
681.807700
691.857800
701.854600
712.023000
721.864900
731.769300
741.837700
751.742200
761.895900
771.922800
782.325300
792.231200
802.309500
811.945700
822.072100
831.917400
842.004600
851.951700
861.450600
871.785600
881.668000
891.903100
901.709800
912.312900
922.092100
932.319600
941.603100
951.740000
961.670500
971.611600
981.728900
992.285200
1001.957800
1011.676700
1021.656300
1031.612400
1041.848900
1051.870000
1061.954000
1072.192200
1081.637600
1091.208700
1102.254200
1111.832100
1122.119600
1132.126400
1141.915700
1151.587500
1161.564800
1171.742700
1181.712600
1191.727900
1202.361500
1212.070300
1221.878500
1231.846600
1242.061700
1252.149700
1261.940600
1272.098300
1281.734100
1292.111700
1301.887600
1311.716300
1322.070000
1331.782200
1341.955200
1351.762900
1361.954700
1371.687100
1381.979100
1391.634600
1401.801200
1411.954100
1421.833900
1432.051400
1441.921200
1451.787500
1461.825400
1471.363400
1481.977400
1491.768300
1502.226700
1511.945500

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import transformers\n", + "from datasets import load_dataset\n", + "\n", + "data = load_dataset(\"Abirate/english_quotes\")\n", + "data = data.map(lambda samples: tokenizer(samples[\"quote\"]), batched=True)\n", + "\n", + "trainer = transformers.Trainer(\n", + " model=model,\n", + " train_dataset=data[\"train\"],\n", + " args=transformers.TrainingArguments(\n", + " per_device_train_batch_size=4,\n", + " gradient_accumulation_steps=4,\n", + " warmup_steps=100,\n", + " max_steps=200,\n", + " learning_rate=2e-4,\n", + " fp16=True,\n", + " logging_steps=1,\n", + " output_dir=\"outputs\",\n", + " ),\n", + " data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n", + ")\n", + "model.config.use_cache = False # silence the warnings. Please re-enable for inference!\n", + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Duak7T_B3VpJ" + }, + "source": [ + "## Share adapters on the 🤗 Hub" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 331, + "referenced_widgets": [ + "262f01ffc5824b5faa8a61afac12ff67", + "c1af10b599da43a3a848f3ba816d7acc", + "be673691e713472980fa1132465714b4", + "01f8b90f0f184dfdb92c0c3bffb28b0f", + "9ad2bd0e92174d339a3a91a38253a180", + "bff17ef6aabd4aa681bfd5ad64b808d9", + "55633800b60a4336abea6a4adfcfdec1", + "73d5c6b4034d49b392b103d889bfb3b4", + "a55954b8d7bf4057b0d7aa6a1cb9e91a", + "4da42eb3846f423d88e2a6462a0cfce8", + "dab39ef354a84be3b37b6f151f9d9b9d", + "b88b03326f464c96a5656eef774e36d5", + "4eccb670e98043b3b2702821a3060ece", + "a333501a50df4b9fa9546d8d965e0dc3", + "1f173cb95c5c44f4b32f6cfe10ee3b03", + "52c8a7e673f24276a07042388a13b58f", + "a0f323ccfbc14fc4b7a5e7046b221ce3" + ] + }, + "id": "DpYr24pR8T_0", + "outputId": "20186456-1bd4-4655-b2f2-8f24f9f37fcc" + }, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "Token is valid.\n", + "Your token has been saved to /root/.cache/huggingface/token\n", + "Login successful\n" + ] + } + ], + "source": [ + "from huggingface_hub import notebook_login\n", + "\n", + "notebook_login()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 133, + "referenced_widgets": [ + "3dbe077ed0c34e4eb1628418138ccbc6", + "a579cb7804774ca3aa9efd800d1af57e", + "9f44e3175835470aba43e239661037b2", + "549bd12e3af64256a7534903688835a8", + "a18d9beb34b848ff8cc541d2cb290c4c", + "876cb42184f54b749c442163290c2c45", + "43b14e0e1263499dbd592b280cff21b0", + "7164feaf360d4300a5083f95064b144a", + "bd7547126a874a45b608afed7ab2958b", + "26e4b6b94e4540728c59df8e481ce43d", + "9d1b77500f1c45308d4791a9c443e307", + "b2693135b6954d35afb3120f3caf4000", + "aeca2429ee48450a814515cb06acbc3e", + "434c09950be04d728cd7ca8d6c134dc6", + "34f58c132d2e4249b1e62a0b57c85999", + "d148369be26a43949257790cb202728d", + "0ecf58c5cbcd40908595fccddff1c6d4", + "fc2314656a2745eb921f636cc3451381", + "7817f8669b7f449fadf02d7145fa89e2", + "06e012eea9714ed589344a362b7421bb", + "504e9e5ced0348cc87aafae0c1c372eb", + "5b538e8389fb4574a5dfdc554624e3c8" + ] + }, + "id": "VxB6UV5XAvvP", + "outputId": "c3b0133b-f5b1-4283-8367-f06524bea46c" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Uploading the following files to ybelkada/opt-6.7b-lora: adapter_config.json,adapter_model.bin\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3dbe077ed0c34e4eb1628418138ccbc6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Upload 1 LFS files: 0%| | 0/1 [00:00Pro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. 
" + } + }, + "c006e62ee6d04831b2b89273ef04a8ac": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9e5f870ef80242f5af09fad70f84ea62", + "placeholder": "​", + "style": "IPY_MODEL_6f173ca73dd545deb22b8cd0470d925f", + "value": " 685/685 [00:00<00:00, 34.4kB/s]" + } + }, + "c1af10b599da43a3a848f3ba816d7acc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_73d5c6b4034d49b392b103d889bfb3b4", + "placeholder": "​", + "style": "IPY_MODEL_a55954b8d7bf4057b0d7aa6a1cb9e91a", + "value": "


Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" + } + }, + "c1fd6a1234274a44b838a09f3f5380c6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c2302f535b114ad780bfa440445c2e28": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c2b42681b8bb47e3895d6105240c5812": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c41a9d785e884ab0a58117d17ac7d228": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c423a3cc0b504828b11077c77268ed92": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": 
null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c4ab408eb1344da0bb15a9a6760818e4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c5493c23fd5542738ffd1ff5f09a6a67": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_00c2e2d3ee8b45818ba84da12c6b11e2", + "placeholder": "​", + "style": "IPY_MODEL_6ccea64c2e614a9fbdcc2f716cecaea0", + "value": "Downloading (…)cial_tokens_map.json: 100%" + } + }, + "c57c7cf35bf04f3bb3b2b0d8ce7feb31": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c6f712eadc4d49019b2bd355968cc155": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5fd979c05fd34311af877ce1a988ead8", + "IPY_MODEL_179165ff4c0e4586aaf3a40b8502f428", + "IPY_MODEL_cad1d8326e474a4f9ee13db9005121dd" + ], + "layout": "IPY_MODEL_b049ac0b3d2e44ba9b1fc1cb2dd3de62" + } + }, + "c7621e14aa16421d9758321e433b92e4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c81d20fe47ce4b7594427830d71504d7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a14542c8431c48b48a614cfd0d41f03c", + "IPY_MODEL_856f3dcf949741acb394f252186a1d7e", + "IPY_MODEL_865bae11c917492a9a1ef7286a493bd5" + ], + "layout": "IPY_MODEL_2561b7a7c1694f229d30d2b1eeb14b2f" + } + }, + "c9b718882fec4254bce1f33fa9373921": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4884bf82f5814e049c47cf6d496aab08", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_432fc8277ebc492f91d6b46ed073ccb4", + "value": 1 + } + }, + "cad1d8326e474a4f9ee13db9005121dd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_451309d8b3ca4cd9b6f538640477039e", + "placeholder": "​", + "style": "IPY_MODEL_c7621e14aa16421d9758321e433b92e4", + "value": " 9.96G/9.96G [03:05<00:00, 63.7MB/s]" + } + }, + "cb69ae47666a4603a07a8778e2ae7d6e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cbe2a6ea41834e95a27d6f02c3c0eeec": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + 
"_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_23d2ad64a17041b7a006dad1e041e0a1", + "placeholder": "​", + "style": "IPY_MODEL_c1fd6a1234274a44b838a09f3f5380c6", + "value": "Downloading (…)olve/main/vocab.json: 100%" + } + }, + "ce6de6f9ddde4a6d8094a2b96eac3a4e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ce8d4bac782949579a2e52864455d9de": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + 
"border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cf39fb025e3d4635a5695135a56d9f64": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5b695486a0ac442b8b6a8ef2ebf4e57a", + "placeholder": "​", + "style": "IPY_MODEL_03be7cb91cba4afab795aab7aa242ee1", + "value": "Downloading (…)00002-of-00002.bin";: 100%" + } + }, + "d148369be26a43949257790cb202728d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, 
+ "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d3511ce1754b41969e5c36a5b33ac466": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d46b5725c35142a89617e46c0e8d3679": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c5493c23fd5542738ffd1ff5f09a6a67", + "IPY_MODEL_a1a80d3460984c2496ada5a634875934", + "IPY_MODEL_598c5584ffba4f26815c4e87bb1595c4" + ], + "layout": "IPY_MODEL_fe93f25323604447be0bb1d24a0c2c59" + } + }, + "d4de260ffd8a440eb87eb900fc1bb1d3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + 
"state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8602b545a9f8474dbb3cc178ac0b8e60", + "IPY_MODEL_b46919912ee54f6f9f2ce9080be1c61a", + "IPY_MODEL_50374e3ab81c4626a182e61fc03b94ce" + ], + "layout": "IPY_MODEL_2144bc2897dc40b29f060e30ace12275" + } + }, + "d4f5b19f75e246df9c688f625792e8ba": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a6ce291698ad460394433a49000c1d25", + "max": 646739, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_aa54b2a9b43848b0902d135beffb806b", + "value": 646739 + } + }, + "d54e8d69575f49eb977da64abc5ceb0c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d5bb8d7359274c8e9cc79df563175137": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d72bd17e161442b0979dceaaef66d82c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4f592d7632fd440aac0bf97ceed2de75", + "max": 33601485, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1b515483e5884479b2101127c16321d4", + "value": 33601485 + } + }, + "d7e33c29d410414eb452d121edd9920e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d9d36f8ff5f747bf90fbc8a7d35a6664": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "da946f86590447d2ab98b9da468fa66b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "dab39ef354a84be3b37b6f151f9d9b9d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dc08d237860e4788a8ceeff4518c2612": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ddc333530c13446a91cf332846bfa22f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d7e33c29d410414eb452d121edd9920e", + "placeholder": "​", + "style": "IPY_MODEL_d3511ce1754b41969e5c36a5b33ac466", + "value": " 0/0 [00:00<?, ? examples/s]" + } + }, + "ddc36fdbdd634dc489f658bead61e7ee": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "deaf9732f33446a3be015d2ec16aba76": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8f7890f54d514f80b1eb17905cf0f964", + "max": 3356360185, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ae660d51267544a89f0ad199cc12b6fa", + "value": 3356360185 + } + }, + "e1f62cbd805d4b8aa9aba7e345c21c82": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": 
null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e3df9dda16e244aa9b61d54f9e21ef40": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + 
"width": null + } + }, + "e466054f08004bbcabb24e400cb3c7fc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4ea667f48b9f4e1f9da7c5a0d3025b85", + "max": 41937, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9b96c63630654773acd38b8b88371f28", + "value": 41937 + } + }, + "e73e5388182040a8937ccf1748171a87": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f797f7b8b13f4b3cb871522d34498631", + "IPY_MODEL_41f47c864e094cacb1c550d37ddfe80e", + "IPY_MODEL_104d982b444947fe8e4fbb2c2f082616" + ], + "layout": "IPY_MODEL_2cf8581583c641fb95d1e16aff7d4cd1" + } + }, + "e9673363e85448d494ac9ac5d7ba0efb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ed2736d862a94d8f9db9ba6037016071": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "info", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3185bd8ecbde4f26b8ed0f92cf79e14f", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ddc36fdbdd634dc489f658bead61e7ee", + "value": 1 + } + }, + "ed8fa1048e814f2fa3666899fc42e55a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f00cbc4a89f5492787ec489da65ff70b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", 
+ "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f27905d0073e493cb9dcd174c0f15e35": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c423a3cc0b504828b11077c77268ed92", + "placeholder": "​", + "style": "IPY_MODEL_668dd47eec9942bcad0af209772cf8e6", + "value": " 1/1 [00:00<00:00, 9.58it/s]" + } + }, + "f357166c6e5f43f39d0a287ca6d6f60e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_805a10c2fd794ff692e8ebeefd65f2eb", + "IPY_MODEL_3730f843399d4ba48e98383563283e94", + "IPY_MODEL_f855aced7ca2485ea720604359deaa18" + ], + "layout": "IPY_MODEL_6d728366de1a4bacb1ba1939c5e0146f" + } + }, + "f51c7f18977447e2bca36e1da3e1be4f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f539b7a4665449de9eac209a20629969": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f65af2e868244edeb0cc9402534874a8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1bd5179cdb474b65aa06eca3520ad37b", + "placeholder": "​", 
+ "style": "IPY_MODEL_04d367124a3b419ab1fa1dfd4f9004c3", + "value": "Downloading (…)model.bin.index.json: 100%" + } + }, + "f797f7b8b13f4b3cb871522d34498631": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_11808e9097424dd0b22a5af6c77813f3", + "placeholder": "​", + "style": "IPY_MODEL_dc08d237860e4788a8ceeff4518c2612", + "value": "Loading checkpoint shards: 100%" + } + }, + "f855aced7ca2485ea720604359deaa18": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_49504531deaf4449938bea751d1ec4e7", + "placeholder": "​", + "style": "IPY_MODEL_7dfe540a75864cf390b3bed20ab1dcd9", + "value": " 5.55k/5.55k [00:00<00:00, 156kB/s]" + } + }, + "fad61c68edc84c8197afeafded84280d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "faf24b3ed994422f8dd806ae0cc30531": { 
+ "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_86dff0987bf040da99c8f2846da26d86", + "placeholder": "​", + "style": "IPY_MODEL_63f8ad255d2147128bdc26f47fdf2528", + "value": " 3/3 [00:02<00:00, 1.18ba/s]" + } + }, + "fc15c6d6eb3049a3b8542b332dd8a3f2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fc2314656a2745eb921f636cc3451381": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", 
+ "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fc2d5ffe254d425b939252ec46ec27cc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f65af2e868244edeb0cc9402534874a8", + "IPY_MODEL_e466054f08004bbcabb24e400cb3c7fc", + "IPY_MODEL_6ea40800dfd849e3b106bae71fc53ae3" + ], + "layout": "IPY_MODEL_722a01f42b7d4c38836a4546ecb38108" + } + }, + "fe93f25323604447be0bb1d24a0c2c59": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, 
+ "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ff2454cf69b346fea70070522cf93689": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_87b36eb4ad3b4047a32c7d67a5aabc5e", + "IPY_MODEL_8bf9fd4bd28e4bc1b5895bc9315e727e", + "IPY_MODEL_01845549768a4db580b5555809e83342" + ], + "layout": "IPY_MODEL_a490025901df478f93be1f19c1be4b09" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/peft/examples/int8_training/config.yaml b/peft/examples/int8_training/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c5d87a578372648e677ce66828e35aec5b49e9e5 --- /dev/null +++ b/peft/examples/int8_training/config.yaml @@ -0,0 +1,19 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: MULTI_XPU +downcast_bf16: 'no' +enable_cpu_affinity: false +gpu_ids: all +ipex_config: + ipex: false +machine_rank: 0 +main_training_function: main +mixed_precision: 'no' +num_machines: 1 +num_processes: 4 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/peft/examples/int8_training/fine_tune_blip2_int8.py b/peft/examples/int8_training/fine_tune_blip2_int8.py new file mode 100644 index 0000000000000000000000000000000000000000..1b67013783a0721b2bac5d53d8ba22aec82043c4 --- /dev/null +++ b/peft/examples/int8_training/fine_tune_blip2_int8.py @@ -0,0 +1,104 @@ 
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fine-tune an 8-bit BLIP-2 checkpoint with a LoRA adapter on an image-captioning dataset.

The script loads ``Salesforce/blip2-opt-2.7b`` with a bitsandbytes 8-bit quantization config,
wraps it with a LoRA adapter, and trains it on ``ybelkada/football-dataset``, printing the loss
at every step and sample generations every 10 steps.
"""
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig

from peft import LoraConfig, get_peft_model


# Let's define the LoraConfig
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# We load our model and processor using `transformers`
model = AutoModelForVision2Seq.from_pretrained(
    "Salesforce/blip2-opt-2.7b", quantization_config=BitsAndBytesConfig(load_in_8bit=True)
)
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")

# Get our peft model and print the number of trainable parameters
model = get_peft_model(model, config)
model.print_trainable_parameters()

# Let's load the dataset here!
dataset = load_dataset("ybelkada/football-dataset", split="train")


class ImageCaptioningDataset(Dataset):
    """Map-style dataset that pairs processed image tensors with their raw caption text.

    Args:
        dataset: Indexable collection whose items expose an ``"image"`` and a ``"text"`` entry.
        processor: Callable processor used to turn an image into model-ready tensors.
    """

    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        """Return the number of examples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the processed image tensors for example ``idx`` plus its caption under ``"text"``."""
        item = self.dataset[idx]
        encoding = self.processor(images=item["image"], padding="max_length", return_tensors="pt")
        # Remove only the leading batch dimension added by `return_tensors="pt"`; a bare
        # `squeeze()` would also drop any other dimension that happens to have size 1.
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        # The caption stays as plain text here; it is tokenized per batch in `collator`.
        encoding["text"] = item["text"]
        return encoding


def collator(batch):
    """Collate examples from ``ImageCaptioningDataset`` into one batch of tensors.

    Every non-text field is stacked along a new leading dimension. The captions are tokenized
    together with padding, producing the ``"input_ids"`` and ``"attention_mask"`` entries.

    Args:
        batch: List of dicts as returned by ``ImageCaptioningDataset.__getitem__``.

    Returns:
        Dict with the stacked tensor fields plus ``"input_ids"`` and ``"attention_mask"``.
    """
    # pad the input_ids and attention_mask
    processed_batch = {}
    for key in batch[0].keys():
        if key != "text":
            processed_batch[key] = torch.stack([example[key] for example in batch])
        else:
            text_inputs = processor.tokenizer(
                [example["text"] for example in batch], padding=True, return_tensors="pt"
            )
            processed_batch["input_ids"] = text_inputs["input_ids"]
            processed_batch["attention_mask"] = text_inputs["attention_mask"]
    return processed_batch


train_dataset = ImageCaptioningDataset(dataset, processor)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=2, collate_fn=collator)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# `torch.accelerator` only exists in recent PyTorch releases, and `current_accelerator()` may
# return None when no accelerator is available; keep "cuda" as the fallback in both cases.
accelerator = torch.accelerator.current_accelerator() if hasattr(torch, "accelerator") else None
device = accelerator.type if accelerator is not None else "cuda"

model.train()

for epoch in range(50):
    print("Epoch:", epoch)
    for idx, batch in enumerate(train_dataloader):
        input_ids = batch.pop("input_ids").to(device)
        attention_mask = batch.pop("attention_mask").to(device)
        pixel_values = batch.pop("pixel_values").to(device, torch.float16)

        # Captions in a batch are padded to a common length. Padded positions must neither be
        # attended to nor scored, so pass the mask to the model and set their labels to -100
        # (the default `ignore_index` of the cross-entropy loss).
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            labels=labels,
        )

        loss = outputs.loss

        print("Loss:", loss.item())

        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        # Periodically print captions generated for the current images as a qualitative check.
        if idx % 10 == 0:
            generated_output = model.generate(pixel_values=pixel_values)
            print(processor.batch_decode(generated_output, skip_special_tokens=True))
a/peft/examples/int8_training/peft_adalora_whisper_large_training.py b/peft/examples/int8_training/peft_adalora_whisper_large_training.py new file mode 100644 index 0000000000000000000000000000000000000000..3e2b07d7b5a03aef58ef2b2a37336ff54329f5b5 --- /dev/null +++ b/peft/examples/int8_training/peft_adalora_whisper_large_training.py @@ -0,0 +1,817 @@ +import argparse +import gc +import json +import logging +import math +import os +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from random import randint +from typing import Any, Union + +# datasets imports +import datasets + +# metric imports +import evaluate +import numpy as np +import torch +import transformers +import wandb + +# accelerate imports +from accelerate import Accelerator, dispatch_model +from accelerate.logging import get_logger +from datasets import Audio, DatasetDict, IterableDatasetDict, interleave_datasets, load_dataset + +# hf imports +from huggingface_hub import HfApi +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import ( + BitsAndBytesConfig, + SchedulerType, + WhisperForConditionalGeneration, + WhisperProcessor, + get_scheduler, + set_seed, +) +from transformers.models.whisper.english_normalizer import BasicTextNormalizer + +# peft imports +from peft import AdaLoraConfig, LoraConfig, PeftModel, get_peft_model + + +logger = get_logger(__name__, log_level="INFO") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Whisper Fine-Tuning with AdaLora") + parser.add_argument( + "--model_name_or_path", + type=str, + help="Path to pretrained model or model identifier from huggingface.co/models.", + required=True, + ) + parser.add_argument("--language", type=str, help="Language to use for training; e.g., 'Hindi' ", required=True) + parser.add_argument("--language_abbr", type=str, help="Language to use for training; e.g., 'hi' ", required=True) + parser.add_argument( + "--task", type=str, 
default="transcribe", help="Task to use for training; e.g., 'transcribe' ", required=False + ) + parser.add_argument( + "--dataset_name", + type=str, + default="mozilla-foundation/common_voice_11_0", + help="Dataset to use for training; e.g., 'whisper' ", + required=False, + ) + parser.add_argument( + "--dataset_in_streaming_mode", + action="store_true", + help="Whether to use streaming mode for the dataset.", + ) + parser.add_argument( + "--do_lower_case", action="store_true", help="lowercase the transcribed text before tokenizing" + ) + parser.add_argument( + "--do_remove_punctuation", action="store_true", help="remove punctuation from the transcribed text" + ) + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument( + "--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument("--max_audio_input_length", type=float, default=30.0, help="Maximum audio length in seconds.") + parser.add_argument( + "--preprocessing_num_workers", + type=int, + default=None, + help="The number of processes to use for the preprocessing.", + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=8, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--buffer_size", + type=int, + default=5000, + help="Number of samples to prefetch in the streaming mode.", + ) + parser.add_argument( + "--dataloader_pin_memory", + action="store_true", + help="Whether or not to pin memory for the DataLoader.", + ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=0, + help="Number of subprocesses to use for data loading.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial 
learning rate (after the potential warmup period) to use.", + ) + parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") + parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], + ) + parser.add_argument( + "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--load_best_model", + action="store_true", + help="Whether to load the best model at the end of training", + ) + parser.add_argument( + "--with_tracking", + action="store_true", + help="Whether to enable experiment trackers for logging.", + ) + parser.add_argument( + "--report_to", + type=str, + default="all", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' + ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.' + "Only applicable when `--with_tracking` is passed." 
+ ), + ) + parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--logging_steps", + type=int, + default=100, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--evaluation_steps", + type=int, + default=500, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + + # lora/adalora specific args + parser.add_argument( + "--use_peft", + action="store_true", + help="Whether to use PEFT", + ) + parser.add_argument( + "--use_adalora", + action="store_true", + help="Whether to use AdaLoRA or LoRA. 
If set, uses AdaLoRA instead of the default LoRA.", + ) + parser.add_argument( + "--init_r", + type=int, + default=12, + help="Initial AdaLoRA rank", + ) + parser.add_argument( + "--target_r", + type=int, + default=4, + help="Target AdaLoRA rank", + ) + parser.add_argument( + "--tinit", + type=int, + default=200, + help="number of warmup steps for AdaLoRA wherein no pruning is performed", + ) + parser.add_argument( + "--tfinal", + type=int, + default=1000, + help=" fix the resulting budget distribution and fine-tune the model for tfinal steps when using AdaLoRA ", + ) + parser.add_argument( + "--delta_t", + type=int, + default=10, + help="interval of steps for AdaLoRA to update rank", + ) + parser.add_argument( + "--lora_alpha", + type=int, + default=32, + help="LORA alpha", + ) + parser.add_argument( + "--r", + type=int, + default=8, + help="LORA rank", + ) + parser.add_argument( + "--lora_dropout", + type=float, + default=0.1, + help="LORA dropout", + ) + parser.add_argument( + "--orth_reg_weight", + type=float, + default=0.5, + help="Orthogonal regularization weight", + ) + parser.add_argument( + "--debug_mode", + action="store_true", + help="Whether to use debug mode", + ) + + args = parser.parse_args() + + if args.push_to_hub: + assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed." 
def load_streaming_dataset(dataset_name, dataset_config_name, split, **kwargs):
    """Load one or several splits of a dataset in streaming mode.

    Args:
        dataset_name: Dataset identifier forwarded to ``load_dataset``.
        dataset_config_name: Dataset configuration forwarded to ``load_dataset``.
        split: A single split name, or several names joined with ``+``
            (for example ``"train+validation"``).
        **kwargs: Extra keyword arguments forwarded to every ``load_dataset`` call.

    Returns:
        The streamed split, or the interleaving of all requested splits when
        ``split`` contains ``+``.
    """
    # Every `+`-separated name is streamed on its own; a name without `+`
    # produces exactly one stream.
    streams = [
        load_dataset(dataset_name, dataset_config_name, split=split_name, streaming=True, **kwargs)
        for split_name in split.split("+")
    ]
    if len(streams) == 1:
        return streams[0]
    # interleave multiple splits to form one dataset
    return interleave_datasets(streams)


def prepare_dataset_wrapper(do_lower_case, do_remove_punctuation, processor, normalizer):
    """Build the per-example preprocessing function passed to ``Dataset.map``.

    Args:
        do_lower_case: Lower-case the transcription before tokenizing.
        do_remove_punctuation: Run ``normalizer`` on the transcription and strip it.
        processor: Object exposing ``feature_extractor`` and ``tokenizer`` callables.
        normalizer: Callable mapping a string to its normalized form.

    Returns:
        A function that takes one example with ``"audio"`` and ``"sentence"``
        entries, adds ``"input_features"``, ``"input_length"`` and ``"labels"``
        to it, and returns it.
    """

    def prepare_dataset(batch):
        audio = batch["audio"]

        # compute log-Mel input features from input audio array
        batch["input_features"] = processor.feature_extractor(
            audio["array"], sampling_rate=audio["sampling_rate"]
        ).input_features[0]
        # compute input length of audio sample in seconds
        batch["input_length"] = len(audio["array"]) / audio["sampling_rate"]

        # optional pre-processing steps
        transcription = batch["sentence"]
        if do_lower_case:
            transcription = transcription.lower()
        if do_remove_punctuation:
            transcription = normalizer(transcription).strip()

        # encode target text to label ids
        batch["labels"] = processor.tokenizer(transcription).input_ids
        return batch

    return prepare_dataset


def save_model_hook(models, weights, output_dir):
    """Save-state pre-hook: write every model with ``save_pretrained``.

    Args:
        models: Models handed to the hook.
        weights: List of weights handed to the hook; one entry is popped per
            saved model so the corresponding model is not saved again.
        output_dir: Directory passed to ``save_pretrained``.
    """
    for model in models:
        model.save_pretrained(output_dir)
        # make sure to pop weight so that corresponding model is not saved again;
        # the list may already be empty, in which case there is nothing to pop
        if weights:
            weights.pop()


def load_model_hook(models, input_dir):
    """Load-state pre-hook: reload the adapter weights stored in ``input_dir``.

    Args:
        models: Models handed to the hook; the list is emptied so that the
            models are not loaded again.
        input_dir: Directory containing the saved adapter.
    """
    while models:
        model = models.pop()
        # NOTE(review): the returned PeftModel is discarded; presumably loading
        # updates the wrapped base model in place - confirm.
        PeftModel.from_pretrained(model.base_model.model, input_dir)
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Pad audio features and label ids separately and merge them into one batch."""

    # Object exposing `feature_extractor.pad`, `tokenizer.pad` and `tokenizer.bos_token_id`.
    processor: Any

    def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
        """Collate ``features`` into a padded batch.

        Args:
            features: Examples carrying ``"input_features"`` and ``"labels"``.

        Returns:
            The padded feature batch with an added ``"labels"`` tensor in which
            padded positions are set to -100.
        """
        tokenizer = self.processor.tokenizer

        # Inputs and labels have different lengths and need different padding
        # methods, so they are padded independently.
        audio_inputs = [{"input_features": example["input_features"]} for example in features]
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        label_inputs = [{"input_ids": example["labels"]} for example in features]
        labels_batch = tokenizer.pad(label_inputs, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        padding_mask = labels_batch.attention_mask.ne(1)
        labels = labels_batch["input_ids"].masked_fill(padding_mask, -100)

        # When every sequence already starts with the bos token, drop it here
        # because it is appended again later anyway.
        all_start_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()
        if all_start_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch


def get_audio_length_processor(max_input_length):
    """Return a predicate that accepts lengths strictly below ``max_input_length``."""

    def is_audio_in_length_range(length):
        too_long = length >= max_input_length
        return not too_long

    return is_audio_in_length_range
def evaluation_loop(model, eval_dataloader, processor, normalizer, metric, forced_decoder_ids, accelerator):
    """Generate transcriptions for the evaluation set and score them.

    Args:
        model: Model exposing ``eval()`` and ``generate()``.
        eval_dataloader: Iterable of batches with ``"input_features"`` and ``"labels"``.
        processor: Object whose ``tokenizer`` provides ``batch_decode`` and ``pad_token_id``.
        normalizer: Callable mapping a string to its normalized form.
        metric: Object whose ``compute(predictions=..., references=...)`` returns a number.
        forced_decoder_ids: Forwarded unchanged to ``model.generate``.
        accelerator: Accelerator queried for a ``"wandb"`` tracker.

    Returns:
        A dict with ``"eval/wer"`` and ``"eval/normalized_wer"`` (both scaled
        by 100) and, when the tracker lookup is truthy, an ``"eval_samples"``
        ``wandb.Table`` with up to 256 sampled rows.
    """
    # Function-scope import: the sampling below needs `random.sample`.
    import random

    model.eval()
    predictions = []
    references = []
    normalized_predictions = []
    normalized_references = []

    if hasattr(torch, "accelerator"):
        # `current_accelerator()` returns None when no accelerator is present;
        # fall back to CPU autocast instead of failing on `None.type`.
        current = torch.accelerator.current_accelerator()
        device_type = current.type if current is not None else "cpu"
    else:
        device_type = "cuda"

    tokenizer = processor.tokenizer
    for batch in tqdm(eval_dataloader):
        with torch.amp.autocast(device_type=device_type), torch.no_grad():
            generated_tokens = (
                model.generate(
                    input_features=batch["input_features"],
                    forced_decoder_ids=forced_decoder_ids,
                    max_new_tokens=255,
                )
                .cpu()
                .numpy()
            )
            labels = batch["labels"].cpu().numpy()
            # -100 marks ignored positions; swap it for the pad id so decoding works
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            predictions.extend(decoded_preds)
            references.extend(decoded_labels)
            normalized_predictions.extend(normalizer(pred).strip() for pred in decoded_preds)
            normalized_references.extend(normalizer(label).strip() for label in decoded_labels)
        # free per-batch arrays eagerly before the next generation step
        del generated_tokens, labels, batch
        gc.collect()

    wer = 100 * metric.compute(predictions=predictions, references=references)
    normalized_wer = 100 * metric.compute(predictions=normalized_predictions, references=normalized_references)
    eval_metrics = {"eval/wer": wer, "eval/normalized_wer": normalized_wer}

    # NOTE(review): this relies on `get_tracker` returning something falsy when
    # no wandb tracker is configured - confirm against the installed accelerate.
    if accelerator.get_tracker("wandb"):
        sample_size = min(len(predictions), 256)
        # Sample without replacement so the table never repeats an example.
        ids = random.sample(range(len(predictions)), sample_size)
        table_rows = [
            [predictions[i], references[i], normalized_predictions[i], normalized_references[i]] for i in ids
        ]
        eval_metrics["eval_samples"] = wandb.Table(
            columns=["predictions", "references", "normalized_predictions", "normalized_references"],
            rows=table_rows,
        )
    return eval_metrics
def _unwrap_ddp(model):
    """Return ``model.module`` when the model is wrapped (e.g. by DDP), else ``model``."""
    return model.module if hasattr(model, "module") else model


def _ensure_gitignore_patterns(output_dir, patterns):
    """Make sure ``<output_dir>/.gitignore`` lists every entry of ``patterns``.

    The directory is created when missing. Existing entries are preserved and
    only the missing patterns are appended.
    """
    os.makedirs(output_dir, exist_ok=True)
    gitignore_path = os.path.join(output_dir, ".gitignore")
    try:
        with open(gitignore_path, encoding="utf-8") as handle:
            content = handle.read()
    except FileNotFoundError:
        content = ""
    existing = {line.strip() for line in content.splitlines()}
    missing = [pattern for pattern in patterns if pattern not in existing]
    if not missing:
        return
    with open(gitignore_path, "a", encoding="utf-8") as handle:
        if content and not content.endswith("\n"):
            handle.write("\n")
        handle.writelines(f"{pattern}\n" for pattern in missing)


def main():
    """Fine-tune a Whisper model loaded in 8-bit, optionally with LoRA/AdaLoRA adapters.

    Steps: parse arguments, set up the ``Accelerator`` and logging, optionally
    create the Hub repo, load and preprocess the dataset, build the model,
    optimizer and scheduler, run the training loop with periodic
    checkpointing/logging/evaluation, then save the model, the tokenizer and
    ``all_results.json`` to ``args.output_dir``.
    """
    args = parse_args()

    accelerator_kwargs = {"gradient_accumulation_steps": args.gradient_accumulation_steps}
    if args.with_tracking:
        accelerator_kwargs["log_with"] = args.report_to
        accelerator_kwargs["project_dir"] = args.output_dir
    accelerator = Accelerator(**accelerator_kwargs)

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state, main_process_only=False)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
        if args.push_to_hub:
            api = HfApi(token=args.hub_token)

            # Create repo (repo_name from args or inferred)
            repo_name = args.hub_model_id
            if repo_name is None:
                repo_name = Path(args.output_dir).absolute().name
            repo_id = api.create_repo(repo_name, exist_ok=True).repo_id

            # Keep intermediate checkpoints out of the uploaded folder.
            _ensure_gitignore_patterns(args.output_dir, ("step_*", "epoch_*"))
    accelerator.wait_for_everyone()

    # load dataset either in streaming mode or not
    processor = WhisperProcessor.from_pretrained(args.model_name_or_path, language=args.language, task=args.task)
    normalizer = BasicTextNormalizer()
    prepare_dataset = prepare_dataset_wrapper(args.do_lower_case, args.do_remove_punctuation, processor, normalizer)
    is_audio_in_length_range = get_audio_length_processor(args.max_audio_input_length)
    data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

    if args.dataset_in_streaming_mode:
        raw_datasets = IterableDatasetDict()
        loading_method = load_streaming_dataset
    else:
        raw_datasets = DatasetDict()
        loading_method = load_dataset

    if args.debug_mode:
        train_split = "train[:100]"
        test_split = "test[:10]"
    else:
        train_split = "train+validation"
        test_split = "test"

    raw_datasets["train"] = loading_method(args.dataset_name, args.language_abbr, split=train_split)
    raw_datasets["test"] = loading_method(args.dataset_name, args.language_abbr, split=test_split)
    raw_datasets = raw_datasets.cast_column("audio", Audio(sampling_rate=16000))

    logger.info("Dataset loaded: %s", raw_datasets)
    logger.info(f"{raw_datasets['train'][0]}")

    vectorized_datasets = raw_datasets.map(
        prepare_dataset,
        remove_columns=list(next(iter(raw_datasets.values())).features),
        num_proc=args.preprocessing_num_workers,
    ).with_format("torch")

    if args.dataset_in_streaming_mode:
        vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(
            buffer_size=args.buffer_size,
            seed=args.seed,
        )

    # filter out audio files that are too long from the training set
    vectorized_datasets["train"] = vectorized_datasets["train"].filter(
        is_audio_in_length_range, input_columns=["input_length"]
    )

    # get dataloaders
    train_dataloader = DataLoader(
        vectorized_datasets["train"],
        batch_size=args.per_device_train_batch_size,
        shuffle=True,
        collate_fn=data_collator,
        num_workers=args.dataloader_num_workers,
        pin_memory=args.dataloader_pin_memory,
    )
    eval_dataloader = DataLoader(
        vectorized_datasets["test"],
        batch_size=args.per_device_eval_batch_size,
        collate_fn=data_collator,
        num_workers=args.dataloader_num_workers,
        pin_memory=args.dataloader_pin_memory,
    )

    # metric
    metric = evaluate.load("wer")

    # model
    model = WhisperForConditionalGeneration.from_pretrained(
        args.model_name_or_path, quantization_config=BitsAndBytesConfig(load_in_8bit=True)
    )
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []
    if hasattr(model, "hf_device_map") and len(set(model.hf_device_map.values()).intersection({"cpu", "disk"})) > 0:
        raise ValueError("Training on CPU or disk is not supported.")
    if hasattr(model, "hf_device_map") and len(set(model.hf_device_map.values())) > 1:
        device_map = model.hf_device_map.copy()
        # required because `labels` are on main execution device (0) while the output of `proj_out` is on other device.
        # So, this leads to device mismatch error when calculation cross-entropy between logits and labels.
        # Won't arise during inference as `labels` aren't supplied during that time
        # instead of changing device of one of the tied modules, I have to do this for all tied modules
        # else the execution device of remaining tied modules isn't changed
        device_map["model.decoder.embed_tokens"] = model._hf_hook.execution_device
        device_map["model.decoder.embed_positions"] = model._hf_hook.execution_device
        device_map["proj_out"] = model._hf_hook.execution_device
        dispatch_model(model, device_map=device_map)

    # Resolve the step/epoch budget once, before anything needs it. Computing
    # `num_update_steps_per_epoch` unconditionally keeps it defined when
    # `--max_train_steps` is passed explicitly.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # preparing peft model
    if args.use_peft:
        from peft import prepare_model_for_kbit_training

        model = prepare_model_for_kbit_training(model)

        # as Whisper model uses Conv layer in encoder, checkpointing disables grad computation
        # to avoid this, make the inputs trainable
        def make_inputs_require_grad(module, input, output):
            output.requires_grad_(True)

        model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad)

        # wrapping model with adalora tuner
        if args.use_adalora:
            config = AdaLoraConfig(
                init_r=args.init_r,
                target_r=args.target_r,
                beta1=0.85,
                beta2=0.85,
                tinit=args.tinit,
                tfinal=args.tfinal,
                deltaT=args.delta_t,
                lora_alpha=args.lora_alpha,
                lora_dropout=args.lora_dropout,
                target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"],
                orth_reg_weight=args.orth_reg_weight,
                total_step=args.max_train_steps,
            )
        else:
            config = LoraConfig(
                r=args.r,
                lora_alpha=args.lora_alpha,
                target_modules=["q_proj", "v_proj"],
                lora_dropout=args.lora_dropout,
            )

        model = get_peft_model(model, config)
        model.print_trainable_parameters()

    # optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)

    # scheduler
    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    accelerator.print(model)

    # Note here that the max steps is adjusted by the accelerator's num_processes
    args.max_train_steps = math.ceil(args.max_train_steps / accelerator.num_processes)
    if args.use_peft and args.use_adalora:
        # Update the total_step in the config to reflect the adjusted max_train_steps
        _unwrap_ddp(model).base_model.peft_config["default"].total_step = args.max_train_steps

    # We need to initialize the trackers we use, and also store our configuration.
    # The trackers initializes automatically on the main process.
    if args.with_tracking:
        run_name = f"run-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
        # Work on a copy so that `args` itself keeps the enum value.
        experiment_config = dict(vars(args))
        # TensorBoard cannot log Enums, need the raw value
        experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
        accelerator.init_trackers(
            "Whisper PEFT Fine-Tuning", config=experiment_config, init_kwargs={"wandb": {"name": run_name}}
        )

    # saving and loading checkpoints for resuming training
    accelerator.register_save_state_pre_hook(save_model_hook)
    accelerator.register_load_state_pre_hook(load_model_hook)

    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
    logger.info("***** Running training *****")
    logger.info(f" Num Epochs = {args.num_train_epochs}")
    logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    global_step = 0
    starting_epoch = 0
    best_metric = None
    resume_step = 0
    # Stays empty when no evaluation ever runs, so the final dump cannot hit an unbound name.
    eval_metrics = {}
    forced_decoder_ids = processor.get_decoder_prompt_ids(language=args.language, task=args.task)

    def evaluate_and_track(best_so_far):
        """Run one evaluation, log it, and save ``best_checkpoint`` on a new best WER."""
        metrics = evaluation_loop(
            model, eval_dataloader, processor, normalizer, metric, forced_decoder_ids, accelerator
        )
        if args.with_tracking:
            logger.info(f"Step {global_step} eval metrics: {metrics}")
            accelerator.log(metrics, step=global_step)
        if best_so_far is None or metrics["eval/wer"] < best_so_far:
            best_so_far = metrics["eval/wer"]
            accelerator.save_state(os.path.join(args.output_dir, "best_checkpoint"))
        return metrics, best_so_far

    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        accelerator.load_state(args.resume_from_checkpoint)
        # normpath drops a trailing separator, which would make basename empty
        path = os.path.basename(os.path.normpath(args.resume_from_checkpoint))
        training_difference = os.path.splitext(path)[0]
        global_step = resume_step = int(training_difference.replace("step_", ""))
        starting_epoch = resume_step // len(train_dataloader)
        resume_step -= starting_epoch * len(train_dataloader)

    # We need to adjust the progress bar to the current step
    progress_bar.update(global_step)
    for epoch in range(starting_epoch, args.num_train_epochs):
        model.train()
        if args.with_tracking:
            total_loss = 0
            running_loss = 0
        batches_seen = 0
        active_dataloader = accelerator.skip_first_batches(train_dataloader, num_batches=resume_step)
        # Only the first epoch after a resume may skip batches.
        resume_step = 0
        for batches_seen, batch in enumerate(active_dataloader, start=1):
            with accelerator.accumulate(model):
                outputs = model(**batch)
                loss = outputs.loss
                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()

                # Update the importance of low-rank matrices
                # and allocate the budget accordingly.
                # This is only needed for AdaLora.
                # Note that this requires parameter gradients.
                # Hence being called before optimizer.zero_grad().
                if args.use_peft and args.use_adalora:
                    peft_model = _unwrap_ddp(model)
                    # NOTE(review): `rank_pattern` presumably starts as None and is only
                    # filled in by `update_and_allocate` itself; if so this gate never
                    # opens - confirm against the installed peft version.
                    if (
                        hasattr(peft_model, "peft_config")
                        and peft_model.peft_config["default"].rank_pattern is not None
                        and global_step >= args.tinit  # Only start updating after tinit steps
                    ):
                        peft_model.update_and_allocate(global_step)

                optimizer.zero_grad()
                global_step += 1
                progress_bar.update(1)

            if args.with_tracking:
                step_loss = accelerator.reduce(loss.detach().clone()).item()
                total_loss += step_loss
                running_loss += step_loss

            if global_step % args.checkpointing_steps == 0:
                output_dir = os.path.join(args.output_dir, f"step_{global_step}")
                accelerator.save_state(output_dir)

            if global_step % args.logging_steps == 0:
                if args.with_tracking:
                    accelerator.log({"train/running_loss": running_loss / args.logging_steps}, step=global_step)
                    running_loss = 0

            if global_step % args.evaluation_steps == 0:
                eval_metrics, best_metric = evaluate_and_track(best_metric)
                model.train()

            if global_step >= args.max_train_steps:
                break

        if args.with_tracking:
            # `batches_seen` equals the former `step + 1`; max() guards an empty epoch
            train_epoch_loss = total_loss / max(batches_seen, 1)
            logger.info(f"Epoch {epoch} train loss: {train_epoch_loss}")
            accelerator.log({"epoch/train_loss": train_epoch_loss}, step=epoch)

        if args.push_to_hub:
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(args.output_dir, is_main_process=accelerator.is_main_process)
            # evaluate the model at the end of the epoch
            eval_metrics, best_metric = evaluate_and_track(best_metric)

            if accelerator.is_main_process:
                processor.tokenizer.save_pretrained(args.output_dir)
                api.upload_folder(
                    repo_id=repo_id,
                    folder_path=args.output_dir,
                    commit_message=f"Training in progress epoch {epoch}",
                    run_as_future=True,
                )

        # The inner `break` only leaves the batch loop; stop the epoch loop too.
        if global_step >= args.max_train_steps:
            break

    if args.load_best_model:
        # load the best model
        accelerator.load_state(os.path.join(args.output_dir, "best_checkpoint"))
        peft_model = _unwrap_ddp(model)

        # Only resize if rank_pattern exists
        if hasattr(peft_model, "peft_config") and peft_model.peft_config["default"].rank_pattern is not None:
            peft_model.resize_modules_by_rank_pattern(peft_model.peft_config["default"].rank_pattern, "default")

        eval_metrics = evaluation_loop(
            model, eval_dataloader, processor, normalizer, metric, forced_decoder_ids, accelerator
        )
        if args.with_tracking:
            best_metrics = {"best_" + k: v for k, v in eval_metrics.items()}
            accelerator.log(best_metrics, step=global_step)

    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(args.output_dir, is_main_process=accelerator.is_main_process)
    if accelerator.is_main_process:
        processor.tokenizer.save_pretrained(args.output_dir)
        if args.push_to_hub:
            api.upload_folder(
                repo_id=repo_id,
                folder_path=args.output_dir,
                commit_message="End of training",
            )

        with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
            # The sample table is not JSON data and is only present when it was attached.
            eval_metrics.pop("eval_samples", None)
            json.dump(eval_metrics, f)


if __name__ == "__main__":
    main()
This notebook adapts most of [fine_tune_whisper.ipynb](https://colab.research.google.com/github/sanchit-gandhi/notebooks/blob/main/fine_tune_whisper.ipynb#scrollTo=BRdrdFIeU78w) to train using PEFT LoRA + BNB INT8.\n", + "\n", + "For more details on the model, datasets and metrics, refer to the blog post [Fine-Tune Whisper For Multilingual ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-whisper)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "625e47a0", + "metadata": {}, + "source": [ + "## Initial Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eJrPyQM5Xhv5", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eJrPyQM5Xhv5", + "outputId": "cfd6d8c9-964c-492b-b641-8e80e337f783" + }, + "outputs": [], + "source": [ + "!add-apt-repository -y ppa:jonathonf/ffmpeg-4\n", + "!apt update\n", + "!apt install -y ffmpeg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "r_Ivl7qlX0dz", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "r_Ivl7qlX0dz", + "outputId": "2caa9eed-f01a-4603-a527-fe3b0b58b6e2" + }, + "outputs": [], + "source": [ + "!pip install datasets==3.6.0\n", + "!pip install git+https://github.com/huggingface/transformers\n", + "!pip install librosa\n", + "!pip install evaluate>=0.30\n", + "!pip install jiwer\n", + "!pip install gradio\n", + "!pip install -q datasets accelerate\n", + "!pip install -q git+https://github.com/bitsandbytes-foundation/bitsandbytes.git\n", + "!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git@main" + ] + }, + { + "cell_type": "markdown", + "id": "8a528c1a", + "metadata": {}, + "source": [ + "Linking the notebook to the Hub is straightforward - it simply requires entering your Hub authentication token when prompted. 
Find your Hub authentication token [here](https://huggingface.co/settings/tokens):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed0OpduhX2JF", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 303, + "referenced_widgets": [ + "c60690c2aee74763bf23115553f4e640", + "7d3d6c198e794219ab5db59f0228c8ab", + "5d5ea0207c6148769ad9f15b7b3dd92d", + "642e28d258ca4c30a5df94c5cf7e0471", + "170ee581427d4f30925dc393d124c1be", + "d5d5aa24182a4e04b3fdae1ca7fad52a", + "6dba643113a547ac9b6e121d008791d6", + "cd9bda1053a14890ad9091c63c0a0acf", + "4522666ebbcf4647b06ce81a6316fbbb", + "989b3df296504f34a62d31ca0d6d88bb", + "e8a7a34c6fb146f0b38a40f389a617fa", + "960553d142c446cd8852523887a5cc04", + "441820fb176048109e0f8f7e9519d735", + "0bb38c654e18429a8396466ebab84504", + "1acfc4a2809e41dd995817c3526650dd", + "6d5801774beb4b529b227ef2f098614e", + "dfdf23cde48c421caebb573060641d6a" + ] + }, + "id": "ed0OpduhX2JF", + "outputId": "ecc2048a-b46a-4b20-b94a-5912924feb3d" + }, + "outputs": [], + "source": [ + "from huggingface_hub import notebook_login\n", + "\n", + "notebook_login()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1da5fff", + "metadata": { + "id": "e1da5fff" + }, + "outputs": [], + "source": [ + "# Select CUDA device index\n", + "import os\n", + "\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", + "model_name_or_path = \"openai/whisper-large-v2\"\n", + "language = \"Marathi\"\n", + "language_abbr = \"mr\"\n", + "task = \"transcribe\"\n", + "dataset_name = \"mozilla-foundation/common_voice_11_0\"" + ] + }, + { + "cell_type": "markdown", + "id": "805b1c56", + "metadata": {}, + "source": [ + "## Load Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2787582-554f-44ce-9f38-4180a5ed6b44", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "020176f5aa0a4d489d022ef5e41ef3f6", + 
"cbbba08d9e634560a6c0429e32166fe5", + "6d9d1609edc8471dafe653e4da32eeeb", + "435b5708e6cf487ebb1799c7b64a8218", + "931b96b21b39482298f40449424fdd34", + "79da882ea494477a8c94fdac7acd644e", + "f14847261aa247fb9561373e0495f3e5", + "ce8f306e745d4b158c58058d471de037", + "0bd122731d674cdab803a9981eb28237", + "46211764bc6641b4b693c16c90747021", + "f1884e5a392941bfa8c484496d87084c", + "6b1cdae6f5e34d1d9a0b5e7adf5183db", + "4d29aa0885214a2aa79701cc226edaef", + "5411f018d4464e839f2fb21ad3f026aa", + "5fb8e97b25b44e0dae9715eeb97acc5d", + "783d4d18627646648f8c120b728babe1", + "1a721ed289684104b0fbdc8147c2311b", + "4880366554ad4f6687a25b5a17877dcf", + "283f5528547b457698ced126c25c2a44", + "7c90db94a11e4a5aa432e664b2af4e7e", + "7f6a466819bb45e880eb989f388a7523", + "d96245da43944c4b8235e0cd02c1aa4c", + "41187d9a120448fab6c8608f226971fa", + "1d4bd11921d145c7bfd1a8ce0700a666", + "2853a43036244f54b74db99311bc580d", + "d9a85d7c76b54199bbf7646448e3458c", + "b09d153958cf4a28baad268bbda78236", + "003227997471488dae9ae26dcbff89ea", + "deb18822d58b4b60bb75460f0a5fe921", + "e6cf97ef7bc541d0b9c6de206a3a45b0", + "a4b16b5279504dd694090798f5925d65", + "564cd321c06440e9856f10f5c40c20be", + "8a91574c4b6e4745b2b65885323b4d25", + "9202065d8e6f425d88e4514dde70992d", + "f7b4ec74e2ac45bbbf0265d0363f4d9f", + "3a083523ae604362901e4e31c39fe949", + "d1bc2ce48c3e481b9059e882b5102946", + "30c688df949042ce89337643f6178230", + "2bc85a5bde9a454990d3bb7de5e3c7c1", + "2ab45b22ce3f400a81cc451b9d7c9eb8", + "cdd08679c28642a184805912c07b324e", + "90744e5529c04f18b11e00326649abe5", + "11b9c50e720a466aa92c64b254d40778", + "8f856d6bde4041149324e1f53d19c1cb", + "49545bfc91c848b8a465e13d0b45fa34", + "45a85ceb24ea444ab18a6985884be966", + "ac9c1141ca7c453f84a8ab62b2de9158", + "bd4e3eb14252470d9c8ac60a32948a72", + "75dfc66dab48405b85073317c8dff155", + "b16e3a75acd0403f865849f1de6ca654", + "78fb0d05929849ad9aa7a7ded63c7b6b", + "3fd0fd7cbbef4785b360891c12017f48", + "a9c132959dae4303bcf6015106ce3453", + 
"2ef66808d51d440b9998064e212df420", + "45478f2ce991441c8ceaef9acf745084", + "c1f019686c564cca87c240a75ab71ad3", + "54589117bf244027ba024ea85bd1fd77", + "7b7986ad93f64956b8d198d2cb4acb60", + "dbc1a016a69b4ad7811d2701a9520a2f", + "72f8e7de2a5d4155a9e9146f12e14b19", + "46e44f07b96d4c32bbe75bb89159f093", + "90b53e96b3f04c2993969b17547ae0d5", + "8843f70cfa1f45299a588e96e9c1159a", + "8b2e2c650f4b4bee9e763b7c59525ec8", + "fb9f013a6188463fad6db70702576c37", + "93e2efbb5da747d4b94c916153ee9706", + "c44f472624d84e16a0f380580d36ca61", + "65188c1fdba2421ba85c2cc349709600", + "0683cbffd97a4b75bd5e00a05d541fef", + "4573606592ce4b2a914ed0a70b69f9af", + "efb85d003fb54f55aa4eadf2ab8b1684", + "a235cb3d4d424efeb30901f63fc1dbe5", + "2452ab9f6f974d1d831f7c81e884d00e", + "bd7da2671a22431889fbfa2ae1e0fe2c", + "47ee1eec97cf4af58ed4f50944386a7e", + "a05426c8e5f849b7a972dc0df3cd84ef", + "1e8cdf737c93431e841e129abb541e22", + "91fc5846a32f44d5bc5fa30fdcb3c638", + "daf4005d1d334608846d0fb2fe4f837a", + "ba48d89cab9346ed92dc338779f9f828", + "6b0e0694d895445c9bf7c955a116953f", + "ecb7ffba323743c68961e294e74b337b", + "460be80f176849e0b1241e3a4fc18b74", + "74e2bcef1ce94234bbf6ba0d6488279d", + "73bde731e20d48de8ce66b9d72fb95cb", + "9afdb23ba9ee47709f194e0e92de0edf", + "621f989f46f84d73b96a37c954e92b8d", + "89bb7bbcaa194a23bf3d908a4782ccb8", + "59e79d9132964429a2a2abbdf4bdbf32", + "2e5c1a371742446e8bc416d4735c9ce3", + "e57b15cc74e7474083e87722d6acde47", + "e899060f1edc43b980fe6f3bbe13c609", + "a6963ce72cb3425791804abf1718ba90", + "41a7cc33e0bc4bc5bc04dfc72bab7a81", + "28279812acfb4272a1c28ed70aa1362d", + "451d3851e29e4efabbc2c235dea718da", + "18dfdd09a3af49f5b17cda27872d0ba3", + "aaeec2b7986d493e8d238aa14f2e5937", + "0894264041854eea960707529f3fb8c7", + "3a4965f422f14e1ca2a63e1852235507", + "e590170f306347f3a82b76a25d37b652", + "384f4d9515d1431589a3cc934bcf5ea7", + "edc24ce2510f45f8adde0a187016259f", + "c2c2608dd091493795d975f1a3cc3762", + "1dedb58d31dd43db96ddd7315bb7e2ee", + 
"308e1ee4593b454a84681bda10921207", + "8f52bab9ebd049d6a57686d621f407ac", + "2958a33cff794de1aa8128327bf405f1", + "1b2780d8137042449bd6779c70bf43ca", + "ddc2a8e8ef4d429f95081c4c5baf1fb3", + "ce06b2a0de6c4fb8bae36bc4d7f63270", + "c4cca1778f314ce582bd09b9b2494f82", + "3bd70d937e924f61b943acb0aaf15619", + "c81c5d3a4dc5409e95a6410e67fa9857", + "dd02d1b31bfd4b19ad626d6691a9b293", + "b1d780c721b840d7a06661a7a5e63236", + "d19402df47464044b36fb5ee4a0c1c4d", + "b1221f4e0a57482682ea4bd6ae245da3", + "64533507b97148008548e55623b6c2a3", + "33888a5cafe6495782309dae44531dd3", + "7d5ff2f1b8794bad8286e90307bd6a61", + "7a2d45b371ba47a994e4372437f51cff", + "dec9f287435e4c6b9fb1d1ead2ded576", + "db5e1bf1871546408f233f0cbc37b136", + "af44aa66372a43beb9812ad9895d8d1f", + "9b427631384c47ab89ee1352c0236afd", + "00071f8cf276478fb2740684552f1275", + "64378b1064dc4036a9a4c8813013e210", + "ed7daa32c94648d5951876229f9835b3", + "eb883a37a9cf4945bd864decb4fc87ea", + "7b190bb2bd234b7997ca041baddd511f", + "7549fd0b38364d8580f8eb1549558a33", + "5cba5542df344d54bb80dcf7be56b2ae", + "8daed385e6564c7897fb6553669a9065", + "25fcd4b584a143058266f7f3a650d69a", + "58a15b8df2d54f89828bcd205d6bf298", + "8b51633b2fe5479db0fc73cd3f0ebaea", + "d2c1704e34c34d12b99c31d64fce88cc", + "6bdce3b33872457ea67a3a3191f438ca", + "c8ba787279ea43aa97b134c134d4f183", + "6efc3be410ef4710a59dea8dbdabcdf3", + "6f9d8dc63c494c76b36074af773330ce", + "64c0bac85ee446e284ed85ec0eb5ad44", + "5162987085ce45d88233f34ca3a41ce0", + "00819c11ea23467689e78a00d89d1b09", + "65dbf3c3a80b468aa8717e97c14db6a2", + "8cb910ad08024c818b99c2e30e30b039", + "0235efbdc6a74cce8050ab1116466427", + "ee1ae17fdf4143ae8125ce5e2a7e9066", + "481ebdff3ccc40fcb7a8d848b01ae8db", + "6117f9e9718c4f4388bd1feec34578b4", + "623d0c6427964291b5c74fb18bfb4ce4", + "14bca5cf764d4464a186961ccc6bdb3d", + "75c793c91b9e4aa994c2d34ebb16f7d4", + "3502b5f8aeab4bfd89c83b99ab308b9e", + "c43169085f2949f1b8259cd0b767d121", + "4903414a865c4813a9371e8b301f1af7", + 
"ec7e7e3811e34b4a8c8cc31cd021cf20", + "a22db8523d44457bb96448d08129988f", + "9980bca9b1334893bf6583c325122f50", + "931fc769fbeb42bf82df6ef5f914bf48", + "e944fa694d824042845364bdba72d642", + "3785471f614f47c78e3000a303b11ca6", + "dd634585d5e64c97881b132b1d59083e", + "785f3df156b946449c492f1296656a70", + "47a25de2e10d4b55a9e6827b85e49c3a", + "898c7b43a5e94601bc093e0cde28c2ec", + "a03430c0cdcb47bfbd2ffd754074d692", + "cda6af0ca01d4053bcebe06f3c41d887", + "abe561c67f1f42b29c33d4c296221b2f", + "21ca204b93994e8790c9eb7b0722761f", + "f628cb62f5fa446eb608467c1ecea526", + "e20264d19e804f9dba3e3867ef9b31bd", + "43141dcab2324fc6a12f9b8198e10154", + "f773a61b1dba4e3eb0df36162efe9abc", + "bfc9036c9c5f4be7ab191b92d92f1352", + "0f534e081ec64883a5f88d06f93aeb00", + "3a94afa1e03544f68df7752a4f503fbc", + "32ed17ecaf7548d5a80b27eb97cd78af", + "56817cd3b11e46f187dc13e71b7ec97d", + "3d30c20af23e495999646521d39e8e66", + "bafada4a1f29442296c47018dcaedb77", + "6cd3baac550741ba866ce2cd2dcaebdf", + "bc9c3ba4bc6d4623a764d92890536935", + "719f2b2ab9eb4e348ae902d063c27e2d", + "5e09bdca5081429ab82b488a3e016fbf", + "200bc7d8c2bc494399a4560dabd64293", + "91c3b5ddf5fb4c7aa835d992b4c7b4e7", + "52ff6528abdd4a9e9b85f4b355b2391d", + "6bb6737e22bd48d786b02051d077e8cc", + "e97a0201b81446b882ba802a5a3b00e8", + "c1a89a042b044278a8666da306e6a481", + "21ce216193f346c09a04a1d64bc0cd8a", + "23ee9c5c18c64305adc55ee218afd7a9", + "1f632bdc00e84422b64c0a4b8e238f34", + "8d50d82f9d94482c9883f99ea5c7d704", + "6a2dea21e7ce4eda8953995497308327", + "2f3d1d2f1c92402cadc4cfa1b0094238", + "844caba5ebdd48189517a10705543292", + "2d667cd4bc134a44958d17ef75d86321", + "acfb7b6734884939a753fc19047bb9bc", + "73cd51c6c43b40cab25d411e7c0f6ad1", + "19c7ea1e9364437a9c0bf6b4e645f854", + "8675aad817a44ab39b097ec864669833", + "5664e1233a904f4bb4af9c5322517fab", + "0ae5110b687440e89ebebfb847985aa1", + "85509b0aafec47e5ad540abc7ae4ab7d", + "d3d7c15d53c8498e823c84fe609321dd", + "80de3739b91a45478cb6a00dcaeae756", + 
"7c99e16722cf4fab866732557769b921", + "00834c084ddb4f46ac29f14575a2383d", + "4ce1e695a6ab4906ba0817005cf58d55", + "eb03476353ef4568b94a3071918e72f2", + "1b8307d133bf4280845f7d3a302b3a2b", + "daee15869a92459aabcd9128526183d5", + "5ddd9e1a1fab4531930acaf08cc45c73", + "4081e141986f4abba15477a12a752ca0", + "892e150a80464f8198024587a47186ba", + "9f9e15ff2e394ee7a7776e3e7f4b7d30", + "7e264d54d38e4d02acfd47e4e533b49d", + "746302ee5d21495db5d9a13789c5256c", + "47f7ff7b213e405a822ad6fe2e5de431", + "e4810a798c0f47b6b54f84ff4ffec608", + "8e480fe546f24afebb1ea723bff84456", + "0cf925f582374cd197164625f0ddf27d", + "aa7557fbffc54ed3a9c58c1531fd93f6", + "061c90e4148b43b8ba65848ca4c1ba46", + "f617d181ffdb4b1e8d81fbb393923a6e", + "a26ddb684a07496da4290e3f6031b685", + "f20fc82bcbf245619ad4dec04d0f999d", + "8016eabfc7aa48c2bc0bf3a13919a675", + "83cb83e404a24fba8cf43610cb1696fc", + "7b70db203ee644fe81006f5d48aa426d", + "efd9d5724dbd435991052b4445c6970f", + "41f65340441f419d91e1d3841921f48a", + "58aba8edefa44c60818891a2651137be", + "e5e8f119a91944f296ff821dd7ecfb1b", + "568a85caf800461b8571e3d206854e6b", + "e2a8a379bd0d4cbdb22fbc3bbb4fdc7a", + "2c7d952f958247b681301d1c6bff4fa7", + "27649a0d303643d8985caf506ef66fc3", + "1855ae0388074d08a4763b7ac76c6948", + "7e5252f608d6468f80deb468cb2556fd", + "bce6b62300d942a1ab89a6f0ceb16d30", + "0239f7263d1a4e8b9475ae48379c068d", + "5b527e45d6a946b28047115fa6b5aa3a", + "3c5eb07f0bff43948ff1f283c3c56b0f", + "3545fcc1e9d7453099c4931f658e0bc8", + "2f314258091a430095794c3fe0aea7e3", + "0ebfa123462342e99ad81b7e5ee00eac", + "8af6bd6e8cae4964a2e372be30220bd9", + "4184f160ab6d4f58bc921fb7ba89cf60", + "575c4d8d503244d8a9f2a5709021b5b2", + "ffce35af1ad84a2c838cca55a26dd3c4", + "d21cd4878c6e49d38dd3abb2e3b3f566", + "969e3cf7f3634c3f90b5fea38c5797ca", + "78501c2a5ac84f9ca0d20bbef340fc9f", + "5b4fbd1102a84670a1eed6f3d25c0bcd", + "621880c98245427881dd5b004b480c6a", + "65c2716dd7f14afa93d3bfaebe85c44f", + "2ff541d18f5844408690be00c7259d59", + 
"679fa9d2a703494dbb10fbfd19879f1a", + "6717b182e5674afb90006b50dea98cae", + "0887c7aabbbe4d4e8fdc64d2e2657cc7" + ] + }, + "id": "a2787582-554f-44ce-9f38-4180a5ed6b44", + "outputId": "b1729004-591e-41c2-c206-9d0e572eb8e5" + }, + "outputs": [], + "source": [ + "from datasets import load_dataset, DatasetDict\n", + "\n", + "common_voice = DatasetDict()\n", + "\n", + "common_voice[\"train\"] = load_dataset(dataset_name, language_abbr, split=\"train+validation\")\n", + "common_voice[\"test\"] = load_dataset(dataset_name, language_abbr, split=\"test\")\n", + "\n", + "print(common_voice)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20ba635d-518c-47ac-97ee-3cad25f1e0ce", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "20ba635d-518c-47ac-97ee-3cad25f1e0ce", + "outputId": "dd81bced-f544-4d55-9669-5babb901f842" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['audio', 'sentence'],\n", + " num_rows: 3927\n", + " })\n", + " test: Dataset({\n", + " features: ['audio', 'sentence'],\n", + " num_rows: 1816\n", + " })\n", + "})\n" + ] + } + ], + "source": [ + "common_voice = common_voice.remove_columns(\n", + " [\"accent\", \"age\", \"client_id\", \"down_votes\", \"gender\", \"locale\", \"path\", \"segment\", \"up_votes\"]\n", + ")\n", + "\n", + "print(common_voice)" + ] + }, + { + "cell_type": "markdown", + "id": "2d63b2d2-f68a-4d74-b7f1-5127f6d16605", + "metadata": { + "id": "2d63b2d2-f68a-4d74-b7f1-5127f6d16605" + }, + "source": [ + "## Prepare Feature Extractor, Tokenizer and Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc77d7bb-f9e2-47f5-b663-30f7a4321ce5", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49, + "referenced_widgets": [ + "4d25d9919acf44a19f1b6f8fd625f808", + "b041efb27b6149fcaf206590f1c1b961", + "4668602d021a400a8b0ec88599abc85c", + 
"92d9732acb964601b27695041f9fbb72", + "c3ea17f7dd94462986d30b751664d77b", + "1b1dc31d9a2b4357a71119300c6d899a", + "7a24ae9d13fb4e82b1993b05f8d71d11", + "e1141861d9f44d4c95313fd432795b70", + "6ef55b5db76d4c78854fa0c27165e480", + "91b103ac79e641b190416acdeb55902c", + "e9e10f1e53b74509bfc9c0bf11502c5e" + ] + }, + "id": "bc77d7bb-f9e2-47f5-b663-30f7a4321ce5", + "outputId": "7abb2062-e755-4f1a-e88b-9b7bf2986dbf" + }, + "outputs": [], + "source": [ + "from transformers import WhisperFeatureExtractor\n", + "\n", + "feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name_or_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7b07f9b-ae0e-4f89-98f0-0c50d432eab6", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 209, + "referenced_widgets": [ + "f436e6a7d3014e2ca44c94455bfeace8", + "f9d6d41ffeba43cf94ec9d7a96af6617", + "bed65245e7234977874d35bf78694e35", + "c28c2fc81e0e467cbc0ded0205d1ee85", + "ea941a078b984f51b66b5f1e8f3d1d82", + "e3f1244dbe2c48bc8102f958c3df6467", + "786d3d34cfd24f81996797c55b10b443", + "3a4bb5b9cf864265af8f1a0b989dff2c", + "ac2e1977ddaf4b948c9cc24d84c08b83", + "dd339bf6baf6433e92665e30dc30b062", + "6a544d6dd5954beab32ce3089d8d2aac", + "f933fbdcc26d41b0bb294dabd0337834", + "950e74921e6042868ab6b7b9070d6f69", + "82dc91c5b065459d827863699a9710e1", + "5b7b6f08765c4c1989f945414f2c3cf4", + "735c3606df924b9297ddc05fae3e92d5", + "d5d632dd16f147e090c62aad38c45d3c", + "24191f38e3654b86a5da3576615e2229", + "4b1f4abe697948d5a1cca9b45b2a6f87", + "8527f474f1a549abafe9519e2b4bd338", + "1031dc4b5d2d45f3b14ab12dbd9bca56", + "ab40616ed9b74f438e74d403f848bed4", + "e6c2dc814c324a0c8cb744ca16707479", + "a615446a48624d0f9a009c1f6d8b1a54", + "383d6e891c5249b4b2fabb60d4900488", + "3c3214f235a54864848902f7e53662db", + "73f6e6860b64491284c9442f0deae8b6", + "6d9826a5adb847e8b4cc4486a31b52ce", + "e8b3093587e44164b0ac043414cea0fa", + "96266c5722f54eeeb682b2707b8025dd", + "f0be69583cd1410da6dbc18302d4439b", + 
"c24ddbaafa5f4a63b391d981a4f10354", + "629725dfcc684b1db1a57850c3bc7bc3", + "add0f175631742d49dd4695bc004e81a", + "5871a46d60f14da99e4bb8ee74405319", + "64b308019fff4095ab7f1812aab4676a", + "03f62b3f5aa64b6390b157cb3f9d2b9f", + "59657f26f4af490980b1d9bea1526e5e", + "1571fa5d8f494704949c5809f3409fa7", + "5ef218872f2646249888def0abed7149", + "03fcbc61c50c4ddc9c3684e4c085bffb", + "e4234c7c29744fc4be99b8b2ebedc9d1", + "70aa1fc14b09473e911dad3f9030b15b", + "008bead021ea416eb31dd9143d5ae5b9", + "ebd79e22ad4e4256a6d88883af2c1eef", + "c47ba0b11e074f708338863a35a78f7b", + "72b715be2c774235a21602b18d71e75a", + "ff9a0ed54bab49aca6f27bf1be66958e", + "15b5e415b62146ba96215458cf116431", + "8b0dd001d5b04647b1c480aab83d03a2", + "0af935ad4f694fb48094e6a119cbcf82", + "7cfbb542eba34459ba64b881c1040eee", + "fe48e65b2371445bbb01a8d3e9af1f67", + "b8d3539c8a454217a3b6ffab51259054", + "05e96819517e417aaf05f5f38c0c8b76", + "b600ead93bbf44a3a3fe229589c63a61", + "3d4c614fb775434fb0c5b02d46246ee0", + "344d1cc53e28411cae839a4ebba1bf58", + "a1578c0c777f4780a3fdd1635a0909d9", + "41faf55c8878475f8a986b02ad73e8ce", + "b5c8221a09df4dfdb74017d4af544b95", + "65e4b8dae1e4435cb7737a8d5dd13d91", + "6d932eadfd6a448a9a71d741bb64f428", + "dce0d285c7d947dfba9ee5bc1a6ebece", + "bde010029c374a0eb2bb942f380f0e8b", + "a91f37ce798d411ea2a33cfdb1f01251" + ] + }, + "id": "c7b07f9b-ae0e-4f89-98f0-0c50d432eab6", + "outputId": "4c094075-3f95-42db-9f4b-4db0c2fbeb91" + }, + "outputs": [], + "source": [ + "from transformers import WhisperTokenizer\n", + "\n", + "tokenizer = WhisperTokenizer.from_pretrained(model_name_or_path, language=language, task=task)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77d9f0c5-8607-4642-a8ac-c3ab2e223ea6", + "metadata": { + "id": "77d9f0c5-8607-4642-a8ac-c3ab2e223ea6" + }, + "outputs": [], + "source": [ + "from transformers import WhisperProcessor\n", + "\n", + "processor = WhisperProcessor.from_pretrained(model_name_or_path, language=language, task=task)" + ] 
+ }, + { + "cell_type": "markdown", + "id": "381acd09-0b0f-4d04-9eb3-f028ac0e5f2c", + "metadata": { + "id": "381acd09-0b0f-4d04-9eb3-f028ac0e5f2c" + }, + "source": [ + "### Prepare Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e6b0ec5-0c94-4e2c-ae24-c791be1b2255", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 72 + }, + "id": "6e6b0ec5-0c94-4e2c-ae24-c791be1b2255", + "outputId": "1f1fe2d1-3ad2-42d4-e6f0-f0929785ae8e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/f7e1ef6a2d14f20194999aad5040c5d4bb3ead1377de3e1bbc6e9dba34d18a8a/common_voice_mr_30585613.mp3', 'array': array([-1.3727526e-15, -1.2400461e-13, -1.5159097e-13, ...,\n", + " 4.7928120e-06, 3.5631349e-06, 1.6352631e-06], dtype=float32), 'sampling_rate': 48000}, 'sentence': 'आईचे आजारपण वाढत चालले, तसतशी मथीही नीट खातपीतनाशी झाली.'}\n" + ] + } + ], + "source": [ + "print(common_voice[\"train\"][0])" + ] + }, + { + "cell_type": "markdown", + "id": "5a679f05-063d-41b3-9b58-4fc9c6ccf4fd", + "metadata": { + "id": "5a679f05-063d-41b3-9b58-4fc9c6ccf4fd" + }, + "source": [ + "Since \n", + "our input audio is sampled at 48kHz, we need to _downsample_ it to \n", + "16kHz prior to passing it to the Whisper feature extractor, 16kHz being the sampling rate expected by the Whisper model. \n", + "\n", + "We'll set the audio inputs to the correct sampling rate using dataset's \n", + "[`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=cast_column#datasets.DatasetDict.cast_column)\n", + "method. 
This operation does not change the audio in-place, \n", + "but rather signals to `datasets` to resample audio samples _on the fly_ the \n", + "first time that they are loaded:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f12e2e57-156f-417b-8cfb-69221cc198e8", + "metadata": { + "id": "f12e2e57-156f-417b-8cfb-69221cc198e8" + }, + "outputs": [], + "source": [ + "from datasets import Audio\n", + "\n", + "common_voice = common_voice.cast_column(\"audio\", Audio(sampling_rate=16000))" + ] + }, + { + "cell_type": "markdown", + "id": "00382a3e-abec-4cdd-a54c-d1aaa3ea4707", + "metadata": { + "id": "00382a3e-abec-4cdd-a54c-d1aaa3ea4707" + }, + "source": [ + "Re-loading the first audio sample in the Common Voice dataset will resample \n", + "it to the desired sampling rate:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87122d71-289a-466a-afcf-fa354b18946b", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "87122d71-289a-466a-afcf-fa354b18946b", + "outputId": "727a709a-2b21-4c54-807f-efd40ea1719c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/f7e1ef6a2d14f20194999aad5040c5d4bb3ead1377de3e1bbc6e9dba34d18a8a/common_voice_mr_30585613.mp3', 'array': array([-4.4097186e-14, -9.4153831e-14, 3.4645775e-13, ...,\n", + " -7.6018655e-06, -1.8617659e-06, 4.4520480e-06], dtype=float32), 'sampling_rate': 16000}, 'sentence': 'आईचे आजारपण वाढत चालले, तसतशी मथीही नीट खातपीतनाशी झाली.'}\n" + ] + } + ], + "source": [ + "print(common_voice[\"train\"][0])" + ] + }, + { + "cell_type": "markdown", + "id": "91edc72d-08f8-4f01-899d-74e65ce441fc", + "metadata": { + "id": "91edc72d-08f8-4f01-899d-74e65ce441fc" + }, + "source": [ + "Now we can write a function to prepare our data ready for the model:\n", + "1. We load and resample the audio data by calling `batch[\"audio\"]`. 
As explained above, 🤗 Datasets performs any necessary resampling operations on the fly.\n", + "2. We use the feature extractor to compute the log-Mel spectrogram input features from our 1-dimensional audio array.\n", + "3. We encode the transcriptions to label ids through the use of the tokenizer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6525c478-8962-4394-a1c4-103c54cce170", + "metadata": { + "id": "6525c478-8962-4394-a1c4-103c54cce170" + }, + "outputs": [], + "source": [ + "def prepare_dataset(batch):\n", + " # load and resample audio data from 48 to 16kHz\n", + " audio = batch[\"audio\"]\n", + "\n", + " # compute log-Mel input features from input audio array\n", + " batch[\"input_features\"] = feature_extractor(audio[\"array\"], sampling_rate=audio[\"sampling_rate\"]).input_features[0]\n", + "\n", + " # encode target text to label ids\n", + " batch[\"labels\"] = tokenizer(batch[\"sentence\"]).input_ids\n", + " return batch" + ] + }, + { + "cell_type": "markdown", + "id": "70b319fb-2439-4ef6-a70d-a47bf41c4a13", + "metadata": { + "id": "70b319fb-2439-4ef6-a70d-a47bf41c4a13" + }, + "source": [ + "We can apply the data preparation function to all of our training examples using dataset's `.map` method. The argument `num_proc` specifies how many CPU cores to use. Setting `num_proc` > 1 will enable multiprocessing. If the `.map` method hangs with multiprocessing, set `num_proc=1` and process the dataset sequentially." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b73ab39-ffaf-4b9e-86e5-782963c6134b", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 197, + "referenced_widgets": [ + "466eeda389c442e487742faff05eeb81", + "664ff40a4e7346869652ba3be663acf2", + "3d8f10b5726b46de934dd0f7ed7244e4", + "b803893882ac4358a4676964dfb3fb31", + "4ff7e3d07f6a48f8935f0f42b86fc20c", + "3e5db1000a6c4871936614e712e769db", + "21dcf39a421847a78ea685eec0983fc6", + "c4119f7ec7464aab90f17d022e674999", + "965ac01b68064dc5a9fc3bb3f244c804", + "f5d7433d15de45e997d12568cac536fc", + "14e284f308844311a9ad40415091d93f", + "58c3766293e64116a131357f6cc66fe5", + "abc8f69eae7b46b1b430cf3b7a231b05", + "979edd227f0840a4be0233082e452b5a", + "12dedd9f089c42f3a8bba9adb01cf9d3", + "3042d874fcf544fabc4701938bca7cf3", + "baa8f739bb7e402bae7ee479e282e813", + "54509d703b7d4416bdee59157f918396", + "ec6017ce2fb3431ab823bde05f977a61", + "1a540a7cba794122abdb1900061479e2", + "d046a46c70ea46ffbb04a3c9f55637d5", + "99b628071c814d88a3cd5d72e4c95f01", + "e213d1c919314315ada180d49e27dfb6", + "0ea1f163e4174684bd6efc2e2433c1d3", + "09511a81a89d4754897a3507a84405be", + "8338339ab8a242c1a485bce8558f9c39", + "8e84abf61e3e45d58efc7ccd0bfe8d37", + "47a5e0cfb3564c16acaedb88613b9020", + "09e357a7187044b0bd2b841893a2601f", + "65e8c45d05be4d41b136439d9785a09c", + "1442a54fad7d44ddbafdacaf7f95279a", + "4bea6b0455ae4019b33d45491dbf4584", + "9c928a14371a4d9aa521953e287fff54", + "b8acdb71c3564972a6c8b27e964ec061", + "688f552e85714fe6a5d3eda82be0106a", + "a8aac44a077040bd9f4638c0c8f7a877", + "89f8ede64593475da4e056e9907f370f", + "851902cb2b0e494998684409d82dafd1", + "328f4d11886d48f49d4578e8e352097f", + "a25ef41450eb42ff9b8b618b40080ac0", + "04144ed3ff5f423f99179702cee5343f", + "8c41e1ee1a2c49b8b3d3d320fbf26262", + "4edf0948ed1346ceba1c0148e9c6fe1c", + "40f2b2ff75ca4562808fc2c7a870901e" + ] + }, + "id": "7b73ab39-ffaf-4b9e-86e5-782963c6134b", + "outputId": 
"eecac4c8-c5f8-427b-a2ba-0d51fae825ac" + }, + "outputs": [], + "source": [ + "common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names[\"train\"], num_proc=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4be572c", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "c4be572c", + "outputId": "0383124a-d4b1-4abe-a8a1-868ce6b3884e" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['input_features', 'labels'],\n", + " num_rows: 3927\n", + "})" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "common_voice[\"train\"]" + ] + }, + { + "cell_type": "markdown", + "id": "263a5a58-0239-4a25-b0df-c625fc9c5810", + "metadata": { + "id": "263a5a58-0239-4a25-b0df-c625fc9c5810" + }, + "source": [ + "## Training and Evaluation" + ] + }, + { + "cell_type": "markdown", + "id": "8d230e6d-624c-400a-bbf5-fa660881df25", + "metadata": { + "id": "8d230e6d-624c-400a-bbf5-fa660881df25" + }, + "source": [ + "### Define a Data Collator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8326221e-ec13-4731-bb4e-51e5fc1486c5", + "metadata": { + "id": "8326221e-ec13-4731-bb4e-51e5fc1486c5" + }, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "from dataclasses import dataclass\n", + "from typing import Any, Dict, List, Union\n", + "\n", + "\n", + "@dataclass\n", + "class DataCollatorSpeechSeq2SeqWithPadding:\n", + " processor: Any\n", + "\n", + " def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:\n", + " # split inputs and labels since they have to be of different lengths and need different padding methods\n", + " # first treat the audio inputs by simply returning torch tensors\n", + " input_features = [{\"input_features\": feature[\"input_features\"]} for feature in features]\n", + " batch = 
self.processor.feature_extractor.pad(input_features, return_tensors=\"pt\")\n", + "\n", + " # get the tokenized label sequences\n", + " label_features = [{\"input_ids\": feature[\"labels\"]} for feature in features]\n", + " # pad the labels to max length\n", + " labels_batch = self.processor.tokenizer.pad(label_features, return_tensors=\"pt\")\n", + "\n", + " # replace padding with -100 to ignore loss correctly\n", + " labels = labels_batch[\"input_ids\"].masked_fill(labels_batch.attention_mask.ne(1), -100)\n", + "\n", + " # if bos token is appended in previous tokenization step,\n", + " # cut bos token here as it's append later anyways\n", + " if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():\n", + " labels = labels[:, 1:]\n", + "\n", + " batch[\"labels\"] = labels\n", + "\n", + " return batch" + ] + }, + { + "cell_type": "markdown", + "id": "3cae7dbf-8a50-456e-a3a8-7fd005390f86", + "metadata": { + "id": "3cae7dbf-8a50-456e-a3a8-7fd005390f86" + }, + "source": [ + "Let's initialise the data collator we've just defined:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc834702-c0d3-4a96-b101-7b87be32bf42", + "metadata": { + "id": "fc834702-c0d3-4a96-b101-7b87be32bf42" + }, + "outputs": [], + "source": [ + "data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)" + ] + }, + { + "cell_type": "markdown", + "id": "d62bb2ab-750a-45e7-82e9-61d6f4805698", + "metadata": { + "id": "d62bb2ab-750a-45e7-82e9-61d6f4805698" + }, + "source": [ + "### Evaluation Metrics" + ] + }, + { + "cell_type": "markdown", + "id": "66fee1a7-a44c-461e-b047-c3917221572e", + "metadata": { + "id": "66fee1a7-a44c-461e-b047-c3917221572e" + }, + "source": [ + "We'll use the word error rate (WER) metric, the 'de-facto' metric for assessing \n", + "ASR systems. For more information, refer to the WER [docs](https://huggingface.co/metrics/wer). 
We'll load the WER metric from 🤗 Evaluate:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b22b4011-f31f-4b57-b684-c52332f92890", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49, + "referenced_widgets": [ + "215c3486e13343e091a26d658e1030d2", + "999382fb9e764a98893bd5269261d70b", + "4e9729771294424f959bbfb6bf8b60f3", + "3d2249516ecd43a08b5fba53fddb32e8", + "681711ebd0c64a63bee6b5337ef401db", + "d37791ea2b6c4152991295ca0edb0fb7", + "730f4d93bbd94452a59de43bf3d0a266", + "048e5c87cea34014a8f8a4538f4126bd", + "fbd27061ff114846aedc99bc2d17f7a7", + "5a306409e9e045b2b936267c520f935c", + "9f0638198f544a3bbb31a3a78b7bd2c2" + ] + }, + "id": "b22b4011-f31f-4b57-b684-c52332f92890", + "outputId": "b0a08086-69b9-4ab4-97ac-dbed295f2e15" + }, + "outputs": [], + "source": [ + "import evaluate\n", + "\n", + "metric = evaluate.load(\"wer\")" + ] + }, + { + "cell_type": "markdown", + "id": "4f32cab6-31f0-4cb9-af4c-40ba0f5fc508", + "metadata": { + "id": "4f32cab6-31f0-4cb9-af4c-40ba0f5fc508" + }, + "source": [ + "We then simply have to define a function that takes our model \n", + "predictions and returns the WER metric. This function, called\n", + "`compute_metrics`, first replaces `-100` with the `pad_token_id`\n", + "in the `label_ids` (undoing the step we applied in the \n", + "data collator to ignore padded tokens correctly in the loss).\n", + "It then decodes the predicted and label ids to strings. 
Finally,\n", + "it computes the WER between the predictions and reference labels:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23959a70-22d0-4ffe-9fa1-72b61e75bb52", + "metadata": { + "id": "23959a70-22d0-4ffe-9fa1-72b61e75bb52" + }, + "outputs": [], + "source": [ + "def compute_metrics(pred):\n", + " pred_ids = pred.predictions\n", + " label_ids = pred.label_ids\n", + "\n", + " # replace -100 with the pad_token_id\n", + " label_ids[label_ids == -100] = tokenizer.pad_token_id\n", + "\n", + " # we do not want to group tokens when computing the metrics\n", + " pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)\n", + " label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)\n", + "\n", + " wer = 100 * metric.compute(predictions=pred_str, references=label_str)\n", + "\n", + " return {\"wer\": wer}" + ] + }, + { + "cell_type": "markdown", + "id": "daf2a825-6d9f-4a23-b145-c37c0039075b", + "metadata": { + "id": "daf2a825-6d9f-4a23-b145-c37c0039075b" + }, + "source": [ + "### Load a Pre-Trained Checkpoint" + ] + }, + { + "cell_type": "markdown", + "id": "437a97fa-4864-476b-8abc-f28b8166cfa5", + "metadata": { + "id": "437a97fa-4864-476b-8abc-f28b8166cfa5" + }, + "source": [ + "Now let's load the pre-trained Whisper `small` checkpoint. Again, this \n", + "is trivial through use of 🤗 Transformers!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a10cc4b-07ec-4ebd-ac1d-7c601023594f", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 220, + "referenced_widgets": [ + "d8c1a66480204f1095ff5f6a7dd2e477", + "9dc736113ef6477d91aaf71c9969ca74", + "79521628c64b4d6f9f22e73749298693", + "c7268972f75e4824893ebe7d893a18e1", + "a87598d464174703b5f5a5eca23543f3", + "e0529b81739144db8912c2d6789e729a", + "13d0a97497274652b081cbcefb3fd17d", + "08b26adf061b48f59078bf0c0b59e643", + "ef056ad59e314089a012acaa73a54e4f", + "7935e298049f4deeaeb278ba3de92291", + "3afca90970fc4925a05a9a1aa5c8d2f2", + "40aba44ef0a74c0d9a385c803c1365a6", + "ae54388d78dc4b7ebd1b21860421ffe4", + "bb4a47c63d254d4aa3220e85f86d37c4", + "cd585c98560b42c8b4a08df5b853b23c", + "a140cd385e5a4c0a88c5562370982d2e", + "fce64f5690024c698701330f0e5d039a", + "abd0e7c414974e51b280a29bf978f776", + "bf7961a79c2f403a89a4fb6d4b1a02e5", + "1d8636d1d1c3442fbbc2fc83cd03fa44", + "313090dc1f034ab19d5ffc573ea1aa5c", + "966c0400dafe4e3ea2c9baebc2e104fa", + "9087be5d992c4198b3ec2c61f4021164", + "6bc07db3471342a6bbc1b86ca734b3e6", + "4f167e9657274d56b8568d48262d1ee6", + "b7fbaa9d4bcd40b5bcf8d1475658c5b1", + "ba21f5ddf2434cc792b70a70c8c1079e", + "21bc84c704874455a57c47239984d28d", + "5b226e580df94448bed54b400c7a9a25", + "b0237154051343adaa076a9cc6dd711d", + "64756452791144859b9c803886c6dc77", + "f027b4358c2c41a8a93c617641135bcd", + "42cb6114956c4a86980c8dafd5a734ae" + ] + }, + "id": "5a10cc4b-07ec-4ebd-ac1d-7c601023594f", + "outputId": "163d4b39-7e5d-4126-8d78-846c5d94dca6" + }, + "outputs": [], + "source": [ + "from transformers import WhisperForConditionalGeneration, BitsAndBytesConfig\n", + "\n", + "model = WhisperForConditionalGeneration.from_pretrained(model_name_or_path, quantization_config=BitsAndBytesConfig(load_in_8bit=True))\n", + "\n", + "# model.hf_device_map - this should be {\" \": 0}" + ] + }, + { + "cell_type": "markdown", + "id": 
"a15ead5f-2277-4a39-937b-585c2497b2df", + "metadata": { + "id": "a15ead5f-2277-4a39-937b-585c2497b2df" + }, + "source": [ + "Override generation arguments - no tokens are forced as decoder outputs (see [`forced_decoder_ids`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.forced_decoder_ids)), no tokens are suppressed during generation (see [`suppress_tokens`](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate.suppress_tokens)):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62038ba3-88ed-4fce-84db-338f50dcd04f", + "metadata": { + "id": "62038ba3-88ed-4fce-84db-338f50dcd04f" + }, + "outputs": [], + "source": [ + "model.config.forced_decoder_ids = None\n", + "model.config.suppress_tokens = []" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "bR-_yaEOPsfQ", + "metadata": { + "id": "bR-_yaEOPsfQ" + }, + "source": [ + "### Post-processing on the model\n", + "\n", + "Finally, we need to apply some post-processing on the 8-bit model to enable training: we freeze all the layers and cast all non-`int8` layers to `float32` for stability." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "Cl_ZQualPt9R", + "metadata": { + "id": "Cl_ZQualPt9R" + }, + "outputs": [], + "source": [ + "from peft import prepare_model_for_kbit_training\n", + "\n", + "model = prepare_model_for_kbit_training(model)" + ] + }, + { + "cell_type": "markdown", + "id": "Vjl4j4RJPmPR", + "metadata": { + "id": "Vjl4j4RJPmPR" + }, + "source": [ + "### Apply LoRA\n", + "\n", + "Here comes the magic with `peft`! Let's load a `PeftModel` and specify that we are going to use low-rank adapters (LoRA) using the `get_peft_model` utility function from `peft`."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "DQtpDPRHPyOL", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DQtpDPRHPyOL", + "outputId": "1effcbde-7acc-4f62-f24b-e6236a43f833" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 15728640 || all params: 1559033600 || trainable%: 1.0088711365810203\n" + ] + } + ], + "source": [ + "from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model\n", + "\n", + "config = LoraConfig(r=32, lora_alpha=64, target_modules=[\"q_proj\", \"v_proj\"], lora_dropout=0.05, bias=\"none\")\n", + "\n", + "model = get_peft_model(model, config)\n", + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "markdown", + "id": "3906d436", + "metadata": {}, + "source": [ + "We are training ONLY about **1%** of the model's total parameters, thereby performing **Parameter-Efficient Fine-Tuning**." + ] + }, + { + "cell_type": "markdown", + "id": "2178dea4-80ca-47b6-b6ea-ba1915c90c06", + "metadata": { + "id": "2178dea4-80ca-47b6-b6ea-ba1915c90c06" + }, + "source": [ + "### Define the Training Configuration" + ] + }, + { + "cell_type": "markdown", + "id": "c21af1e9-0188-4134-ac82-defc7bdcc436", + "metadata": { + "id": "c21af1e9-0188-4134-ac82-defc7bdcc436" + }, + "source": [ + "In the final step, we define all the parameters related to training. For more detail on the training arguments, refer to the Seq2SeqTrainingArguments [docs](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ae3e9af-97b7-4aa0-ae85-20b23b5bcb3a", + "metadata": { + "id": "0ae3e9af-97b7-4aa0-ae85-20b23b5bcb3a" + }, + "outputs": [], + "source": [ + "from transformers import Seq2SeqTrainingArguments\n", + "\n", + "training_args = Seq2SeqTrainingArguments(\n", + " output_dir=\"temp\", # change to a repo name of your choice\n", + " per_device_train_batch_size=8,\n", + " gradient_accumulation_steps=1, # increase by 2x for every 2x decrease in batch size\n", + " learning_rate=1e-3,\n", + " warmup_steps=50,\n", + " num_train_epochs=3,\n", + " eval_strategy=\"epoch\",\n", + " fp16=True,\n", + " per_device_eval_batch_size=8,\n", + " generation_max_length=128,\n", + " logging_steps=25,\n", + " remove_unused_columns=False, # required as the PeftModel forward doesn't have the signature of the wrapped model's forward\n", + " label_names=[\"labels\"], # same reason as above\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b3a944d8-3112-4552-82a0-be25988b3857", + "metadata": { + "id": "b3a944d8-3112-4552-82a0-be25988b3857" + }, + "source": [ + "**A Few Important Notes:**\n", + "1. `remove_unused_columns=False` and `label_names=[\"labels\"]` are required as the PeftModel's forward doesn't have the signature of the base model's forward.\n", + "\n", + "2. INT8 training requires autocasting. `predict_with_generate` can't be passed to Trainer because it internally calls transformers' `generate` without autocasting, leading to errors. \n", + "\n", + "3. Because of point 2, `compute_metrics` shouldn't be passed to `Seq2SeqTrainer` as seen below.
(commented out)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d546d7fe-0543-479a-b708-2ebabec19493", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "d546d7fe-0543-479a-b708-2ebabec19493", + "outputId": "e2fabe64-2c50-42ff-a7ca-7773813e9408" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The model is loaded in 8-bit precision. To train this model you need to add additional modules inside the model such as adapters using `peft` library and freeze the model weights. Please check the examples in https://github.com/huggingface/peft for more details.\n", + "Using cuda_amp half precision backend\n" + ] + } + ], + "source": [ + "from transformers import Seq2SeqTrainer, TrainerCallback, TrainingArguments, TrainerState, TrainerControl\n", + "from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR\n", + "\n", + "\n", + "class SavePeftModelCallback(TrainerCallback):\n", + " def on_save(\n", + " self,\n", + " args: TrainingArguments,\n", + " state: TrainerState,\n", + " control: TrainerControl,\n", + " **kwargs,\n", + " ):\n", + " checkpoint_folder = os.path.join(args.output_dir, f\"{PREFIX_CHECKPOINT_DIR}-{state.global_step}\")\n", + "\n", + " peft_model_path = os.path.join(checkpoint_folder, \"adapter_model\")\n", + " kwargs[\"model\"].save_pretrained(peft_model_path)\n", + "\n", + " pytorch_model_path = os.path.join(checkpoint_folder, \"pytorch_model.bin\")\n", + " if os.path.exists(pytorch_model_path):\n", + " os.remove(pytorch_model_path)\n", + " return control\n", + "\n", + "\n", + "trainer = Seq2SeqTrainer(\n", + " args=training_args,\n", + " model=model,\n", + " train_dataset=common_voice[\"train\"],\n", + " eval_dataset=common_voice[\"test\"],\n", + " data_collator=data_collator,\n", + " # compute_metrics=compute_metrics,\n", + " processing_class=processor.feature_extractor,\n", + " callbacks=[SavePeftModelCallback],\n", + ")\n", + "model.config.use_cache = False # 
silence the warnings. Please re-enable for inference!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "ee8b7b8e-1c9a-4d77-9137-1778a629e6de", + "outputId": "cdea5268-f33a-4d48-ea4a-a9c71576f81d" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/transformers/optimization.py:346: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n", + "***** Running training *****\n", + " Num examples = 3927\n", + " Num Epochs = 3\n", + " Instantaneous batch size per device = 8\n", + " Total train batch size (w. parallel, distributed & accumulation) = 8\n", + " Gradient Accumulation steps = 1\n", + " Total optimization steps = 1473\n", + " Number of trainable parameters = 15728640\n", + "/usr/local/lib/python3.8/dist-packages/torch/utils/checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n", + " warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n", + "/usr/local/lib/python3.8/dist-packages/bitsandbytes/autograd/_functions.py:298: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [1473/1473 3:20:30, Epoch 3/3]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EpochTraining LossValidation Loss
10.2558000.262023
20.1665000.221193

\n", + "

\n", + " \n", + " \n", + " [ 62/227 04:52 < 13:11, 0.21 it/s]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "***** Running Evaluation *****\n", + " Num examples = 1816\n", + " Batch size = 8\n", + "Saving model checkpoint to ./whisper-small-hi/checkpoint-500\n", + "Trainer.model is not a `PreTrainedModel`, only saving its state dict.\n", + "Feature extractor saved in ./whisper-small-hi/checkpoint-500/preprocessor_config.json\n", + "/usr/local/lib/python3.8/dist-packages/torch/utils/checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n", + " warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n", + "/usr/local/lib/python3.8/dist-packages/bitsandbytes/autograd/_functions.py:298: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "***** Running Evaluation *****\n", + " Num examples = 1816\n", + " Batch size = 8\n", + "Saving model checkpoint to ./whisper-small-hi/checkpoint-1000\n", + "Trainer.model is not a `PreTrainedModel`, only saving its state dict.\n", + "Feature extractor saved in ./whisper-small-hi/checkpoint-1000/preprocessor_config.json\n", + "/usr/local/lib/python3.8/dist-packages/torch/utils/checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n", + " warnings.warn(\"None of the inputs have requires_grad=True. 
Gradients will be None\")\n", + "/usr/local/lib/python3.8/dist-packages/bitsandbytes/autograd/_functions.py:298: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n", + " warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n", + "***** Running Evaluation *****\n", + " Num examples = 1816\n", + " Batch size = 8\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [1473/1473 3:38:43, Epoch 3/3]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EpochTraining LossValidation Loss
10.2558000.262023
20.1665000.221193
30.0839000.215908

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "\n", + "Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=1473, training_loss=0.20080567288382556, metrics={'train_runtime': 13136.6638, 'train_samples_per_second': 0.897, 'train_steps_per_second': 0.112, 'total_flos': 2.52799085113344e+19, 'train_loss': 0.20080567288382556, 'epoch': 3.0})" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0576aa2a", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 116, + "referenced_widgets": [ + "f309d7a096df4f119e6e6871b56913f1", + "5f283548f34848af90affe55a169b5a9", + "51a4d85c08d745bda70ba0db731dca68", + "eb46a602b3ef484daddbcb847957c3e5", + "8e2e49c6046e4dc0a0a810b4e58f80cc", + "f3f191968f724e9bbb710b86d3657a66", + "d97f77f45e1e400494c2fbf2cd9d69a3", + "10e52425e59d4c2c8d2a1239b24e5a95", + "9ca0667998cb443d9df29fdd09cf90ff", + "f22aa0a93d8b44d0a4c412343ec1f48b", + "6e6a59f8e7454c2d886634eade47a21f", + "92ae14b97e814d51b016e2a14f227a15", + "fb8aca596a1c4ac4a7428248b6c6f8b1", + "31de6bfcde21400ab02af4fb31d409da", + "3c45a19fdf664b92a27be16abf537a03", + "9827e382bf3b49b092969d5656dcad7a", + "168e19229aa5404bb151e4547bb31283", + "d140f5373bf144d1ae5d282e1a65647e", + "fa137c931d2c4e579c27893ca8ee1848", + "93647f2d98a74109bfc5ca7a9cc23eed", + "a478b23c34654615aad4202b8c7089f6", + "4553ba4c0ed645dbb9ddfcc88975ab5f" + ] + }, + "id": "0576aa2a", + "outputId": "6d3fe4aa-f42a-4428-e070-2272f3b87f5c" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Uploading the following files to smangrul/openai-whisper-large-v2-LORA-colab: 
adapter_model.bin,adapter_config.json\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f309d7a096df4f119e6e6871b56913f1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Upload 1 LFS files: 0%| | 0/1 [00:00
Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file. " + } + }, + "7d5ff2f1b8794bad8286e90307bd6a61": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7e2534cfd8564dbd9bddaa2217f2dda4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7e264d54d38e4d02acfd47e4e533b49d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + 
"state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7e5252f608d6468f80deb468cb2556fd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_bce6b62300d942a1ab89a6f0ceb16d30", + "IPY_MODEL_0239f7263d1a4e8b9475ae48379c068d", + "IPY_MODEL_5b527e45d6a946b28047115fa6b5aa3a" + ], + "layout": "IPY_MODEL_3c5eb07f0bff43948ff1f283c3c56b0f" + } + }, + "7f6a466819bb45e880eb989f388a7523": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, 
+ "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7fbb2599863f45f49ad4925d75ef5f77": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8016eabfc7aa48c2bc0bf3a13919a675": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "80de3739b91a45478cb6a00dcaeae756": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "812f78ca88234efa9b2ccc001245290c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "82dc91c5b065459d827863699a9710e1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4b1f4abe697948d5a1cca9b45b2a6f87", + "max": 1036558, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8527f474f1a549abafe9519e2b4bd338", + "value": 1036558 + } + }, + "830245f63f1947e18b29986d9a091c0a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b0efdb97594b4997a8cb148aa03f9a6b", + "placeholder": "​", + "style": "IPY_MODEL_66bdd68e31204abc92fafc0e29860d1c", + "value": " 829/829 [00:00<00:00, 42.4kB/s]" + } + }, + "8338339ab8a242c1a485bce8558f9c39": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + 
"layout": "IPY_MODEL_4bea6b0455ae4019b33d45491dbf4584", + "placeholder": "​", + "style": "IPY_MODEL_9c928a14371a4d9aa521953e287fff54", + "value": " 908/908 [03:07<00:00, 3.87ex/s]" + } + }, + "83a7cd2ce4074720acc6236aaf24014d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "83cb83e404a24fba8cf43610cb1696fc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7b70db203ee644fe81006f5d48aa426d", + "IPY_MODEL_efd9d5724dbd435991052b4445c6970f", + 
"IPY_MODEL_41f65340441f419d91e1d3841921f48a" + ], + "layout": "IPY_MODEL_58aba8edefa44c60818891a2651137be" + } + }, + "844caba5ebdd48189517a10705543292": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2d667cd4bc134a44958d17ef75d86321", + "IPY_MODEL_acfb7b6734884939a753fc19047bb9bc", + "IPY_MODEL_73cd51c6c43b40cab25d411e7c0f6ad1" + ], + "layout": "IPY_MODEL_19c7ea1e9364437a9c0bf6b4e645f854" + } + }, + "845211300fbd49f480d056bba9de83e1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "851902cb2b0e494998684409d82dafd1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8527f474f1a549abafe9519e2b4bd338": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "85509b0aafec47e5ad540abc7ae4ab7d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "85cdb3ecd88d426a8e517b3c8c8a8307": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "86692318e7584e62b4143c1b2eca5935": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8675aad817a44ab39b097ec864669833": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "87d251db4a6e495cb1d59d276e823bf2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8843f70cfa1f45299a588e96e9c1159a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, 
+ "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "892e150a80464f8198024587a47186ba": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "898c7b43a5e94601bc093e0cde28c2ec": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_21ca204b93994e8790c9eb7b0722761f", + "placeholder": "​", + "style": "IPY_MODEL_f628cb62f5fa446eb608467c1ecea526", + "value": "Downloading data: 100%" + } + }, + "89bb7bbcaa194a23bf3d908a4782ccb8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "89f8ede64593475da4e056e9907f370f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + 
"_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4edf0948ed1346ceba1c0148e9c6fe1c", + "placeholder": "​", + "style": "IPY_MODEL_40f2b2ff75ca4562808fc2c7a870901e", + "value": " 908/908 [03:10<00:00, 4.49ex/s]" + } + }, + "8a91574c4b6e4745b2b65885323b4d25": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8af6bd6e8cae4964a2e372be30220bd9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8b0dd001d5b04647b1c480aab83d03a2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": 
null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8b2e2c650f4b4bee9e763b7c59525ec8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8b51633b2fe5479db0fc73cd3f0ebaea": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8b9324f5c0e24576a9733f30a8945edd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ad8b6c32caa4493ebadf7691ed1546d4", + "max": 2064, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_85cdb3ecd88d426a8e517b3c8c8a8307", + "value": 2064 + } + }, + "8ba244a082284f6dbac1c8f689c527a8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8c010800d6d945fe965fd5b9c6ab2ec4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4568923487c44041ab985ee326842d15", + "max": 358, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_352a2237356846af836779413e584fee", + "value": 358 + } + }, + "8c41e1ee1a2c49b8b3d3d320fbf26262": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8cb910ad08024c818b99c2e30e30b039": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_14bca5cf764d4464a186961ccc6bdb3d", + "placeholder": "​", + "style": "IPY_MODEL_75c793c91b9e4aa994c2d34ebb16f7d4", + "value": " 782k/782k [00:00<00:00, 972kB/s]" + } + }, + "8d50d82f9d94482c9883f99ea5c7d704": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + 
"description_width": "" + } + }, + "8daed385e6564c7897fb6553669a9065": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d2c1704e34c34d12b99c31d64fce88cc", + "placeholder": "​", + "style": "IPY_MODEL_6bdce3b33872457ea67a3a3191f438ca", + "value": "Downloading data files: 100%" + } + }, + "8e2e49c6046e4dc0a0a810b4e58f80cc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8e480fe546f24afebb1ea723bff84456": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f20fc82bcbf245619ad4dec04d0f999d", + "placeholder": "​", + "style": "IPY_MODEL_8016eabfc7aa48c2bc0bf3a13919a675", + "value": " 1682/0 [00:14<00:00, 2245.17 examples/s]" + } + }, + "8e84abf61e3e45d58efc7ccd0bfe8d37": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8f52bab9ebd049d6a57686d621f407ac": { + "model_module": "@jupyter-widgets/base", + "model_module_version": 
"1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8f856d6bde4041149324e1f53d19c1cb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "90744e5529c04f18b11e00326649abe5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", 
+ "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9087be5d992c4198b3ec2c61f4021164": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6bc07db3471342a6bbc1b86ca734b3e6", + "IPY_MODEL_4f167e9657274d56b8568d48262d1ee6", + "IPY_MODEL_b7fbaa9d4bcd40b5bcf8d1475658c5b1" + ], + "layout": "IPY_MODEL_ba21f5ddf2434cc792b70a70c8c1079e" + } + }, + "90b53e96b3f04c2993969b17547ae0d5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "91b103ac79e641b190416acdeb55902c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "91c3b5ddf5fb4c7aa835d992b4c7b4e7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_52ff6528abdd4a9e9b85f4b355b2391d", + "IPY_MODEL_6bb6737e22bd48d786b02051d077e8cc", + "IPY_MODEL_e97a0201b81446b882ba802a5a3b00e8" + ], + "layout": "IPY_MODEL_c1a89a042b044278a8666da306e6a481" + } + }, + "91fc5846a32f44d5bc5fa30fdcb3c638": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_daf4005d1d334608846d0fb2fe4f837a", + "IPY_MODEL_ba48d89cab9346ed92dc338779f9f828", + "IPY_MODEL_6b0e0694d895445c9bf7c955a116953f" + ], + "layout": "IPY_MODEL_ecb7ffba323743c68961e294e74b337b" + } + }, + "9202065d8e6f425d88e4514dde70992d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + 
"state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f7b4ec74e2ac45bbbf0265d0363f4d9f", + "IPY_MODEL_3a083523ae604362901e4e31c39fe949", + "IPY_MODEL_d1bc2ce48c3e481b9059e882b5102946" + ], + "layout": "IPY_MODEL_30c688df949042ce89337643f6178230" + } + }, + "921fd273a7254447a93fb997773aedab": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "925f7aad24b547cc8071ee9bda713d7c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7e2534cfd8564dbd9bddaa2217f2dda4", + "placeholder": "​", + "style": "IPY_MODEL_a5bd56eb69524729afdfdd14b55b6130", + "value": "Downloading (…)lve/main/config.json: 100%" + } + }, + "92ae14b97e814d51b016e2a14f227a15": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_fb8aca596a1c4ac4a7428248b6c6f8b1", + "IPY_MODEL_31de6bfcde21400ab02af4fb31d409da", + "IPY_MODEL_3c45a19fdf664b92a27be16abf537a03" + ], + "layout": "IPY_MODEL_9827e382bf3b49b092969d5656dcad7a" + } + }, + "92d9732acb964601b27695041f9fbb72": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_91b103ac79e641b190416acdeb55902c", + "placeholder": "​", + "style": "IPY_MODEL_e9e10f1e53b74509bfc9c0bf11502c5e", + "value": " 185k/185k [00:00<00:00, 656kB/s]" + } + }, + "92debeaea5cc4136bb37703947e825e0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7aca78c9768a48a7ab0a8220a3b364b9", + "placeholder": "​", + "style": "IPY_MODEL_7213f79093f74df996f8dbe7fe816ea0", + "value": " 63.1M/63.1M [00:00<00:00, 108MB/s]" + } + }, + "931b96b21b39482298f40449424fdd34": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "931fc769fbeb42bf82df6ef5f914bf48": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "93647f2d98a74109bfc5ca7a9cc23eed": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "93e2efbb5da747d4b94c916153ee9706": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "950e74921e6042868ab6b7b9070d6f69": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d5d632dd16f147e090c62aad38c45d3c", + "placeholder": "​", + "style": "IPY_MODEL_24191f38e3654b86a5da3576615e2229", + "value": "Downloading (…)olve/main/vocab.json: 100%" + } + }, + "960553d142c446cd8852523887a5cc04": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + 
"left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "96266c5722f54eeeb682b2707b8025dd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "965ac01b68064dc5a9fc3bb3f244c804": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "966c0400dafe4e3ea2c9baebc2e104fa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "969e3cf7f3634c3f90b5fea38c5797ca": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "info", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2ff541d18f5844408690be00c7259d59", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_679fa9d2a703494dbb10fbfd19879f1a", + "value": 1 + } + }, + "96c0b77f9a5e473b8b5dc92bde0f4a9c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": 
null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "979edd227f0840a4be0233082e452b5a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ec6017ce2fb3431ab823bde05f977a61", + "max": 1963, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1a540a7cba794122abdb1900061479e2", + "value": 1963 + } + }, + "980e87bb43d54306aba5019964857b89": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9827e382bf3b49b092969d5656dcad7a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "989b3df296504f34a62d31ca0d6d88bb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9980bca9b1334893bf6583c325122f50": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "999382fb9e764a98893bd5269261d70b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d37791ea2b6c4152991295ca0edb0fb7", + "placeholder": "​", + "style": "IPY_MODEL_730f4d93bbd94452a59de43bf3d0a266", + "value": "Downloading builder script: 100%" + } + }, + "99b628071c814d88a3cd5d72e4c95f01": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9abe7ed3e3d347b6bad7d1252bde226f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9afdb23ba9ee47709f194e0e92de0edf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9b427631384c47ab89ee1352c0236afd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + 
"justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9c928a14371a4d9aa521953e287fff54": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9ca0667998cb443d9df29fdd09cf90ff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9dc736113ef6477d91aaf71c9969ca74": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e0529b81739144db8912c2d6789e729a", + "placeholder": "​", + "style": "IPY_MODEL_13d0a97497274652b081cbcefb3fd17d", + "value": "Downloading 
(…)lve/main/config.json: 100%" + } + }, + "9f0638198f544a3bbb31a3a78b7bd2c2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9f9e15ff2e394ee7a7776e3e7f4b7d30": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a03430c0cdcb47bfbd2ffd754074d692": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e20264d19e804f9dba3e3867ef9b31bd", + "max": 615674, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_43141dcab2324fc6a12f9b8198e10154", + "value": 615674 + } + }, + "a05426c8e5f849b7a972dc0df3cd84ef": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a140cd385e5a4c0a88c5562370982d2e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a1578c0c777f4780a3fdd1635a0909d9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bde010029c374a0eb2bb942f380f0e8b", + "placeholder": "​", + "style": "IPY_MODEL_a91f37ce798d411ea2a33cfdb1f01251", + "value": " 2.06k/2.06k [00:00<00:00, 75.3kB/s]" + } + }, + "a22db8523d44457bb96448d08129988f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a235cb3d4d424efeb30901f63fc1dbe5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a25ef41450eb42ff9b8b618b40080ac0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a26ddb684a07496da4290e3f6031b685": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "a478b23c34654615aad4202b8c7089f6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + 
"justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a4b16b5279504dd694090798f5925d65": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "a5bd56eb69524729afdfdd14b55b6130": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a615446a48624d0f9a009c1f6d8b1a54": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6d9826a5adb847e8b4cc4486a31b52ce", + "placeholder": "​", + "style": "IPY_MODEL_e8b3093587e44164b0ac043414cea0fa", + "value": "Downloading (…)olve/main/merges.txt: 100%" + 
} + }, + "a6963ce72cb3425791804abf1718ba90": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a87598d464174703b5f5a5eca23543f3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a8aac44a077040bd9f4638c0c8f7a877": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_04144ed3ff5f423f99179702cee5343f", + "max": 908, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8c41e1ee1a2c49b8b3d3d320fbf26262", + "value": 908 + } + }, + "a91f37ce798d411ea2a33cfdb1f01251": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a9c132959dae4303bcf6015106ce3453": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "aa7557fbffc54ed3a9c58c1531fd93f6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "aaeec2b7986d493e8d238aa14f2e5937": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + 
"flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ab40616ed9b74f438e74d403f848bed4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ab4ae16fa4f448838f677ea60523c905": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "abc8f69eae7b46b1b430cf3b7a231b05": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_baa8f739bb7e402bae7ee479e282e813", + "placeholder": "​", + "style": "IPY_MODEL_54509d703b7d4416bdee59157f918396", + "value": "#1: 100%" + } + }, + "abd0e7c414974e51b280a29bf978f776": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "abe561c67f1f42b29c33d4c296221b2f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + 
"width": null + } + }, + "ac2e1977ddaf4b948c9cc24d84c08b83": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ac9c1141ca7c453f84a8ab62b2de9158": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3fd0fd7cbbef4785b360891c12017f48", + "max": 12179, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a9c132959dae4303bcf6015106ce3453", + "value": 12179 + } + }, + "acfb7b6734884939a753fc19047bb9bc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0ae5110b687440e89ebebfb847985aa1", + "max": 5, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_85509b0aafec47e5ad540abc7ae4ab7d", + "value": 5 + } + }, + 
"ad8b6c32caa4493ebadf7691ed1546d4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "add0f175631742d49dd4695bc004e81a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5871a46d60f14da99e4bb8ee74405319", + "IPY_MODEL_64b308019fff4095ab7f1812aab4676a", + "IPY_MODEL_03f62b3f5aa64b6390b157cb3f9d2b9f" + ], + "layout": "IPY_MODEL_59657f26f4af490980b1d9bea1526e5e" + } + }, + "ae54388d78dc4b7ebd1b21860421ffe4": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fce64f5690024c698701330f0e5d039a", + "placeholder": "​", + "style": "IPY_MODEL_abd0e7c414974e51b280a29bf978f776", + "value": "Downloading (…)"pytorch_model.bin";: 100%" + } + }, + "af44aa66372a43beb9812ad9895d8d1f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7b190bb2bd234b7997ca041baddd511f", + "placeholder": "​", + "style": "IPY_MODEL_7549fd0b38364d8580f8eb1549558a33", + "value": " 5/5 [00:03<00:00, 1.15it/s]" + } + }, + "af56501692b84e718fe3e8b1e452b7c2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e53d35abac8241bcaf6d3b0f732d1bb9", + "max": 184990, + "min": 0, + "orientation": "horizontal", + "style": 
"IPY_MODEL_87d251db4a6e495cb1d59d276e823bf2", + "value": 184990 + } + }, + "b0237154051343adaa076a9cc6dd711d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b041efb27b6149fcaf206590f1c1b961": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1b1dc31d9a2b4357a71119300c6d899a", + "placeholder": "​", + "style": "IPY_MODEL_7a24ae9d13fb4e82b1993b05f8d71d11", + "value": "Downloading (…)rocessor_config.json: 100%" + } + 
}, + "b09d153958cf4a28baad268bbda78236": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b0e084a4c551427bb176061df894fdf6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b0efdb97594b4997a8cb148aa03f9a6b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b1183b9042744d3fa7437e2ec55b6cdd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c643b8650c31466a8614510928ab5d2e", + "placeholder": "​", + "style": "IPY_MODEL_58e46e1d355244bcab90b32c163e2877", + "value": " 1.97k/1.97k [00:00<00:00, 31.4kB/s]" + } + }, + "b1221f4e0a57482682ea4bd6ae245da3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b16e3a75acd0403f865849f1de6ca654": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b19431ca425341568c2b3a8556431a8f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6072c153de2540bf9e1227db5057fffc", + "placeholder": "​", + "style": "IPY_MODEL_b7d9e1bf1d2e48e59c4dca38a638d4b4", + "value": " 494k/494k [00:00<00:00, 3.65MB/s]" + } + }, + "b1d780c721b840d7a06661a7a5e63236": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", 
+ "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b3a59b83acaa4864be23e10d90560139": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7b8266519f0a41cd9a065b9e795d7e84", + "max": 2108, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_658361b3d5054f44a5c21df747537bdd", + "value": 2108 + } + }, + "b5c8221a09df4dfdb74017d4af544b95": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b600ead93bbf44a3a3fe229589c63a61": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3d4c614fb775434fb0c5b02d46246ee0", + "IPY_MODEL_344d1cc53e28411cae839a4ebba1bf58", + "IPY_MODEL_a1578c0c777f4780a3fdd1635a0909d9" + ], + "layout": "IPY_MODEL_41faf55c8878475f8a986b02ad73e8ce" + } + }, + "b7d9e1bf1d2e48e59c4dca38a638d4b4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + 
}, + "b7fbaa9d4bcd40b5bcf8d1475658c5b1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f027b4358c2c41a8a93c617641135bcd", + "placeholder": "​", + "style": "IPY_MODEL_42cb6114956c4a86980c8dafd5a734ae", + "value": " 3.49k/3.49k [00:00<00:00, 142kB/s]" + } + }, + "b803893882ac4358a4676964dfb3fb31": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f5d7433d15de45e997d12568cac536fc", + "placeholder": "​", + "style": "IPY_MODEL_14e284f308844311a9ad40415091d93f", + "value": " 1964/1964 [06:56<00:00, 4.82ex/s]" + } + }, + "b8acdb71c3564972a6c8b27e964ec061": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_688f552e85714fe6a5d3eda82be0106a", + "IPY_MODEL_a8aac44a077040bd9f4638c0c8f7a877", + "IPY_MODEL_89f8ede64593475da4e056e9907f370f" + ], + "layout": 
"IPY_MODEL_851902cb2b0e494998684409d82dafd1" + } + }, + "b8d3539c8a454217a3b6ffab51259054": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ba21f5ddf2434cc792b70a70c8c1079e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + 
"grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ba48d89cab9346ed92dc338779f9f828": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_73bde731e20d48de8ce66b9d72fb95cb", + "max": 65484800, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9afdb23ba9ee47709f194e0e92de0edf", + "value": 65484800 + } + }, + "baa8f739bb7e402bae7ee479e282e813": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": 
null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bafada4a1f29442296c47018dcaedb77": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bb1bcdfd39bb430c8a453d7a4c3a5e3d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bb4a47c63d254d4aa3220e85f86d37c4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bf7961a79c2f403a89a4fb6d4b1a02e5", + "max": 6173629930, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1d8636d1d1c3442fbbc2fc83cd03fa44", + "value": 6173629930 + } + }, + "bbc5dbbb8b3548d485b878e194683cf2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c0432026ca204c4e89721fb22589c576", + "IPY_MODEL_c5672e82461148e29dc93af865e259ff", + "IPY_MODEL_f8053b94954d401a85ead90319aa4690" + ], + "layout": "IPY_MODEL_cfee6cbc1d15435bad0bc4a193542c10" + } + }, + "bc2b317065ed4b658995663ff202108a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bc9c3ba4bc6d4623a764d92890536935": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bce6b62300d942a1ab89a6f0ceb16d30": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3545fcc1e9d7453099c4931f658e0bc8", + "placeholder": "​", + "style": "IPY_MODEL_2f314258091a430095794c3fe0aea7e3", + "value": "Generating other split: " + } + }, + "bd4e3eb14252470d9c8ac60a32948a72": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2ef66808d51d440b9998064e212df420", + "placeholder": "​", + "style": "IPY_MODEL_45478f2ce991441c8ceaef9acf745084", + "value": " 12.2k/12.2k [00:00<00:00, 142kB/s]" + } + }, + "bd7da2671a22431889fbfa2ae1e0fe2c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bde010029c374a0eb2bb942f380f0e8b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "be90fac89e5243fb82389ed06d094c10": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_845211300fbd49f480d056bba9de83e1", + "max": 63056269, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_df4435b0da414c3880adf33a50c465e5", + "value": 63056269 + } + }, + "bed65245e7234977874d35bf78694e35": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", 
+ "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3a4bb5b9cf864265af8f1a0b989dff2c", + "max": 829, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ac2e1977ddaf4b948c9cc24d84c08b83", + "value": 829 + } + }, + "bf7961a79c2f403a89a4fb6d4b1a02e5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bfc9036c9c5f4be7ab191b92d92f1352": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + 
"description_width": "" + } + }, + "c0432026ca204c4e89721fb22589c576": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_35e6c9e3c5ea47a89cd0cbebfd197bde", + "placeholder": "​", + "style": "IPY_MODEL_f81bffac74ff4b07a09760a00620610c", + "value": "Downloading (…)olve/main/vocab.json: 100%" + } + }, + "c1a89a042b044278a8666da306e6a481": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"c1f019686c564cca87c240a75ab71ad3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_54589117bf244027ba024ea85bd1fd77", + "IPY_MODEL_7b7986ad93f64956b8d198d2cb4acb60", + "IPY_MODEL_dbc1a016a69b4ad7811d2701a9520a2f" + ], + "layout": "IPY_MODEL_72f8e7de2a5d4155a9e9146f12e14b19" + } + }, + "c24ddbaafa5f4a63b391d981a4f10354": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c28c2fc81e0e467cbc0ded0205d1ee85": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dd339bf6baf6433e92665e30dc30b062", + "placeholder": "​", + "style": "IPY_MODEL_6a544d6dd5954beab32ce3089d8d2aac", + "value": " 829/829 [00:00<00:00, 34.3kB/s]" + } + }, + "c2c2608dd091493795d975f1a3cc3762": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c3ea17f7dd94462986d30b751664d77b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": 
"LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c4119f7ec7464aab90f17d022e674999": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": 
null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c43169085f2949f1b8259cd0b767d121": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9980bca9b1334893bf6583c325122f50", + "placeholder": "​", + "style": "IPY_MODEL_931fc769fbeb42bf82df6ef5f914bf48", + "value": "Downloading data: 100%" + } + }, + "c44f472624d84e16a0f380580d36ca61": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_65188c1fdba2421ba85c2cc349709600", + "IPY_MODEL_0683cbffd97a4b75bd5e00a05d541fef", + "IPY_MODEL_4573606592ce4b2a914ed0a70b69f9af" + ], + "layout": "IPY_MODEL_efb85d003fb54f55aa4eadf2ab8b1684" + } + }, + "c47ba0b11e074f708338863a35a78f7b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8b0dd001d5b04647b1c480aab83d03a2", + "placeholder": "​", + "style": "IPY_MODEL_0af935ad4f694fb48094e6a119cbcf82", + "value": "Downloading (…)in/added_tokens.json: 100%" + } + }, + "c4cca1778f314ce582bd09b9b2494f82": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b1d780c721b840d7a06661a7a5e63236", + "placeholder": "​", + "style": "IPY_MODEL_d19402df47464044b36fb5ee4a0c1c4d", + "value": "Downloading data: 100%" + } + }, + "c4e7cf4e42554d61aa055da47ab0ee22": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + 
"max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c5672e82461148e29dc93af865e259ff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cb55e8ca8bf4477bb323ae45a6c60500", + "max": 1036558, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3e154f89e56146438bc45030e5254d8d", + "value": 1036558 + } + }, + "c60690c2aee74763bf23115553f4e640": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7d3d6c198e794219ab5db59f0228c8ab", + "IPY_MODEL_5d5ea0207c6148769ad9f15b7b3dd92d", + "IPY_MODEL_642e28d258ca4c30a5df94c5cf7e0471", + "IPY_MODEL_170ee581427d4f30925dc393d124c1be", + "IPY_MODEL_d5d5aa24182a4e04b3fdae1ca7fad52a" + ], + "layout": "IPY_MODEL_6dba643113a547ac9b6e121d008791d6" + } + }, + "c643b8650c31466a8614510928ab5d2e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c6c9fa535ff4458ba966183e1489905b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_56e8812fa66e40b0896a19dd3be7330c", + "placeholder": "​", + "style": "IPY_MODEL_116c89bb3a2045f28123e4f36067a7e5", + "value": " 185k/185k [00:00<00:00, 2.18MB/s]" + } + }, + "c7268972f75e4824893ebe7d893a18e1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7935e298049f4deeaeb278ba3de92291", + "placeholder": "​", + "style": "IPY_MODEL_3afca90970fc4925a05a9a1aa5c8d2f2", + "value": " 1.97k/1.97k [00:00<00:00, 70.2kB/s]" + } + }, + "c78544f32b564ba6832120b3167a2162": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c80954848a4a499a87708548bed8e7cc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + 
"overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c81c5d3a4dc5409e95a6410e67fa9857": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_33888a5cafe6495782309dae44531dd3", + "placeholder": "​", + "style": "IPY_MODEL_7d5ff2f1b8794bad8286e90307bd6a61", + "value": " 91.0M/91.0M [00:02<00:00, 43.0MB/s]" + } + }, + "c89cb73d51dd457d8c17ff97e74b7ca1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": 
null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c8ba787279ea43aa97b134c134d4f183": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cb55e8ca8bf4477bb323ae45a6c60500": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + 
"grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cbbba08d9e634560a6c0429e32166fe5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_79da882ea494477a8c94fdac7acd644e", + "placeholder": "​", + "style": "IPY_MODEL_f14847261aa247fb9561373e0495f3e5", + "value": "Downloading builder script: 100%" + } + }, + "cd585c98560b42c8b4a08df5b853b23c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_313090dc1f034ab19d5ffc573ea1aa5c", + "placeholder": "​", + "style": "IPY_MODEL_966c0400dafe4e3ea2c9baebc2e104fa", + "value": " 6.17G/6.17G [00:31<00:00, 201MB/s]" + } + }, + "cd9bda1053a14890ad9091c63c0a0acf": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cda6af0ca01d4053bcebe06f3c41d887": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f773a61b1dba4e3eb0df36162efe9abc", + "placeholder": "​", + "style": "IPY_MODEL_bfc9036c9c5f4be7ab191b92d92f1352", + "value": " 616k/616k [00:00<00:00, 1.31MB/s]" + } + }, + "cdd08679c28642a184805912c07b324e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": 
"LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ce06b2a0de6c4fb8bae36bc4d7f63270": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c4cca1778f314ce582bd09b9b2494f82", + "IPY_MODEL_3bd70d937e924f61b943acb0aaf15619", + "IPY_MODEL_c81c5d3a4dc5409e95a6410e67fa9857" + ], + "layout": "IPY_MODEL_dd02d1b31bfd4b19ad626d6691a9b293" + } + }, + "ce8f306e745d4b158c58058d471de037": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cfee6cbc1d15435bad0bc4a193542c10": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + 
"max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d046a46c70ea46ffbb04a3c9f55637d5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d140f5373bf144d1ae5d282e1a65647e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + 
}, + "d19402df47464044b36fb5ee4a0c1c4d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d1bc2ce48c3e481b9059e882b5102946": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_11b9c50e720a466aa92c64b254d40778", + "placeholder": "​", + "style": "IPY_MODEL_8f856d6bde4041149324e1f53d19c1cb", + "value": " 60.9k/60.9k [00:00<00:00, 169kB/s]" + } + }, + "d21cd4878c6e49d38dd3abb2e3b3f566": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_621880c98245427881dd5b004b480c6a", + "placeholder": "​", + "style": "IPY_MODEL_65c2716dd7f14afa93d3bfaebe85c44f", + "value": "Generating invalidated split: " + } + }, + "d2c14b2486e24150a8242ca37f4200f4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d2c1704e34c34d12b99c31d64fce88cc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + 
"margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d37791ea2b6c4152991295ca0edb0fb7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d3d7c15d53c8498e823c84fe609321dd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": 
null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d5d5aa24182a4e04b3fdae1ca7fad52a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6d5801774beb4b529b227ef2f098614e", + "placeholder": "​", + "style": "IPY_MODEL_dfdf23cde48c421caebb573060641d6a", + "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. 
" + } + }, + "d5d632dd16f147e090c62aad38c45d3c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d65e79ed6a9f4daea04e098959f12c80": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c89cb73d51dd457d8c17ff97e74b7ca1", + "max": 3486, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_921fd273a7254447a93fb997773aedab", + "value": 3486 + } + }, + 
"d8c1a66480204f1095ff5f6a7dd2e477": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9dc736113ef6477d91aaf71c9969ca74", + "IPY_MODEL_79521628c64b4d6f9f22e73749298693", + "IPY_MODEL_c7268972f75e4824893ebe7d893a18e1" + ], + "layout": "IPY_MODEL_a87598d464174703b5f5a5eca23543f3" + } + }, + "d92259acf9704ef9be6eced8a5f25dab": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d96245da43944c4b8235e0cd02c1aa4c": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d97f77f45e1e400494c2fbf2cd9d69a3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d9a85d7c76b54199bbf7646448e3458c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_564cd321c06440e9856f10f5c40c20be", + "placeholder": "​", + "style": "IPY_MODEL_8a91574c4b6e4745b2b65885323b4d25", + "value": " 3.44k/3.44k [00:00<00:00, 71.6kB/s]" + } + }, + "daee15869a92459aabcd9128526183d5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + 
"align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "daf4005d1d334608846d0fb2fe4f837a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_460be80f176849e0b1241e3a4fc18b74", + "placeholder": "​", + "style": "IPY_MODEL_74e2bcef1ce94234bbf6ba0d6488279d", + "value": "Downloading data: 100%" + } + }, + "db5e1bf1871546408f233f0cbc37b136": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_ed7daa32c94648d5951876229f9835b3", + "max": 5, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_eb883a37a9cf4945bd864decb4fc87ea", + "value": 5 + } + }, + "dbc1a016a69b4ad7811d2701a9520a2f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fb9f013a6188463fad6db70702576c37", + "placeholder": "​", + "style": "IPY_MODEL_93e2efbb5da747d4b94c916153ee9706", + "value": " 5/5 [00:24<00:00, 5.01s/it]" + } + }, + "dbe19b9505884a958e832c0362d547df": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6a40bc08e55a4605b478edf01ff088df", + "placeholder": "​", + "style": "IPY_MODEL_fbd304c2564e4acba84809377ba19e25", + "value": "Downloading (…)"pytorch_model.bin";: 100%" + } + }, + "dce0d285c7d947dfba9ee5bc1a6ebece": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"StyleView", + "bar_color": null, + "description_width": "" + } + }, + "dd02d1b31bfd4b19ad626d6691a9b293": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dd339bf6baf6433e92665e30dc30b062": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": 
null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dd634585d5e64c97881b132b1d59083e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ddc2a8e8ef4d429f95081c4c5baf1fb3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "deb18822d58b4b60bb75460f0a5fe921": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dec9f287435e4c6b9fb1d1ead2ded576": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_00071f8cf276478fb2740684552f1275", + "placeholder": "​", + "style": "IPY_MODEL_64378b1064dc4036a9a4c8813013e210", + "value": "Extracting data files: 100%" + } + }, + "df4435b0da414c3880adf33a50c465e5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "dfdf23cde48c421caebb573060641d6a": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e0529b81739144db8912c2d6789e729a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e1141861d9f44d4c95313fd432795b70": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e166fb45a33c495e9428bcdef8cd8813": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_098e92acdc444f64a031bd0daf070604", + "placeholder": "​", + "style": "IPY_MODEL_6feb1c05469b4bc8ac9c98cdf29fdd57", + "value": " 2.06k/2.06k [00:00<00:00, 70.6kB/s]" + } + }, + "e1fef249d35d482681cd39eefb7e0d7e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": 
"ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_54af0365067b498d86d23c453c5e38f1", + "max": 52666, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ab4ae16fa4f448838f677ea60523c905", + "value": 52666 + } + }, + "e20264d19e804f9dba3e3867ef9b31bd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e213d1c919314315ada180d49e27dfb6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + 
"children": [ + "IPY_MODEL_0ea1f163e4174684bd6efc2e2433c1d3", + "IPY_MODEL_09511a81a89d4754897a3507a84405be", + "IPY_MODEL_8338339ab8a242c1a485bce8558f9c39" + ], + "layout": "IPY_MODEL_8e84abf61e3e45d58efc7ccd0bfe8d37" + } + }, + "e2a8a379bd0d4cbdb22fbc3bbb4fdc7a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } + }, + "e3f1244dbe2c48bc8102f958c3df6467": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": 
null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e4234c7c29744fc4be99b8b2ebedc9d1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e4810a798c0f47b6b54f84ff4ffec608": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "info", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f617d181ffdb4b1e8d81fbb393923a6e", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a26ddb684a07496da4290e3f6031b685", + "value": 1 + } + }, + "e53d35abac8241bcaf6d3b0f732d1bb9": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e57b15cc74e7474083e87722d6acde47": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_451d3851e29e4efabbc2c235dea718da", + "max": 69713920, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_18dfdd09a3af49f5b17cda27872d0ba3", + "value": 69713920 + } + }, + "e590170f306347f3a82b76a25d37b652": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1dedb58d31dd43db96ddd7315bb7e2ee", + "placeholder": "​", + "style": "IPY_MODEL_308e1ee4593b454a84681bda10921207", + "value": "Downloading data: 100%" + } + }, + "e5e8f119a91944f296ff821dd7ecfb1b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e6c2dc814c324a0c8cb744ca16707479": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a615446a48624d0f9a009c1f6d8b1a54", + "IPY_MODEL_383d6e891c5249b4b2fabb60d4900488", + "IPY_MODEL_3c3214f235a54864848902f7e53662db" + ], + "layout": "IPY_MODEL_73f6e6860b64491284c9442f0deae8b6" + } + }, + "e6cf97ef7bc541d0b9c6de206a3a45b0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e6dcbcdf41c34e418be0b0b86e44d3f1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e7be6d842a0c4ba485dfa8e58338eed4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + 
"max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e899060f1edc43b980fe6f3bbe13c609": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_aaeec2b7986d493e8d238aa14f2e5937", + "placeholder": "​", + "style": "IPY_MODEL_0894264041854eea960707529f3fb8c7", + "value": " 69.7M/69.7M [00:03<00:00, 22.6MB/s]" + } + }, + "e8a7a34c6fb146f0b38a40f389a617fa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e8b3093587e44164b0ac043414cea0fa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e944fa694d824042845364bdba72d642": { + "model_module": "@jupyter-widgets/base", + 
"model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e97a0201b81446b882ba802a5a3b00e8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6a2dea21e7ce4eda8953995497308327", + "placeholder": "​", + "style": "IPY_MODEL_2f3d1d2f1c92402cadc4cfa1b0094238", + "value": " 789k/789k [00:00<00:00, 1.10MB/s]" + } + }, + "e9e10f1e53b74509bfc9c0bf11502c5e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ea941a078b984f51b66b5f1e8f3d1d82": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "eb03476353ef4568b94a3071918e72f2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": 
"HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9f9e15ff2e394ee7a7776e3e7f4b7d30", + "placeholder": "​", + "style": "IPY_MODEL_7e264d54d38e4d02acfd47e4e533b49d", + "value": " 2245/0 [00:15<00:00, 2226.90 examples/s]" + } + }, + "eb0d8c3f6de3468fb2b41ddf940de999": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3547b43905c94f479248a29b60e5a1ce", + "placeholder": "​", + "style": "IPY_MODEL_02ec476d0571401eb18332d2af3a13fd", + "value": "Downloading (…)/adapter_config.json: 100%" + } + }, + "eb46a602b3ef484daddbcb847957c3e5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f22aa0a93d8b44d0a4c412343ec1f48b", + "placeholder": "​", + "style": "IPY_MODEL_6e6a59f8e7454c2d886634eade47a21f", + "value": " 1/1 [00:03<00:00, 3.39s/it]" + } + }, + "eb883a37a9cf4945bd864decb4fc87ea": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ebd79e22ad4e4256a6d88883af2c1eef": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c47ba0b11e074f708338863a35a78f7b", + "IPY_MODEL_72b715be2c774235a21602b18d71e75a", + "IPY_MODEL_ff9a0ed54bab49aca6f27bf1be66958e" + ], + "layout": "IPY_MODEL_15b5e415b62146ba96215458cf116431" + } + }, + "ec6017ce2fb3431ab823bde05f977a61": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "ec7e7e3811e34b4a8c8cc31cd021cf20": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dd634585d5e64c97881b132b1d59083e", + "placeholder": "​", + "style": "IPY_MODEL_785f3df156b946449c492f1296656a70", + "value": " 581k/581k [00:00<00:00, 1.91MB/s]" + } + }, + "ecb7ffba323743c68961e294e74b337b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"ed7daa32c94648d5951876229f9835b3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ed80930617a64700a92b3bcff97c2885": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0dc05bee870740ffa72af34ace8c05d9", + "placeholder": "​", + "style": "IPY_MODEL_7a8b8f2a2160441dbdc54a62de83d297", + "value": "Downloading (…)"adapter_model.bin";: 100%" + } + }, + "edc24ce2510f45f8adde0a187016259f": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1b2780d8137042449bd6779c70bf43ca", + "placeholder": "​", + "style": "IPY_MODEL_ddc2a8e8ef4d429f95081c4c5baf1fb3", + "value": " 110M/110M [00:04<00:00, 30.6MB/s]" + } + }, + "ee1ae17fdf4143ae8125ce5e2a7e9066": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ef01f9efbfb749a3999c5b89ba0ee370": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ef056ad59e314089a012acaa73a54e4f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "efb85d003fb54f55aa4eadf2ab8b1684": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + 
"right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "efd9d5724dbd435991052b4445c6970f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "info", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e2a8a379bd0d4cbdb22fbc3bbb4fdc7a", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2c7d952f958247b681301d1c6bff4fa7", + "value": 1 + } + }, + "f027b4358c2c41a8a93c617641135bcd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "f09d5d5e7faf48f5adbaa90cbcf55162": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3ff0c6a4f4fc4b72a5163a7da50af4da", + "IPY_MODEL_e1fef249d35d482681cd39eefb7e0d7e", + "IPY_MODEL_0a8ca977f9db4d7794dcbcaca3c5cb96" + ], + "layout": "IPY_MODEL_e7be6d842a0c4ba485dfa8e58338eed4" + } + }, + "f0be69583cd1410da6dbc18302d4439b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f14847261aa247fb9561373e0495f3e5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f1884e5a392941bfa8c484496d87084c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": 
null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f20fc82bcbf245619ad4dec04d0f999d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f22aa0a93d8b44d0a4c412343ec1f48b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f309d7a096df4f119e6e6871b56913f1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5f283548f34848af90affe55a169b5a9", + "IPY_MODEL_51a4d85c08d745bda70ba0db731dca68", + "IPY_MODEL_eb46a602b3ef484daddbcb847957c3e5" + ], + "layout": "IPY_MODEL_8e2e49c6046e4dc0a0a810b4e58f80cc" + } + }, + "f3f191968f724e9bbb710b86d3657a66": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": 
null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f436e6a7d3014e2ca44c94455bfeace8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f9d6d41ffeba43cf94ec9d7a96af6617", + "IPY_MODEL_bed65245e7234977874d35bf78694e35", + "IPY_MODEL_c28c2fc81e0e467cbc0ded0205d1ee85" + ], + "layout": "IPY_MODEL_ea941a078b984f51b66b5f1e8f3d1d82" + } + }, + "f5d7433d15de45e997d12568cac536fc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f5ff1816a56243e4872ddfcd35331ad8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f617d181ffdb4b1e8d81fbb393923a6e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } + }, + "f628cb62f5fa446eb608467c1ecea526": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f773a61b1dba4e3eb0df36162efe9abc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, 
+ "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f7b4ec74e2ac45bbbf0265d0363f4d9f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2bc85a5bde9a454990d3bb7de5e3c7c1", + "placeholder": "​", + "style": "IPY_MODEL_2ab45b22ce3f400a81cc451b9d7c9eb8", + "value": "Downloading extra modules: 100%" + } + }, + "f8053b94954d401a85ead90319aa4690": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9abe7ed3e3d347b6bad7d1252bde226f", + "placeholder": "​", + "style": 
"IPY_MODEL_8ba244a082284f6dbac1c8f689c527a8", + "value": " 1.04M/1.04M [00:00<00:00, 5.93MB/s]" + } + }, + "f81bffac74ff4b07a09760a00620610c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f933fbdcc26d41b0bb294dabd0337834": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_950e74921e6042868ab6b7b9070d6f69", + "IPY_MODEL_82dc91c5b065459d827863699a9710e1", + "IPY_MODEL_5b7b6f08765c4c1989f945414f2c3cf4" + ], + "layout": "IPY_MODEL_735c3606df924b9297ddc05fae3e92d5" + } + }, + "f93f87ef211446379963df5bbb2e4ff0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f96c7014b47d4d048ff50d6f1f2da200": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_925f7aad24b547cc8071ee9bda713d7c", + "IPY_MODEL_4043e3ceb27c435d83913b3796bd9927", + "IPY_MODEL_b1183b9042744d3fa7437e2ec55b6cdd" + ], + "layout": "IPY_MODEL_20a0694ee1684ffc8289af9094e812e1" + } + }, + "f9d6d41ffeba43cf94ec9d7a96af6617": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e3f1244dbe2c48bc8102f958c3df6467", + "placeholder": "​", + "style": "IPY_MODEL_786d3d34cfd24f81996797c55b10b443", + "value": "Downloading (…)okenizer_config.json: 100%" + } + }, + "f9e5029c15054f5a9cbcda1ed5878995": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", 
+ "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4cc5b0a5c84e43f3bb185c63bc1f9a56", + "IPY_MODEL_af56501692b84e718fe3e8b1e452b7c2", + "IPY_MODEL_c6c9fa535ff4458ba966183e1489905b" + ], + "layout": "IPY_MODEL_b0e084a4c551427bb176061df894fdf6" + } + }, + "fa137c931d2c4e579c27893ca8ee1848": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fab643980982452988a02c14132c73ca": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_062727d821f248c9958ecc2d79237e5b", + "IPY_MODEL_2c264564d4c642ebb033703b1df02c69", + "IPY_MODEL_b19431ca425341568c2b3a8556431a8f" + ], + "layout": "IPY_MODEL_441e89acb62e47dab61b32b5a97110f3" + } + }, + "fb8aca596a1c4ac4a7428248b6c6f8b1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_168e19229aa5404bb151e4547bb31283", + "placeholder": "​", + "style": "IPY_MODEL_d140f5373bf144d1ae5d282e1a65647e", + "value": "adapter_model.bin: 100%" + } + }, + "fb9f013a6188463fad6db70702576c37": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": 
null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fbd27061ff114846aedc99bc2d17f7a7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "fbd304c2564e4acba84809377ba19e25": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fce64f5690024c698701330f0e5d039a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fe48e65b2371445bbb01a8d3e9af1f67": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ff9a0ed54bab49aca6f27bf1be66958e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b8d3539c8a454217a3b6ffab51259054", + "placeholder": "​", + "style": "IPY_MODEL_05e96819517e417aaf05f5f38c0c8b76", + "value": " 2.11k/2.11k [00:00<00:00, 124kB/s]" + } + }, + "ffce35af1ad84a2c838cca55a26dd3c4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d21cd4878c6e49d38dd3abb2e3b3f566", + "IPY_MODEL_969e3cf7f3634c3f90b5fea38c5797ca", + "IPY_MODEL_78501c2a5ac84f9ca0d20bbef340fc9f" + ], + "layout": "IPY_MODEL_5b4fbd1102a84670a1eed6f3d25c0bcd" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/int8_training/requirements.txt b/peft/examples/int8_training/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a53bafca0e0ab1d4a7af59c921cd1ca5f5f69aa --- /dev/null +++ b/peft/examples/int8_training/requirements.txt @@ -0,0 +1,9 @@ +accelerate +git+https://github.com/bitsandbytes-foundation/bitsandbytes.git +datasets==3.6.0 +evaluate +jiwer +librosa +soundfile +transformers==4.52.4 +wandb diff --git a/peft/examples/int8_training/run_adalora_whisper_int8.sh b/peft/examples/int8_training/run_adalora_whisper_int8.sh new file mode 100644 index 0000000000000000000000000000000000000000..39bd69d5f1287401b2bf54bd345b76c8f039d50b --- /dev/null +++ b/peft/examples/int8_training/run_adalora_whisper_int8.sh @@ -0,0 +1,37 @@ +accelerate launch --config_file config.yaml peft_adalora_whisper_large_training.py \ + --model_name_or_path "openai/whisper-large-v2" \ + --language "Marathi" \ + --language_abbr "mr" \ + --task "transcribe" \ + --dataset_name "mozilla-foundation/common_voice_11_0" \ + --push_to_hub \ + --preprocessing_num_workers 2 \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 8 \ + --dataloader_pin_memory \ + --dataloader_num_workers 2 \ + --learning_rate 1e-3 \ + --weight_decay 1e-4 \ + --num_train_epochs 3 \ + --gradient_accumulation_steps 1 \ + --lr_scheduler_type "linear" \ + --num_warmup_steps 50 \ + --output_dir "adalora_whisper_large_marathi_multi_adapter" \ + --seed 42 \ + 
--load_best_model \ + --with_tracking \ + --report_to "wandb" \ + --hub_token $HUB_TOKEN \ + --checkpointing_steps 2000 \ + --evaluation_steps 2000 \ + --logging_steps 25 \ + --use_peft \ + --use_adalora \ + --init_r 12 \ + --target_r 8 \ + --tinit 100 \ + --tfinal 800 \ + --delta_t 10 \ + --lora_alpha 32 \ + --lora_dropout 0.1 \ + --orth_reg_weight 0.5 \ No newline at end of file diff --git a/peft/examples/loftq_finetuning/LoftQ_weight_replacement.ipynb b/peft/examples/loftq_finetuning/LoftQ_weight_replacement.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..b03d8524cc73b04c4296f55fd2a26a5e14774323 --- /dev/null +++ b/peft/examples/loftq_finetuning/LoftQ_weight_replacement.ipynb @@ -0,0 +1,801 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "546b6c6d-f949-4387-9c41-6989223911f8", + "metadata": {}, + "source": [ + "# Initializing weights with LoftQ by replacing LoRA weights in-place" + ] + }, + { + "cell_type": "markdown", + "id": "d041ecb4-6957-467e-8f3e-d4a12c674e9f", + "metadata": {}, + "source": [ + "This notebook shows how to apply [LoftQ](https://huggingface.co/papers/2310.08659) initialization on our QLoRA model.\n", + "\n", + "In short, the idea behind LoftQ is the following. When we use QLoRA, i.e. we quantize the base model with bitsandbytes to save memory, and then train LoRA weights on top of this base model, we expect a certain performance gap. This is partly due to the fact that quantization is only an approximation of the \"real\" weights and thus introduces a quantization error. By default, LoRA weights are initialized such that they are a no-op at the start of the training. However, we can instead initialize them so that they minimize the quantization error. This is the idea behind LoftQ.\n", + "\n", + "Note that this only influences the initialization of the model. Everything that follows stays the same as always."
+ ] + }, + { + "cell_type": "markdown", + "id": "90d5420f-de32-42fa-8792-247f60e3647d", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a2c69b7c-c922-405f-aae1-ccc4f6911155", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import torch" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "22be0432-8798-44a2-9014-d929525e3059", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f087ce0f-71b4-45ec-b2f9-197677bbc1ee", + "metadata": {}, + "outputs": [], + "source": [ + "from peft import get_peft_model, LoraConfig, replace_lora_weights_loftq" + ] + }, + { + "cell_type": "markdown", + "id": "63fdf18e-4ac4-409e-8475-88147cf85067", + "metadata": {}, + "source": [ + "## Functions" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "af14bd0a-597e-446c-800b-619fc0599ee0", + "metadata": {}, + "outputs": [], + "source": [ + "def get_mae(x, y):\n", + " return (x - y).abs().mean()\n", + "\n", + "\n", + "def get_mse(x, y):\n", + " return torch.pow(x - y, 2).mean()\n", + "\n", + "\n", + "def error_report(x, y):\n", + " mae = get_mae(x, y)\n", + " mse = get_mse(x, y)\n", + " print(\n", + " f\"Mean absolute error: {mae:>8.5f}\\n\"\n", + " f\"Mean squared error: {mse:>8.5f}\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "1bc01a5f-7ee8-400f-8e80-3f2b7df29882", + "metadata": {}, + "source": [ + "## Base model" + ] + }, + { + "cell_type": "markdown", + "id": "fdc447d9-2f4f-4d0f-afdb-1cf5c4237321", + "metadata": {}, + "source": [ + "First, let's load a base model and calculate some logits. These logits are the baseline, i.e. we try to match their values as best as possible. We only need these logits for demonstration purposes. 
In practice, it is not necessary to load the non-quantized weights to apply LoftQ initialization.\n", + "\n", + "**Note**: We have to choose a model with a `model.safetensors` file. As PyTorch checkpoints (pickle) cannot be loaded lazily, we have to use [safetensors](https://huggingface.co/docs/safetensors/index). If those don't exist for your model, save the pretrained model as a safetensors file using `save_pretrained` and pass the model path to `replace_lora_weights_loftq`." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0cb29074-d180-4fdc-8a47-27d2b9857264", + "metadata": {}, + "outputs": [], + "source": [ + "model_id = \"bigscience/bloomz-560m\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e7ddd6a2-04dd-42ec-9f48-100a3946ae04", + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = AutoTokenizer.from_pretrained(model_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1f5b27db-51cc-41da-a21d-049ff747a149", + "metadata": {}, + "outputs": [], + "source": [ + "model = AutoModelForCausalLM.from_pretrained(model_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "51548b6a-945c-4797-b02a-0e3fc77d1242", + "metadata": {}, + "outputs": [], + "source": [ + "s = \"\"\"Beautiful is better than ugly.\n", + "Explicit is better than implicit.\n", + "Simple is better than complex.\n", + "Complex is better than complicated.\n", + "Flat is better than nested.\n", + "Sparse is better than dense.\n", + "Readability counts.\n", + "Special cases aren't special enough to break the rules.\n", + "Although practicality beats purity.\n", + "Errors should never pass silently.\n", + "Unless explicitly silenced.\n", + "In the face of ambiguity, refuse the temptation to guess.\n", + "There should be one-- and preferably only one --obvious way to do it.\n", + "Although that way may not be obvious at first unless you're Dutch.\n", + "Now is better than never.\n", + "Although never is often
better than *right* now.\n", + "If the implementation is hard to explain, it's a bad idea.\n", + "If the implementation is easy to explain, it may be a good idea.\n", + "Namespaces are one honking great idea -- let's do more of those!\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ce72d923-5283-48ba-96ef-7f859309ad84", + "metadata": {}, + "outputs": [], + "source": [ + "inputs = tokenizer(s.splitlines(), return_tensors=\"pt\", padding=True)" + ] + }, + { + "cell_type": "markdown", + "id": "3bfe54cb-76ef-4981-ba25-3e544d264c62", + "metadata": {}, + "source": [ + "Our baseline logits:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "04bebcaa-3a05-4621-9a03-e25de72fa27c", + "metadata": {}, + "outputs": [], + "source": [ + "logits_base = model(**inputs).logits" + ] + }, + { + "cell_type": "markdown", + "id": "fa9c9001-8ade-422d-92f8-bcafa50917c7", + "metadata": {}, + "source": [ + "## Normal LoRA model" + ] + }, + { + "cell_type": "markdown", + "id": "8024390b-736a-4b21-848b-aa4f30951d51", + "metadata": {}, + "source": [ + "Now we load the model quantized with bitsandbytes. For now, only 4bit is supported." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "01d1912a-646e-42d2-8292-6702b77d1948", + "metadata": {}, + "outputs": [], + "source": [ + "bnb_config = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_compute_dtype=torch.float16,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b1218717-4db4-48ce-978d-c05dc190fa91", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`low_cpu_mem_usage` was None, now set to True since model is quantized.\n" + ] + } + ], + "source": [ + "model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)" + ] + }, + { + "cell_type": "markdown", + "id": "a0b4e4c5-3932-4d9a-9457-41a05f24d556", + "metadata": {}, + "source": [ + "Next we create a LoRA model using PEFT and compute the logits of that model." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4741bce0-cd2b-4f05-a50c-4f9e56b43e72", + "metadata": {}, + "outputs": [], + "source": [ + "lora_config = LoraConfig(task_type=\"CAUSAL_LM\", target_modules=\"all-linear\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "cf55cc48-b55d-4806-b6ab-e9b8035ed526", + "metadata": {}, + "outputs": [], + "source": [ + "peft_model = get_peft_model(model, lora_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "f2f11e25-4a1e-485b-be4c-65aec62ac207", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + ".../bitsandbytes/nn/modules.py:391: UserWarning: Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.\n", + " warnings.warn('Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). 
This will lead to slow inference or training speed.')\n" + ] + } + ], + "source": [ + "logits_lora = peft_model(**inputs).logits" + ] + }, + { + "cell_type": "markdown", + "id": "5bc0cde7-0b9f-4305-ac0e-e3a6d2cfa401", + "metadata": {}, + "source": [ + "Let's check the influence of the quantization error on our logits:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "6f404c0d-f428-4923-9122-7b830410f089", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean absolute error: 3.61113\n", + "Mean squared error: 36.53259\n" + ] + } + ], + "source": [ + "error_report(logits_base, logits_lora)" + ] + }, + { + "cell_type": "markdown", + "id": "58c437e1-4fae-4a2f-9c42-ada6bedb9a4d", + "metadata": {}, + "source": [ + "## LoftQ" + ] + }, + { + "cell_type": "markdown", + "id": "1af05376-c8b0-48ec-8d80-7d7f4d32bbd7", + "metadata": {}, + "source": [ + "Next, let's use LoftQ initialization and see if it helps reduce the error." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "890e6108-3f02-469c-9e7d-f2144448227c", + "metadata": {}, + "outputs": [], + "source": [ + "replace_lora_weights_loftq(peft_model)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b452db0e-a510-42d3-bef5-f567186e26c2", + "metadata": {}, + "outputs": [], + "source": [ + "logits_loftq = peft_model(**inputs).logits" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "456dc564-f268-4cf3-9d59-a6942d3733ad", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean absolute error: 3.24111\n", + "Mean squared error: 31.13725\n" + ] + } + ], + "source": [ + "error_report(logits_base, logits_loftq)" + ] + }, + { + "cell_type": "markdown", + "id": "1ddf9e0f-3f78-426c-be59-77c6481674ec", + "metadata": {}, + "source": [ + "We can see that LoftQ initialization helped a little bit, but the difference is not huge." 
+ ] + }, + { + "cell_type": "markdown", + "id": "0dd344f2-249c-4fe9-8357-7fe3bcd1e82f", + "metadata": {}, + "source": [ + "## LoftQ with callback" + ] + }, + { + "cell_type": "markdown", + "id": "e2fd7dd5-88b3-40b8-95c2-3f3895d8093d", + "metadata": {}, + "source": [ + "To help with this, let's write a small callback function and pass it to `replace_lora_weights_loftq`. What this function does is that each time one weight is being replaced with LoftQ-initialized weights, we perform a test if the quantization error is actually reduced. If it is not, we roll back the replacement. This way, we keep only those replacements that improve the results." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "1f882802-22b7-4969-919e-120b1f2893d2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`low_cpu_mem_usage` was None, now set to True since model is quantized.\n" + ] + } + ], + "source": [ + "# Since PEFT has modified the base model, we should reload it\n", + "model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "c6438363-b66e-4507-8667-5a6df379a03f", + "metadata": {}, + "outputs": [], + "source": [ + "peft_model = get_peft_model(model, lora_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "7b93d082-0fcb-4b20-982e-c1aaf0c71d13", + "metadata": {}, + "outputs": [], + "source": [ + "current_mse = float(\"inf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "e22eb18d-b06e-47fe-91ba-ff34cbf62f60", + "metadata": {}, + "outputs": [], + "source": [ + "def my_callback(model, module_name):\n", + " \"\"\"Callable to replace weights with LoFTQ if the mse is lower than the current best one.\"\"\"\n", + " global current_mse\n", + "\n", + " logits = model(**inputs).logits\n", + " mse = get_mse(logits_base, logits)\n", + " if mse < current_mse:\n", + " current_mse
= mse\n", + " print(f\"MSE improved for module {module_name}\")\n", + " return True\n", + " print(f\"MSE did not improve for module {module_name}\")\n", + " return False" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "44ee90d1-e15a-4740-a39d-ebf9e7adb79c", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MSE improved for module transformer.h.0.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.0.self_attention.dense\n", + "MSE improved for module transformer.h.0.mlp.dense_h_to_4h\n", + "MSE improved for module transformer.h.0.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.1.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.1.self_attention.dense\n", + "MSE did not improve for module transformer.h.1.mlp.dense_h_to_4h\n", + "MSE improved for module transformer.h.1.mlp.dense_4h_to_h\n", + "MSE improved for module transformer.h.2.self_attention.query_key_value\n", + "MSE improved for module transformer.h.2.self_attention.dense\n", + "MSE improved for module transformer.h.2.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.2.mlp.dense_4h_to_h\n", + "MSE improved for module transformer.h.3.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.3.self_attention.dense\n", + "MSE improved for module transformer.h.3.mlp.dense_h_to_4h\n", + "MSE improved for module transformer.h.3.mlp.dense_4h_to_h\n", + "MSE improved for module transformer.h.4.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.4.self_attention.dense\n", + "MSE improved for module transformer.h.4.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.4.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.5.self_attention.query_key_value\n", + "MSE improved for module transformer.h.5.self_attention.dense\n", + "MSE 
improved for module transformer.h.5.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.5.mlp.dense_4h_to_h\n", + "MSE improved for module transformer.h.6.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.6.self_attention.dense\n", + "MSE improved for module transformer.h.6.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.6.mlp.dense_4h_to_h\n", + "MSE improved for module transformer.h.7.self_attention.query_key_value\n", + "MSE improved for module transformer.h.7.self_attention.dense\n", + "MSE did not improve for module transformer.h.7.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.7.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.8.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.8.self_attention.dense\n", + "MSE improved for module transformer.h.8.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.8.mlp.dense_4h_to_h\n", + "MSE improved for module transformer.h.9.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.9.self_attention.dense\n", + "MSE did not improve for module transformer.h.9.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.9.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.10.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.10.self_attention.dense\n", + "MSE did not improve for module transformer.h.10.mlp.dense_h_to_4h\n", + "MSE improved for module transformer.h.10.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.11.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.11.self_attention.dense\n", + "MSE did not improve for module transformer.h.11.mlp.dense_h_to_4h\n", + "MSE improved for module transformer.h.11.mlp.dense_4h_to_h\n", + "MSE improved for module transformer.h.12.self_attention.query_key_value\n", + "MSE did not 
improve for module transformer.h.12.self_attention.dense\n", + "MSE improved for module transformer.h.12.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.12.mlp.dense_4h_to_h\n", + "MSE improved for module transformer.h.13.self_attention.query_key_value\n", + "MSE improved for module transformer.h.13.self_attention.dense\n", + "MSE did not improve for module transformer.h.13.mlp.dense_h_to_4h\n", + "MSE improved for module transformer.h.13.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.14.self_attention.query_key_value\n", + "MSE improved for module transformer.h.14.self_attention.dense\n", + "MSE did not improve for module transformer.h.14.mlp.dense_h_to_4h\n", + "MSE improved for module transformer.h.14.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.15.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.15.self_attention.dense\n", + "MSE did not improve for module transformer.h.15.mlp.dense_h_to_4h\n", + "MSE improved for module transformer.h.15.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.16.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.16.self_attention.dense\n", + "MSE improved for module transformer.h.16.mlp.dense_h_to_4h\n", + "MSE improved for module transformer.h.16.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.17.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.17.self_attention.dense\n", + "MSE improved for module transformer.h.17.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.17.mlp.dense_4h_to_h\n", + "MSE improved for module transformer.h.18.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.18.self_attention.dense\n", + "MSE did not improve for module transformer.h.18.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.18.mlp.dense_4h_to_h\n", + "MSE did not 
improve for module transformer.h.19.self_attention.query_key_value\n", + "MSE improved for module transformer.h.19.self_attention.dense\n", + "MSE improved for module transformer.h.19.mlp.dense_h_to_4h\n", + "MSE improved for module transformer.h.19.mlp.dense_4h_to_h\n", + "MSE improved for module transformer.h.20.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.20.self_attention.dense\n", + "MSE did not improve for module transformer.h.20.mlp.dense_h_to_4h\n", + "MSE improved for module transformer.h.20.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.21.self_attention.query_key_value\n", + "MSE improved for module transformer.h.21.self_attention.dense\n", + "MSE did not improve for module transformer.h.21.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.21.mlp.dense_4h_to_h\n", + "MSE improved for module transformer.h.22.self_attention.query_key_value\n", + "MSE improved for module transformer.h.22.self_attention.dense\n", + "MSE improved for module transformer.h.22.mlp.dense_h_to_4h\n", + "MSE improved for module transformer.h.22.mlp.dense_4h_to_h\n", + "MSE improved for module transformer.h.23.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.23.self_attention.dense\n", + "MSE improved for module transformer.h.23.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.23.mlp.dense_4h_to_h\n" + ] + } + ], + "source": [ + "replace_lora_weights_loftq(peft_model, callback=my_callback)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "e31adc81-a090-49b2-90f6-9906743c76ae", + "metadata": {}, + "outputs": [], + "source": [ + "logits_loftq_callback = peft_model(**inputs).logits" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "7c640092-1f26-48be-bea4-487511205440", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean absolute error: 1.79576\n", + "Mean 
squared error: 8.47075\n" + ] + } + ], + "source": [ + "error_report(logits_base, logits_loftq_callback)" + ] + }, + { + "cell_type": "markdown", + "id": "1896857e-3d87-44a9-887f-90c765bc8d91", + "metadata": {}, + "source": [ + "We can see that applying LoftQ with the help of the callback reduced the error quite significantly." + ] + }, + { + "cell_type": "markdown", + "id": "8eaf86cf-4fb4-455d-ab07-892591564303", + "metadata": {}, + "source": [ + "## Applying LoftQ multiple times" + ] + }, + { + "cell_type": "markdown", + "id": "70836a75-5c6d-4b7b-9175-f395aef8383b", + "metadata": {}, + "source": [ + "It is possible to run `replace_lora_weights_loftq` multiple times on the same model when using the callback." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "8e5ee38c-007c-4c75-9248-005d94b19445", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MSE did not improve for module transformer.h.0.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.0.self_attention.dense\n", + "MSE did not improve for module transformer.h.0.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.0.mlp.dense_4h_to_h\n", + "MSE improved for module transformer.h.1.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.1.self_attention.dense\n", + "MSE did not improve for module transformer.h.1.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.1.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.2.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.2.self_attention.dense\n", + "MSE did not improve for module transformer.h.2.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.2.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.3.self_attention.query_key_value\n", + "MSE did not improve for module 
transformer.h.3.self_attention.dense\n", + "MSE did not improve for module transformer.h.3.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.3.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.4.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.4.self_attention.dense\n", + "MSE did not improve for module transformer.h.4.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.4.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.5.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.5.self_attention.dense\n", + "MSE did not improve for module transformer.h.5.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.5.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.6.self_attention.query_key_value\n", + "MSE improved for module transformer.h.6.self_attention.dense\n", + "MSE did not improve for module transformer.h.6.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.6.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.7.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.7.self_attention.dense\n", + "MSE did not improve for module transformer.h.7.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.7.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.8.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.8.self_attention.dense\n", + "MSE did not improve for module transformer.h.8.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.8.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.9.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.9.self_attention.dense\n", + "MSE did not improve for module transformer.h.9.mlp.dense_h_to_4h\n", + "MSE did not improve for module 
transformer.h.9.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.10.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.10.self_attention.dense\n", + "MSE improved for module transformer.h.10.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.10.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.11.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.11.self_attention.dense\n", + "MSE did not improve for module transformer.h.11.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.11.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.12.self_attention.query_key_value\n", + "MSE improved for module transformer.h.12.self_attention.dense\n", + "MSE did not improve for module transformer.h.12.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.12.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.13.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.13.self_attention.dense\n", + "MSE did not improve for module transformer.h.13.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.13.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.14.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.14.self_attention.dense\n", + "MSE did not improve for module transformer.h.14.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.14.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.15.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.15.self_attention.dense\n", + "MSE did not improve for module transformer.h.15.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.15.mlp.dense_4h_to_h\n", + "MSE improved for module transformer.h.16.self_attention.query_key_value\n", + "MSE did not improve for module 
transformer.h.16.self_attention.dense\n", + "MSE did not improve for module transformer.h.16.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.16.mlp.dense_4h_to_h\n", + "MSE improved for module transformer.h.17.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.17.self_attention.dense\n", + "MSE did not improve for module transformer.h.17.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.17.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.18.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.18.self_attention.dense\n", + "MSE did not improve for module transformer.h.18.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.18.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.19.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.19.self_attention.dense\n", + "MSE did not improve for module transformer.h.19.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.19.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.20.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.20.self_attention.dense\n", + "MSE did not improve for module transformer.h.20.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.20.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.21.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.21.self_attention.dense\n", + "MSE did not improve for module transformer.h.21.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.21.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.22.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.22.self_attention.dense\n", + "MSE did not improve for module transformer.h.22.mlp.dense_h_to_4h\n", + "MSE did not improve for module 
transformer.h.22.mlp.dense_4h_to_h\n", + "MSE did not improve for module transformer.h.23.self_attention.query_key_value\n", + "MSE did not improve for module transformer.h.23.self_attention.dense\n", + "MSE did not improve for module transformer.h.23.mlp.dense_h_to_4h\n", + "MSE did not improve for module transformer.h.23.mlp.dense_4h_to_h\n" + ] + } + ], + "source": [ + "replace_lora_weights_loftq(peft_model, callback=my_callback)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "2abe2702-9510-4814-b5f2-63140a102c17", + "metadata": {}, + "outputs": [], + "source": [ + "logits_loftq_callback_twice = peft_model(**inputs).logits" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "e908de14-01f9-4fdc-91b5-61118a3ce6cb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean absolute error: 1.76357\n", + "Mean squared error: 8.33938\n" + ] + } + ], + "source": [ + "error_report(logits_base, logits_loftq_callback_twice)" + ] + }, + { + "cell_type": "markdown", + "id": "5b8b09fe-d369-4444-b6e2-cd514e775637", + "metadata": {}, + "source": [ + "There are further gains, but they are not very big." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/loftq_finetuning/README.md b/peft/examples/loftq_finetuning/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b8d204d3cea02c7a71ce61252d390c696c74d759 --- /dev/null +++ b/peft/examples/loftq_finetuning/README.md @@ -0,0 +1,144 @@ +# LoftQ: LoRA-fine-tuning-aware Quantization + +## Introduction + +LoftQ finds quantized LoRA initialization: quantized backbone Q and LoRA adapters A and B, given a pre-trained weight W. + +## Quick Start +Steps: + +1. Apply LoftQ to a full-precision pre-trained weight and save. +2. Load LoftQ initialization and train. + +For step 1, we have provided off-the-shelf LoftQ initializations (see [supported model list](#appendix-off-the-shelf-model-list)) +in [Huggingface Hub LoftQ](https://huggingface.co/LoftQ). +If you want to do it yourself, jump to [LoftQ DIY](#loftq-diy). + +For step 2, below is an example of loading 4bit Mistral-7B with 64rank LoRA adapters from Huggingface Hub. 
+```python +import torch +from transformers import AutoModelForCausalLM, BitsAndBytesConfig +from peft import PeftModel + +MODEL_ID = "LoftQ/Mistral-7B-v0.1-4bit-64rank" + +base_model = AutoModelForCausalLM.from_pretrained( + MODEL_ID, + torch_dtype=torch.bfloat16, # you may change it with different models + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.bfloat16, # bfloat16 is recommended + bnb_4bit_use_double_quant=False, + bnb_4bit_quant_type='nf4', + ), +) +peft_model = PeftModel.from_pretrained( + base_model, + MODEL_ID, + subfolder="loftq_init", + is_trainable=True, +) + +# Do training with peft_model ... +``` + +## LoftQ DIY + +### Apply LoftQ and save +We provide [quantize_save_load.py](quantize_save_load.py) as an example to apply LoftQ with +different bits (`--bits`), ranks (`--rank`), and alternating steps (`--iter`, a hyper-parameter in LoftQ, see Algorithm 1 in [LoftQ paper](https://huggingface.co/papers/2310.08659)). Currently, this example supports +`llama-2`, `falcon`, `mistral`, `bart`, `t5`, `deberta`, `bert`, `roberta`. + +Below is an example of obtaining 4bit LLAMA-2-7b with 16-rank LoRA adapters by 5 alternating steps. Here `--model_name_or_path` is the id of the high-precision model on the HF Hub and `--token` is your HF token, which is needed if the model is private (e.g., llama-2). +```sh +SAVE_DIR="model_zoo/loftq/" +python quantize_save_load.py \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --token HF_TOKEN \ + --bits 4 \ + --iter 5 \ + --rank 16 \ + --save_dir $SAVE_DIR +``` + +The above command creates the model directory under `$SAVE_DIR`. +Specifically, the model directory is named as + +`MODEL_DIR = SAVE_DIR + f"{args.model_name_or_path.split('/')[-1]}-{args.bits}bit-{args.rank}rank"` + +In this example, `MODEL_DIR="model_zoo/loftq/Llama-2-7b-hf-4bit-16rank"`, where the backbone is stored in `$MODEL_DIR` +and the LoRA adapters are at the sub-folder `$MODEL_DIR/loftq_init`.
+ +### Load and train +Similar to loading from Huggingface Hub, we only need to change the `MODEL_ID` to the `MODEL_DIR`. + +```python +import torch +from transformers import AutoModelForCausalLM, BitsAndBytesConfig +from peft import PeftModel + +MODEL_DIR = "model_zoo/loftq/Llama-2-7b-hf-4bit-16rank" + +base_model = AutoModelForCausalLM.from_pretrained( + MODEL_DIR, + torch_dtype=torch.bfloat16, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_use_double_quant=False, + bnb_4bit_quant_type='nf4', + ), +) +peft_model = PeftModel.from_pretrained( + base_model, + MODEL_DIR, + subfolder="loftq_init", + is_trainable=True, +) +# Do training with peft_model ... +``` + +## LoftQ Fine-tuning + +We also provide an example to fine-tune LoftQ on GSM8K. +We load the quantized backbone and LoRA adapters from the [LoftQ Huggingface hub](https://huggingface.co/LoftQ). + +```sh +python train_gsm8k_llama.py \ + --model_name_or_path LoftQ/Llama-2-13b-hf-4bit-64rank \ + --output_dir exp_results/gsm8k/llama-2-13b/bit4-rank64/lr1e-4 \ + --learning_rate 1e-4 \ + --weight_decay 0.1 \ + --lr_scheduler_type cosine \ + --num_warmup_steps 100 \ + --seed 202 \ + --dataset_name gsm8k \ + --dataset_config main \ + --pad_to_max_length \ + --max_source_length 128 \ + --max_target_length 256 \ + --num_train_epochs 5 \ + --per_device_train_batch_size 4 \ + --per_device_eval_batch_size 4 \ + --gradient_accumulation_steps 4 \ + --with_tracking \ + --report_to tensorboard +``` + + +## Appendix: Off-the-shelf Model List +| Model Name | Bits | Ranks | +| ----------- | ---- | ----- | +| LLAMA-2-7b | 4 | 64 | +| LLAMA-2-13b | 4 | 64 | +| LLAMA-2-70b | 4 | 64 | +| Mistral | 4 | 64 | +| Mistral | 4 | 32 | +| BART-large | 4 | 8 | +| BART-large | 4 | 16 | +| BART-large | 4 | 32 | +| BART-large | 2 | 8 | + +## In-place application of LoftQ initialization + +PEFT provides a convenience function `replace_lora_weights_loftq` to apply LoftQ 
initialization in-place to the quantized model. Check out [this notebook](https://github.com/huggingface/peft/blob/main/examples/loftq_finetuning/LoftQ_weight_replacement.ipynb) for an example. diff --git a/peft/examples/loftq_finetuning/quantize_save_load.py b/peft/examples/loftq_finetuning/quantize_save_load.py new file mode 100644 index 0000000000000000000000000000000000000000..2110ed908684a658896bbfbe065130cfa35490a6 --- /dev/null +++ b/peft/examples/loftq_finetuning/quantize_save_load.py @@ -0,0 +1,193 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os + +import torch +import torch.nn as nn +from transformers import ( + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification, + AutoTokenizer, +) + +from peft import LoftQConfig, LoraConfig, TaskType, get_peft_model + + +class Shell(nn.Module): + def __init__(self, weight, bias=None): + super().__init__() + self.weight = nn.Parameter(weight, requires_grad=False) + if bias is not None: + self.bias = nn.Parameter(bias, requires_grad=False) + + +def unwrap_model(model, sub_module_name=".base_layer"): + sub_module_name_list = [k.split(sub_module_name)[0] for k in model.state_dict().keys() if sub_module_name in k] + sub_module_name_set = set(sub_module_name_list) + for name in sub_module_name_set: + # get the parent of the submodule + name_parent = ".".join(name.split(".")[:-1]) + name_child = name.split(".")[-1] + sub_module = model.get_submodule(name_parent) + print(sub_module) + + # replace with shell + child = getattr(sub_module, name_child) + weight = getattr(child.base_layer, "weight", None) + bias = getattr(child.base_layer, "bias", None) + shell = Shell(weight, bias) + + setattr(sub_module, name_child, shell) + + print("You have unwrapped the model.
Use it on your own risk.") + + +def print_model(model, name): + print("=" * 10 + name + "=" * 10) + print(model) + for name, param in model.named_parameters(): + if torch.is_tensor(param): + if param.dtype in [torch.float32, torch.float16]: + print( + name, + param.shape, + param.device, + param.dtype, + param.requires_grad, + param.mean().item(), + param.max().item(), + ) + else: + print(name, param.shape, param.device, param.dtype, param.requires_grad) + + +def arg_parse(): + parser = argparse.ArgumentParser(description="Quantize a model with LoftQ.") + parser.add_argument( + "--model_name_or_path", + type=str, + default=None, + required=True, + help="The name or path of the fp32/16 model.", + ) + parser.add_argument( + "--token", + type=str, + default=None, + help="The access token to download model from HuggingFace Hub.", + ) + parser.add_argument( + "--bits", + type=int, + default=4, + help="The quantized bits", + ) + parser.add_argument( + "--iter", + type=int, + default=1, + help="The alternating steps in LoftQ", + ) + parser.add_argument( + "--rank", + type=int, + default=16, + help="The rank of the LoRA adapter", + ) + parser.add_argument( + "--save_dir", + type=str, + default="./model_zoo/loftq/", + help="The directory to save the quantized backbone and the LoRA adapters", + ) + args = parser.parse_args() + return args + + +def quantize_and_save(): + args = arg_parse() + + # Download weights and configure LoRA + tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, token=args.token, trust_remote_code=True) + if any(name in args.model_name_or_path.lower() for name in ["llama", "mistral", "falcon"]): + model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, token=args.token, trust_remote_code=True) + task_type = TaskType.CAUSAL_LM + target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj", "gate_proj"] + + elif any(name in args.model_name_or_path.lower() for name in ["bart", "t5"]): + model =
AutoModelForSeq2SeqLM.from_pretrained(args.model_name_or_path, token=args.token) + task_type = TaskType.SEQ_2_SEQ_LM + target_modules = ["q_proj", "k_proj", "v_proj", "fc1", "fc2", "out_proj"] + + elif any(name in args.model_name_or_path.lower() for name in ["deberta", "roberta", "bert"]): + model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path, token=args.token) + task_type = TaskType.SEQ_CLS + target_modules = ["query_proj", "key_proj", "value_proj", "dense"] # embeddings not supported by peft + else: + raise NotImplementedError("Other models not supported yet.") + + # Config of LoftQ + loftq_config = LoftQConfig(loftq_bits=args.bits, loftq_iter=args.iter) + + lora_config = LoraConfig( + task_type=task_type, + inference_mode=True, + r=args.rank, + lora_alpha=16 if task_type is TaskType.CAUSAL_LM else args.rank, + lora_dropout=0.1, + target_modules=target_modules, + init_lora_weights="loftq", + loftq_config=loftq_config, + ) + + # Obtain LoftQ model + lora_model = get_peft_model(model, lora_config) + base_model = lora_model.get_base_model() + + # Save LoftQ model + model_name = args.model_name_or_path.split("/")[-1] + f"-{args.bits}bit" + f"-{args.rank}rank" + base_model_dir = os.path.join(args.save_dir, model_name) + lora_model_dir = os.path.join(args.save_dir, model_name, "loftq_init") # must match the subfolder="loftq_init" used when loading the adapters + + # save lora adapters first + lora_model.base_model.peft_config[ + "default" + ].base_model_name_or_path = base_model_dir # This can be a local path or Hub model id + lora_model.base_model.peft_config["default"].init_lora_weights = True # Don't apply LoftQ when loading again + + lora_model.save_pretrained(lora_model_dir) + print_model(lora_model, "lora_model") + + # remove lora adapters and save the backbone + unwrap_model(base_model) + base_model.save_pretrained(base_model_dir) + tokenizer.save_pretrained(base_model_dir) + + print_model(base_model, "base_model") + + return base_model_dir, lora_model_dir + + +if __name__ == "__main__": + base_dir,
lora_dir = quantize_and_save() + +# example command: +# python quantize_save_load.py \ +# --model_name_or_path meta-llama/Llama-2-7b-hf \ +# --token XXX \ +# --bits 4 --iter 5 --rank 16 \ +# --save_dir ./model_zoo/loftq/ diff --git a/peft/examples/loftq_finetuning/train_gsm8k_llama.py b/peft/examples/loftq_finetuning/train_gsm8k_llama.py new file mode 100644 index 0000000000000000000000000000000000000000..66b83d55e38d9c7c20231d08e371ed15aea23ca9 --- /dev/null +++ b/peft/examples/loftq_finetuning/train_gsm8k_llama.py @@ -0,0 +1,851 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import copy +import logging +import math +import os +import random +import re +from pathlib import Path + +import datasets +import torch +import transformers +from accelerate import Accelerator, DistributedType +from accelerate.logging import get_logger +from accelerate.utils import set_seed +from datasets import load_dataset +from huggingface_hub import HfApi +from torch.utils.data import DataLoader +from tqdm.auto import tqdm +from transformers import ( + CONFIG_MAPPING, + MODEL_MAPPING, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + SchedulerType, + default_data_collator, + get_scheduler, +) +from transformers.utils import send_example_telemetry +from transformers.utils.versions import require_version + +from peft import PeftModel + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +# check_min_version("4.32.0.dev0") + +logger = get_logger(__name__) + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task") + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help="The name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The configuration name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--train_file", type=str, default=None, help="A csv, txt or a json file containing the training data." + ) + parser.add_argument( + "--validation_file", type=str, default=None, help="A csv, txt or a json file containing the validation data."
+ ) + parser.add_argument( + "--validation_split_percentage", + type=int, + default=5, + help="The percentage of the train set used as validation set in case there's no validation split", + ) + parser.add_argument( + "--model_name_or_path", + type=str, + help="Path to pretrained model or model identifier from huggingface.co/models.", + required=False, + ) + parser.add_argument( + "--config_name", + type=str, + default=None, + help="Pretrained config name or path if not the same as model_name", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--use_slow_tokenizer", + action="store_true", + help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).", + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=8, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") + parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform.
If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], + ) + parser.add_argument( + "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.") + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--model_type", + type=str, + default=None, + help="Model type to use if training from scratch.", + choices=MODEL_TYPES, + ) + parser.add_argument( + "--ignore_pad_token_for_loss", + type=lambda value: str(value).strip().lower() in ("1", "true", "t", "yes", "y"), # bool("False") is True, so parse the text explicitly + default=True, + help="Whether to ignore the tokens corresponding to padded labels in the loss computation or not.", + ) + parser.add_argument( + "--max_source_length", + type=int, + default=128, + help=( + "The maximum total input sequence length after " + "tokenization. Sequences longer than this will be truncated, sequences shorter will be padded." + ), + ) + parser.add_argument( + "--max_target_length", + type=int, + default=128, + help=( + "The maximum total sequence length for target text after " + "tokenization. Sequences longer than this will be truncated, sequences shorter will be padded " + "during ``evaluate`` and ``predict``." + ), + ) + parser.add_argument( + "--pad_to_max_length", + action="store_true", + help="If passed, pad all samples to `max_length`.
Otherwise, dynamic padding is used.", + ) + parser.add_argument( + "--preprocessing_num_workers", + type=int, + default=None, + help="The number of processes to use for the preprocessing.", + ) + parser.add_argument( + "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" + ) + parser.add_argument( + "--no_keep_linebreaks", action="store_true", help="Do not keep line breaks when using TXT files." + ) + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument( + "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." + ) + parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--trust_remote_code", + type=lambda value: str(value).strip().lower() in ("1", "true", "t", "yes", "y"), # bool("False") is True, so parse the text explicitly + default=False, + help=( + "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option " + "should only be set to `True` for repositories you trust and in which you have read the code, as it will " + "execute code present on the Hub on your local machine." + ), + ) + parser.add_argument( + "--checkpointing_steps", + type=str, + default=None, + help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.", + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help="If the training should continue from a checkpoint folder.", + ) + parser.add_argument( + "--with_tracking", + action="store_true", + help="Whether to enable experiment trackers for logging.", + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,' + ' `"wandb"`, `"comet_ml"` and `"clearml"`. Use `"all"` to report to all integrations. ' + "Only applicable when `--with_tracking` is passed."
+ ), + ) + parser.add_argument( + "--low_cpu_mem_usage", + action="store_true", + help=( + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " + "If passed, LLM loading time and RAM consumption will be benefited." + ), + ) + ########################## + # Generation Config # + ########################## + parser.add_argument( + "--temperature", + type=float, + default=0.8, + help="temperature of 1.0 has no effect, lower tend toward greedy sampling", + ) + parser.add_argument("--k", type=int, default=40, help="Choose k candidate words") + parser.add_argument("--p", type=float, default=0.95, help="Top-p (nucleus) sampling: the cumulative probability mass of the candidate words") + + ########################## + # Exp Args # + ########################## + parser.add_argument( + "--adapter_name_or_path", + type=str, + default=None, + help=( + "The LoRA adapter checkpoint. Set None if you want to fine-tune from LoftQ. " + "Specify a path if you want to evaluate." + ), + ) + + args = parser.parse_args() + + # Sanity checks + if args.dataset_name is None and args.train_file is None and args.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if args.train_file is not None: + extension = args.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file." + if args.validation_file is not None: + extension = args.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file." + + if args.push_to_hub: + assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed." + + return args + + +def main(): + args = parse_args() + + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them.
The + # information sent is the one passed as arguments along with your Python/PyTorch versions. + send_example_telemetry("run_clm_no_trainer", args) + + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. + # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers + # in the environment + accelerator_log_kwargs = {} + + if args.with_tracking: + accelerator_log_kwargs["log_with"] = args.report_to + accelerator_log_kwargs["project_dir"] = args.output_dir + + accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. 
+ if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + api = HfApi(token=args.hub_token) + + # Create repo (repo_name from args or inferred) + repo_name = args.hub_model_id + if repo_name is None: + repo_name = Path(args.output_dir).absolute().name + repo_id = api.create_repo(repo_name, exist_ok=True).repo_id + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + accelerator.wait_for_everyone() + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. 
+ raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[:{args.validation_split_percentage}%]", + ) + raw_datasets["train"] = load_dataset( + args.dataset_name, + args.dataset_config_name, + split=f"train[{args.validation_split_percentage}%:]", + ) + else: + data_files = {} + dataset_args = {} + if args.train_file is not None: + data_files["train"] = args.train_file + if args.validation_file is not None: + data_files["validation"] = args.validation_file + extension = args.train_file.split(".")[-1] + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks + raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + **dataset_args, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + **dataset_args, + ) + + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
+ if args.config_name: + config = AutoConfig.from_pretrained( + args.config_name, + trust_remote_code=args.trust_remote_code, + ) + elif args.model_name_or_path: + config = AutoConfig.from_pretrained( + args.model_name_or_path, + trust_remote_code=args.trust_remote_code, + ) + else: + config = CONFIG_MAPPING[args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + + if args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer_name, use_fast=not args.use_slow_tokenizer, trust_remote_code=args.trust_remote_code + ) + elif args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained( + args.model_name_or_path, + use_fast=not args.use_slow_tokenizer, + trust_remote_code=args.trust_remote_code, + ) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." + ) + + ########################## + # Tokenizer # + ########################## + tokenizer.pad_token_id = 0 # unk. 
we want this to be different from the eos token + tokenizer.padding_side = "left" # Allow batched inference + tokenizer.truncation_side = "left" + + if args.model_name_or_path: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + low_cpu_mem_usage=True, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=config.torch_dtype, + ), + ) + else: + logger.info("Training new model from scratch") + model = AutoModelForCausalLM.from_config(config, trust_remote_code=args.trust_remote_code) + + ########################## + # Peft Model # + ########################## + if args.adapter_name_or_path is None: + model = PeftModel.from_pretrained(model, args.model_name_or_path, subfolder="loftq_init", is_trainable=True) + else: + model = PeftModel.from_pretrained(model, args.adapter_name_or_path, is_trainable=True) + model.print_trainable_parameters() + + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) + + # Preprocessing the datasets. + # First we tokenize all the texts. + ########################## + # GSM8K dataset # + ########################## + + # Preprocessing the datasets. + # First we tokenize all the texts. + column_names = raw_datasets["train"].column_names + + # Get the column names for source/target. + source_column, target_column = "question", "answer" + + # Temporarily set max_target_length for training. + padding = "max_length" if args.pad_to_max_length else False + task_prompt = "\nAnswer the above question. 
First think step by step and then answer the final number.\n" + + def prompt_process(sent_1, sent_2, prompt_1="", prompt_2="", prompt_3=""): + sent_2 = sent_2.replace("####", "The final answer is") + return prompt_1 + sent_1 + prompt_2 + sent_2 + prompt_3 + + def preprocess_function_train(examples): + sources = examples[source_column] + targets = examples[target_column] + + inputs = [prompt_process(source, target, prompt_2=task_prompt) for (source, target) in zip(sources, targets)] + + model_inputs = tokenizer( + inputs, + max_length=args.max_source_length + args.max_target_length, + padding=padding, + truncation=True, + return_tensors="pt", + ) + + labels = copy.deepcopy(model_inputs) + + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. + if padding == "max_length" and args.ignore_pad_token_for_loss: + # get the length of the target tokens. -1 to kick out the token + target_tokens = tokenizer(targets, padding=False) + target_len = [len(label) - 1 for label in target_tokens["input_ids"]] + + # don't calculate the loss from source and padding (left padding) + for i in range(len(labels["input_ids"])): + labels["input_ids"][i, : -target_len[i]] = -100 + + model_inputs["labels"] = labels["input_ids"] + return model_inputs + + def preprocess_function_test(examples): + sources = examples[source_column] + labels = examples[target_column] + + inputs = [source + task_prompt for source in sources] + + model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True) + labels = tokenizer(labels, max_length=args.max_target_length, padding=padding, truncation=True) + + model_inputs["labels"] = labels["input_ids"] + + return model_inputs + + with accelerator.main_process_first(): + train_dataset = raw_datasets["train"].map( + preprocess_function_train, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + 
load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on training dataset", + ) + + eval_dataset = raw_datasets["test"].map( + preprocess_function_test, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on test dataset", + ) + + # Log a few random samples from the set: + for index in random.sample(range(len(train_dataset)), 2): + logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + for index in random.sample(range(len(eval_dataset)), 2): + logger.info(f"Sample {index} of the validation set: {eval_dataset[index]}.") + + # DataLoaders creation: + train_dataloader = DataLoader( + train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size + ) + eval_dataloader = DataLoader( + eval_dataset, collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size + ) + + # Optimizer + # Split weights in two groups, one with weight decay and the other not. + no_decay = ["bias", "layer_norm.weight"] + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and "lora" in n], + "weight_decay": args.weight_decay, + }, + { + "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], + "weight_decay": 0.0, + }, + ] + optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) + + # Scheduler and math around the number of training steps. 
+ overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + ) + + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, eval_dataloader, lr_scheduler + ) + + # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties. + if accelerator.distributed_type == DistributedType.TPU: + model.tie_weights() + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # Figure out how many steps we should save the Accelerator states + checkpointing_steps = args.checkpointing_steps + if checkpointing_steps is not None and checkpointing_steps.isdigit(): + checkpointing_steps = int(checkpointing_steps) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. 
+ if args.with_tracking: + experiment_config = vars(args) + # TensorBoard cannot log Enums, need the raw value + experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value + accelerator.init_trackers("clm_no_trainer", experiment_config) + + # Train! + total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + # Only show the progress bar once on each machine. + progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) + completed_steps = 0 + starting_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": + checkpoint_path = args.resume_from_checkpoint + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] + dirs.sort(key=os.path.getctime) + path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last + checkpoint_path = path + path = os.path.basename(checkpoint_path) + + accelerator.print(f"Resumed from checkpoint: {checkpoint_path}") + accelerator.load_state(path) + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] + + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = 
None + completed_steps = starting_epoch * num_update_steps_per_epoch + else: + # need to multiply `gradient_accumulation_steps` to reflect real steps + resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + completed_steps = resume_step // args.gradient_accumulation_steps + + # update the progress_bar if load from checkpoint + progress_bar.update(completed_steps) + + for epoch in range(starting_epoch, args.num_train_epochs): + model.train() + if args.with_tracking: + total_loss = 0 + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + # We keep track of the loss at each epoch + if args.with_tracking: + total_loss += loss.detach().float() + accelerator.backward(loss) + if completed_steps % 50: + accelerator.print(f"Epoch: {epoch} | Step: {completed_steps} | Loss: {loss}") + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + completed_steps += 1 + + if isinstance(checkpointing_steps, int): + if completed_steps % checkpointing_steps == 0: + output_dir = f"step_{completed_steps}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + if completed_steps >= args.max_train_steps: + break + + model.eval() + gen_kwargs = { + "max_new_tokens": args.max_target_length, + "temperature": args.temperature, + "top_k": 
args.k, + "top_p": args.p, + "do_sample": True, + } + ans_pred_list = [] + ans_gold_list = [] + for step, batch in enumerate(eval_dataloader): + with torch.no_grad(): + gen_kwargs["input_ids"] = batch["input_ids"] + gen_kwargs["attention_mask"] = batch["attention_mask"] + generated_tokens = accelerator.unwrap_model(model).generate(**gen_kwargs) + + pred_tokens = generated_tokens[:, args.max_source_length :] + pred_tokens = accelerator.pad_across_processes(pred_tokens, dim=1, pad_index=tokenizer.pad_token_id) + gold_tokens = batch["labels"] + + if not args.pad_to_max_length: + # If we did not pad to max length, we need to pad the labels too + gold_tokens = accelerator.pad_across_processes( + batch["labels"], dim=1, pad_index=tokenizer.pad_token_id + ) + + pred_tokens, gold_tokens = accelerator.gather_for_metrics((pred_tokens, gold_tokens)) + pred_tokens, gold_tokens = pred_tokens.cpu().numpy(), gold_tokens.cpu().numpy() + + if isinstance(pred_tokens, tuple): + pred_tokens = pred_tokens[0] + decoded_pred = tokenizer.batch_decode(pred_tokens, skip_special_tokens=True) + decoded_gold = tokenizer.batch_decode(gold_tokens, skip_special_tokens=True) + + # Extract the numbers in sentences + accelerator.print(decoded_pred) + ans_pred_list += [extract_answer_number(sentence_pred) for sentence_pred in decoded_pred] + ans_gold_list += [extract_answer_number(sentence_gold) for sentence_gold in decoded_gold] + + accelerator.print(ans_pred_list) + accelerator.print(ans_gold_list) + accuracy = compute_accuracy(ans_gold_list, ans_pred_list) + + logger.info(f"epoch {epoch}: accuracy: {accuracy}") + + if args.with_tracking: + accelerator.log( + { + "accuracy": accuracy, + "train_loss": total_loss.item() / len(train_dataloader), + "epoch": epoch, + "step": completed_steps, + }, + step=completed_steps, + ) + + if args.push_to_hub and epoch < args.num_train_epochs - 1: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + 
unwrapped_model.save_pretrained( + args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save + ) + if accelerator.is_main_process: + tokenizer.save_pretrained(args.output_dir) + api.upload_folder( + repo_id=repo_id, + folder_path=args.output_dir, + commit_message=f"Training in progress epoch {epoch}", + run_as_future=True, + ) + + if args.checkpointing_steps == "epoch": + output_dir = f"epoch_{epoch}" + if args.output_dir is not None: + output_dir = os.path.join(args.output_dir, output_dir) + accelerator.save_state(output_dir) + + if args.with_tracking: + accelerator.end_training() + + if args.output_dir is not None: + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained( + args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save + ) + if accelerator.is_main_process: + tokenizer.save_pretrained(args.output_dir) + if args.push_to_hub: + api.upload_folder( + repo_id=repo_id, + folder_path=args.output_dir, + commit_message="End of training", + ) + + +PATTERN_NUMBER = re.compile(r"-?\d+\.?\d*") + + +def extract_answer_number(sentence: str) -> float: + sentence = sentence.replace(",", "") + pred = PATTERN_NUMBER.findall(sentence) + if not pred: + return float("inf") + segment = sentence.split("The final answer is ") + if len(segment) > 1: + pred_answer = segment[1] + pred_answer = PATTERN_NUMBER.findall(pred_answer) + if len(pred_answer) > 0: + pred_answer = pred_answer[0] + else: + pred_answer = float(pred[-1]) + else: + pred_answer = float(pred[-1]) + + if isinstance(pred_answer, str): + try: + pred_answer = float(pred_answer) + except ValueError: + pred_answer = float("inf") + return pred_answer + + +def compute_accuracy(pred: list, gold: list): + acc = 0.0 + for p, g in zip(pred, gold): + if p == g: + acc += 1 + + return acc / len(pred) + + +if __name__ == "__main__": + main() diff --git 
a/peft/examples/lora_dreambooth/colab_notebook.ipynb b/peft/examples/lora_dreambooth/colab_notebook.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..91ac558aaf6728e2b9d0e501d99cbed0f053c3a5 --- /dev/null +++ b/peft/examples/lora_dreambooth/colab_notebook.ipynb @@ -0,0 +1,54 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kdOhtpergLCQ" + }, + "outputs": [], + "source": [ + "!git clone https://huggingface.co/spaces/smangrul/peft-lora-sd-dreambooth" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_LuGk9mihPx7" + }, + "outputs": [], + "source": [ + "%cd \"peft-lora-sd-dreambooth\"\n", + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BYKO8e5ElJOX" + }, + "outputs": [], + "source": [ + "!python colab.py" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [] + }, + "gpuClass": "premium", + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/peft/examples/lora_dreambooth/convert_kohya_ss_sd_lora_to_peft.py b/peft/examples/lora_dreambooth/convert_kohya_ss_sd_lora_to_peft.py new file mode 100644 index 0000000000000000000000000000000000000000..a691ba7d053d83f64611db2763c5717d29abe7d3 --- /dev/null +++ b/peft/examples/lora_dreambooth/convert_kohya_ss_sd_lora_to_peft.py @@ -0,0 +1,175 @@ +import argparse +import os +from collections import Counter +from dataclasses import dataclass +from typing import Optional + +import safetensors +import torch +from diffusers import UNet2DConditionModel +from transformers import CLIPTextModel + +from peft import LoraConfig, get_peft_model, get_peft_model_state_dict, set_peft_model_state_dict + + +# Default kohya_ss LoRA replacement modules +# 
https://github.com/kohya-ss/sd-scripts/blob/c924c47f374ac1b6e33e71f82948eb1853e2243f/networks/lora.py#L661 +UNET_TARGET_REPLACE_MODULE = ["Transformer2DModel", "Attention"] +UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 = ["ResnetBlock2D", "Downsample2D", "Upsample2D"] +TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPMLP"] +LORA_PREFIX_UNET = "lora_unet" +LORA_PREFIX_TEXT_ENCODER = "lora_te" + + +@dataclass +class LoRAInfo: + kohya_key: str + peft_key: str + alpha: Optional[float] = None + rank: Optional[int] = None + lora_A: Optional[torch.Tensor] = None + lora_B: Optional[torch.Tensor] = None + + def peft_state_dict(self) -> dict[str, torch.Tensor]: + if self.lora_A is None or self.lora_B is None: + raise ValueError("At least one of lora_A or lora_B is None, they must both be provided") + return {f"{peft_key}.lora_A.weight": self.lora_A, f"{peft_key}.lora_B.weight": self.lora_A} + + +def construct_peft_loraconfig(info: dict[str, LoRAInfo]) -> LoraConfig: + """Constructs LoraConfig from data extracted from kohya checkpoint + + Args: + info (Dict[str, LoRAInfo]): Information extracted from kohya checkpoint + + Returns: + LoraConfig: config for constructing LoRA + """ + + # Unpack all ranks and alphas + ranks = {x[0]: x[1].rank for x in info.items()} + alphas = {x[0]: x[1].alpha or x[1].rank for x in info.items()} + + # Determine which modules needs to be transformed + target_modules = list(info.keys()) + + # Determine most common rank and alpha + r = Counter(ranks.values()).most_common(1)[0] + lora_alpha = Counter(alphas.values()).most_common(1)[0] + + # Determine which modules have different rank and alpha + rank_pattern = dict(filter(lambda x: x[1] != r, ranks.items())) + alpha_pattern = dict(filter(lambda x: x[1] != lora_alpha, alphas.items())) + + config = LoraConfig( + r=r, + lora_alpha=lora_alpha, + target_modules=target_modules, + lora_dropout=0.0, + bias="none", + init_lora_weights=False, + rank_pattern=rank_pattern, + alpha_pattern=alpha_pattern, + ) + 
+ return config + + +def combine_peft_state_dict(info: dict[str, LoRAInfo]) -> dict[str, torch.Tensor]: + result = {} + for key_name, key_info in info.items(): + result[f"base_model.model.{key_name}.lora_A.weight"] = key_info.lora_A + result[f"base_model.model.{key_name}.lora_B.weight"] = key_info.lora_B + return result + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--sd_checkpoint", default=None, type=str, required=True, help="SD checkpoint to use") + + parser.add_argument( + "--kohya_lora_path", default=None, type=str, required=True, help="Path to kohya_ss trained LoRA" + ) + + parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") + + parser.add_argument("--half", action="store_true", help="Save weights in half precision.") + args = parser.parse_args() + + # Load all models that we need to add adapter to + text_encoder = CLIPTextModel.from_pretrained(args.sd_checkpoint, subfolder="text_encoder") + unet = UNet2DConditionModel.from_pretrained(args.sd_checkpoint, subfolder="unet") + + # Construct possible mapping from kohya keys to peft keys + models_keys = {} + for model, model_key, model_name in [ + (text_encoder, LORA_PREFIX_TEXT_ENCODER, "text_encoder"), + (unet, LORA_PREFIX_UNET, "unet"), + ]: + models_keys.update( + { + f"{model_key}.{peft_key}".replace(".", "_"): peft_key + for peft_key in (x[0] for x in model.named_modules()) + } + ) + + # Store conversion info (model_type -> peft_key -> LoRAInfo) + lora_info: dict[str, dict[str, LoRAInfo]] = { + "text_encoder": {}, + "unet": {}, + } + + # Open kohya_ss checkpoint + with safetensors.safe_open(args.kohya_lora_path, framework="pt", device="cpu") as f: + # Extract information about LoRA structure + metadata = f.metadata() + + # Iterate through available info and unpack all the values + for key in f.keys(): + kohya_key, kohya_type = key.split(".")[:2] + + # Find which model this key belongs to + if 
kohya_key.startswith(LORA_PREFIX_TEXT_ENCODER): + model_type = "text_encoder" + elif kohya_key.startswith(LORA_PREFIX_UNET): + model_type = "unet" + else: + raise ValueError(f"Cannot determine model for key: {key}") + + # Find corresponding peft key + if kohya_key not in models_keys: + raise ValueError(f"Cannot find corresponding key for diffusers/transformers model: {kohya_key}") + peft_key = models_keys[kohya_key] + + if peft_key not in lora_info[model_type]: + lora_info[model_type][peft_key] = LoRAInfo(kohya_key=kohya_key, peft_key=peft_key) + + if kohya_type == "alpha": + lora_info[model_type][peft_key].alpha = f.get_tensor(key).item() + elif kohya_type == "lora_down": + tensor = f.get_tensor(key) + lora_info[model_type][peft_key].lora_A = tensor + lora_info[model_type][peft_key].rank = tensor.shape[0] + elif kohya_type == "lora_up": + tensor = f.get_tensor(key) + lora_info[model_type][peft_key].lora_B = f.get_tensor(key) + lora_info[model_type][peft_key].rank = tensor.shape[1] + else: + raise ValueError(f"Unknown weight name in key: {key} - {kohya_type}") + + # Process each model + for model, model_name in [(text_encoder, "text_encoder"), (unet, "unet")]: + config = construct_peft_loraconfig(lora_info[model_name]) + model = get_peft_model(model, config) + + keys_peft = list(get_peft_model_state_dict(model).keys()) + keys_new = list(combine_peft_state_dict(lora_info[model_name]).keys()) + + set_peft_model_state_dict(model, combine_peft_state_dict(lora_info[model_name])) + + if args.half: + model.to(torch.float16) + + # Save model to disk + model.save_pretrained(os.path.join(args.dump_path, model_name)) diff --git a/peft/examples/lora_dreambooth/convert_peft_sd_lora_to_kohya_ss.py b/peft/examples/lora_dreambooth/convert_peft_sd_lora_to_kohya_ss.py new file mode 100644 index 0000000000000000000000000000000000000000..97f964844ebb79546cbe5fa8dc5d15e48dd11a74 --- /dev/null +++ b/peft/examples/lora_dreambooth/convert_peft_sd_lora_to_kohya_ss.py @@ -0,0 +1,100 @@ 
+import argparse +import os + +import torch +from diffusers import UNet2DConditionModel +from safetensors.torch import save_file +from transformers import CLIPTextModel + +from peft import PeftModel, get_peft_model_state_dict + + +# Default kohya_ss LoRA replacement modules +# https://github.com/kohya-ss/sd-scripts/blob/c924c47f374ac1b6e33e71f82948eb1853e2243f/networks/lora.py#L664 +LORA_PREFIX_UNET = "lora_unet" +LORA_PREFIX_TEXT_ENCODER = "lora_te" +LORA_ADAPTER_NAME = "default" + + +def get_module_kohya_state_dict( + module: PeftModel, prefix: str, dtype: torch.dtype, adapter_name: str = LORA_ADAPTER_NAME +) -> dict[str, torch.Tensor]: + kohya_ss_state_dict = {} + for peft_key, weight in get_peft_model_state_dict(module, adapter_name=adapter_name).items(): + kohya_key = peft_key.replace("base_model.model", prefix) + kohya_key = kohya_key.replace("lora_A", "lora_down") + kohya_key = kohya_key.replace("lora_B", "lora_up") + kohya_key = kohya_key.replace(".", "_", kohya_key.count(".") - 2) + kohya_ss_state_dict[kohya_key] = weight.to(dtype) + + # Set alpha parameter + if "lora_down" in kohya_key: + alpha_key = f"{kohya_key.split('.')[0]}.alpha" + kohya_ss_state_dict[alpha_key] = torch.tensor(module.peft_config[adapter_name].lora_alpha).to(dtype) + + return kohya_ss_state_dict + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--sd_checkpoint", + default=None, + type=str, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + + parser.add_argument( + "--sd_checkpoint_revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + + parser.add_argument("--peft_lora_path", default=None, type=str, required=True, help="Path to peft trained LoRA") + + parser.add_argument( + "--dump_path", + default=None, + type=str, + required=True, + help="Path to the output safetensors file for use with webui.", + 
) + + parser.add_argument("--half", action="store_true", help="Save weights in half precision.") + args = parser.parse_args() + + # Store kohya_ss state dict + kohya_ss_state_dict = {} + dtype = torch.float16 if args.half else torch.float32 + + # Load Text Encoder LoRA model + text_encoder_peft_lora_path = os.path.join(args.peft_lora_path, "text_encoder") + if os.path.exists(text_encoder_peft_lora_path): + text_encoder = CLIPTextModel.from_pretrained( + args.sd_checkpoint, subfolder="text_encoder", revision=args.sd_checkpoint_revision + ) + text_encoder = PeftModel.from_pretrained( + text_encoder, text_encoder_peft_lora_path, adapter_name=LORA_ADAPTER_NAME + ) + kohya_ss_state_dict.update( + get_module_kohya_state_dict(text_encoder, LORA_PREFIX_TEXT_ENCODER, dtype, LORA_ADAPTER_NAME) + ) + + # Load UNet LoRA model + unet_peft_lora_path = os.path.join(args.peft_lora_path, "unet") + if os.path.exists(unet_peft_lora_path): + unet = UNet2DConditionModel.from_pretrained( + args.sd_checkpoint, subfolder="unet", revision=args.sd_checkpoint_revision + ) + unet = PeftModel.from_pretrained(unet, unet_peft_lora_path, adapter_name=LORA_ADAPTER_NAME) + kohya_ss_state_dict.update(get_module_kohya_state_dict(unet, LORA_PREFIX_UNET, dtype, LORA_ADAPTER_NAME)) + + # Save state dict + save_file( + kohya_ss_state_dict, + args.dump_path, + ) diff --git a/peft/examples/lora_dreambooth/lora_dreambooth_inference.ipynb b/peft/examples/lora_dreambooth/lora_dreambooth_inference.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..a94358c6f11927f8f3006e8e8f8cdd4a0791535d --- /dev/null +++ b/peft/examples/lora_dreambooth/lora_dreambooth_inference.ipynb @@ -0,0 +1,497 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "acab479f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "===================================BUG REPORT===================================\n", + "Welcome to bitsandbytes. 
For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n", + "================================================================================\n" + ] + } + ], + "source": [ + "import argparse\n", + "import gc\n", + "import hashlib\n", + "import itertools\n", + "import logging\n", + "import math\n", + "import os\n", + "import threading\n", + "import warnings\n", + "from pathlib import Path\n", + "from typing import Optional\n", + "import psutil\n", + "import json\n", + "\n", + "import torch\n", + "import torch.nn.functional as F\n", + "import torch.utils.checkpoint\n", + "from torch.utils.data import Dataset\n", + "\n", + "import datasets\n", + "import diffusers\n", + "import transformers\n", + "from accelerate import Accelerator\n", + "from accelerate.logging import get_logger\n", + "from accelerate.utils import set_seed\n", + "from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel\n", + "from diffusers import DDPMScheduler, PNDMScheduler, StableDiffusionPipeline\n", + "from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker\n", + "from diffusers.optimization import get_scheduler\n", + "from diffusers.utils import check_min_version\n", + "from diffusers.utils.import_utils import is_xformers_available\n", + "from huggingface_hub import HfFolder, Repository, whoami\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "from tqdm.auto import tqdm\n", + "from transformers import AutoTokenizer, PretrainedConfig, CLIPFeatureExtractor\n", + "from peft import PeftModel, LoraConfig, get_peft_model_state_dict, set_peft_model_state_dict\n", + "\n", + "# Will error if the minimal version of diffusers is not installed. 
def get_lora_sd_pipeline(
    ckpt_dir, base_model_name_or_path=None, dtype=torch.float16, device="auto", adapter_name="default"
):
    """Load a Stable Diffusion pipeline and attach the LoRA adapters stored in ``ckpt_dir``.

    Args:
        ckpt_dir: Directory containing a ``unet`` sub-directory and, optionally,
            a ``text_encoder`` sub-directory with PEFT LoRA checkpoints.
        base_model_name_or_path: Base model to load. When ``None`` it is read
            from the text encoder adapter's ``LoraConfig`` if that adapter exists.
        dtype: Torch dtype passed to the pipeline as ``torch_dtype``.
        device: Target device; ``"auto"`` uses the current torch accelerator
            type when ``torch.accelerator`` exists, otherwise ``"cuda"``.
        adapter_name: Name under which the adapters are registered.

    Returns:
        The pipeline, moved to ``device``, with PEFT-wrapped UNet (and text
        encoder when a text encoder adapter is present).

    Raises:
        ValueError: If no base model was given and none could be read from the
            text encoder adapter config.
    """
    if device == "auto":
        device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"

    unet_sub_dir = os.path.join(ckpt_dir, "unet")
    text_encoder_sub_dir = os.path.join(ckpt_dir, "text_encoder")
    has_text_encoder_adapter = os.path.exists(text_encoder_sub_dir)

    if has_text_encoder_adapter and base_model_name_or_path is None:
        config = LoraConfig.from_pretrained(text_encoder_sub_dir)
        base_model_name_or_path = config.base_model_name_or_path

    if base_model_name_or_path is None:
        raise ValueError("Please specify the base model name or path")

    pipe = StableDiffusionPipeline.from_pretrained(
        base_model_name_or_path, torch_dtype=dtype, requires_safety_checker=False
    ).to(device)
    pipe.unet = PeftModel.from_pretrained(pipe.unet, unet_sub_dir, adapter_name=adapter_name)

    if has_text_encoder_adapter:
        pipe.text_encoder = PeftModel.from_pretrained(
            pipe.text_encoder, text_encoder_sub_dir, adapter_name=adapter_name
        )

    if dtype in (torch.float16, torch.bfloat16):
        # Cast to the dtype that was actually requested. The previous `.half()`
        # call forced float16 even when bfloat16 was asked for.
        pipe.unet.to(dtype)
        pipe.text_encoder.to(dtype)

    # Move again so the freshly attached adapter weights land on the device too.
    pipe.to(device)
    return pipe
def merging_lora_with_base(pipe, ckpt_dir, adapter_name="default"):
    """Merge the LoRA adapter ``adapter_name`` into the pipeline's base weights.

    The UNet is always merged. The text encoder is merged only when
    ``ckpt_dir`` contains a ``text_encoder`` sub-directory. A component that is
    not yet a ``PeftModel`` is first wrapped by loading the adapter from its
    sub-directory; otherwise the named adapter is activated before merging.

    Returns:
        The same ``pipe`` object with merged, unwrapped components.
    """

    def _merge_component(attr, adapter_dir):
        # Activate (or load) the adapter on pipe.<attr>, then fold it in.
        component = getattr(pipe, attr)
        if isinstance(component, PeftModel):
            component.set_adapter(adapter_name)
        else:
            wrapped = PeftModel.from_pretrained(component, adapter_dir, adapter_name=adapter_name)
            setattr(pipe, attr, wrapped)
        setattr(pipe, attr, getattr(pipe, attr).merge_and_unload())

    unet_dir = os.path.join(ckpt_dir, "unet")
    text_encoder_dir = os.path.join(ckpt_dir, "text_encoder")

    _merge_component("unet", unet_dir)
    if os.path.exists(text_encoder_dir):
        _merge_component("text_encoder", text_encoder_dir)

    return pipe
"Fetching 19 files: 0%| | 0/19 [00:00" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt = \"sks dog playing fetch in the park\"\n", + "negative_prompt = \"low quality, blurry, unfinished\"\n", + "image = pipe(prompt, num_inference_steps=50, guidance_scale=7, negative_prompt=negative_prompt).images[0]\n", + "image" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1e1d1f30", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 7.74 ms, sys: 0 ns, total: 7.74 ms\n", + "Wall time: 7.31 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "set_adapter(pipe, adapter_name=\"toy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "0c50c03d", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4f978437709b44b391744cf972415027", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt = \"narendra modi rendered in the style of <1>\"\n", + "negative_prompt = \"low quality, blurry, unfinished\"\n", + "image = pipe(prompt, num_inference_steps=50, guidance_scale=7, negative_prompt=negative_prompt).images[0]\n", + "image" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "e3b9a681", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8871731127904802ba6123128ba22ecf", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "set_adapter(pipe, adapter_name=\"dog\")\n", + "prompt = \"sks dog in a big red bucket\"\n", + "negative_prompt = \"low quality, blurry, unfinished\"\n", + "image = 
pipe(prompt, num_inference_steps=50, guidance_scale=7, negative_prompt=negative_prompt).images[0]\n", + "image" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a659ca6e", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b701336557e4417d8d39d500699c298b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "set_adapter(pipe, adapter_name=\"toy\")\n", + "prompt = \"superman rendered in the style of <1>, close up potrait\"\n", + "negative_prompt = \"low quality, blurry, unfinished\"\n", + "image = pipe(prompt, num_inference_steps=50, guidance_scale=7, negative_prompt=negative_prompt).images[0]\n", + "image" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "1f0ecb40", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b47385e953b14c768d82bea776129904", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/50 [00:00" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "set_adapter(pipe, adapter_name=\"toy_dog\")\n", + "prompt = \"sks dog rendered in the style of <1>, close up potrait, 4K HD\"\n", + "negative_prompt = \"low quality, blurry, unfinished\"\n", + "image = pipe(prompt, num_inference_steps=50, guidance_scale=7, negative_prompt=negative_prompt).images[0]\n", + "image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29720cdb", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + 
"name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/lora_dreambooth/requirements.txt b/peft/examples/lora_dreambooth/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec1a3dcd9922d0a01cb636f5cd0c57f6aac5b3d3 --- /dev/null +++ b/peft/examples/lora_dreambooth/requirements.txt @@ -0,0 +1,11 @@ +transformers +accelerate +evaluate +tqdm +datasets +diffusers +Pillow +torchvision +huggingface_hub +safetensors +wandb \ No newline at end of file diff --git a/peft/examples/lora_dreambooth/train_dreambooth.py b/peft/examples/lora_dreambooth/train_dreambooth.py new file mode 100644 index 0000000000000000000000000000000000000000..2bbc39cc758701e9eeac77061034e7e0417c915e --- /dev/null +++ b/peft/examples/lora_dreambooth/train_dreambooth.py @@ -0,0 +1,1107 @@ +import argparse +import gc +import hashlib +import itertools +import logging +import math +import os +import threading +import warnings +from contextlib import nullcontext +from pathlib import Path + +import datasets +import diffusers +import numpy as np +import psutil +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import set_seed +from diffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + UNet2DConditionModel, +) +from diffusers.optimization import get_scheduler +from diffusers.utils import check_min_version +from diffusers.utils.import_utils import is_xformers_available +from huggingface_hub import HfApi +from PIL import Image +from torch.utils.data import Dataset +from torchvision import transforms +from tqdm.auto import tqdm +from 
def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
    """Return the text encoder class named in a checkpoint's ``text_encoder`` config.

    Args:
        pretrained_model_name_or_path: Model path or identifier passed to
            ``PretrainedConfig.from_pretrained``.
        revision: Revision passed to ``PretrainedConfig.from_pretrained``.

    Returns:
        ``CLIPTextModel`` or ``RobertaSeriesModelWithTransformation``, chosen by
        the first entry of the config's ``architectures`` list.

    Raises:
        ValueError: If that architecture is neither of the two supported ones.
    """
    config = PretrainedConfig.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="text_encoder",
        revision=revision,
    )
    architecture = config.architectures[0]

    # The model classes are imported lazily, only for the branch that needs them.
    if architecture == "CLIPTextModel":
        from transformers import CLIPTextModel

        return CLIPTextModel

    if architecture == "RobertaSeriesModelWithTransformation":
        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation

        return RobertaSeriesModelWithTransformation

    raise ValueError(f"{architecture} is not supported.")
containing the training data of class images.", + ) + parser.add_argument( + "--instance_prompt", + type=str, + default=None, + required=True, + help="The prompt with identifier specifying the instance", + ) + parser.add_argument( + "--class_prompt", + type=str, + default=None, + help="The prompt to specify images in the same class as provided instance images.", + ) + parser.add_argument( + "--with_prior_preservation", + default=False, + action="store_true", + help="Flag to add prior preservation loss.", + ) + parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") + parser.add_argument( + "--num_class_images", + type=int, + default=100, + help=( + "Minimal class images for prior preservation loss. If there are not enough images already present in" + " class_data_dir, additional images will be sampled with class_prompt." + ), + ) + parser.add_argument( + "--validation_prompt", + type=str, + default=None, + help="A prompt that is used during validation to verify that the model is learning.", + ) + parser.add_argument( + "--num_validation_images", + type=int, + default=4, + help="Number of images that should be generated during validation with `validation_prompt`.", + ) + parser.add_argument( + "--validation_steps", + type=int, + default=100, + help=( + "Run dreambooth validation every X steps. Dreambooth validation consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`." 
+ ), + ) + parser.add_argument( + "--output_dir", + type=str, + default="text-inversion-model", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution" + ) + parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder") + + # lora args + parser.add_argument("--use_lora", action="store_true", help="Whether to use Lora for parameter efficient tuning") + parser.add_argument("--lora_r", type=int, default=8, help="Lora rank, only used if use_lora is True") + parser.add_argument("--lora_alpha", type=int, default=32, help="Lora alpha, only used if use_lora is True") + parser.add_argument("--lora_dropout", type=float, default=0.0, help="Lora dropout, only used if use_lora is True") + parser.add_argument( + "--lora_bias", + type=str, + default="none", + help="Bias type for Lora. 
Can be 'none', 'all' or 'lora_only', only used if use_lora is True", + ) + parser.add_argument( + "--lora_text_encoder_r", + type=int, + default=8, + help="Lora rank for text encoder, only used if `use_lora` and `train_text_encoder` are True", + ) + parser.add_argument( + "--lora_text_encoder_alpha", + type=int, + default=32, + help="Lora alpha for text encoder, only used if `use_lora` and `train_text_encoder` are True", + ) + parser.add_argument( + "--lora_text_encoder_dropout", + type=float, + default=0.0, + help="Lora dropout for text encoder, only used if `use_lora` and `train_text_encoder` are True", + ) + parser.add_argument( + "--lora_text_encoder_bias", + type=str, + default="none", + help="Bias type for Lora. Can be 'none', 'all' or 'lora_only', only used if use_lora and `train_text_encoder` are True", + ) + + parser.add_argument( + "--num_dataloader_workers", type=int, default=1, help="Num of workers for the training dataloader." + ) + + parser.add_argument( + "--no_tracemalloc", + default=False, + action="store_true", + help="Flag to stop memory allocation tracing during training. This could speed up training on Windows.", + ) + + parser.add_argument( + "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument( + "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." + ) + parser.add_argument("--num_train_epochs", type=int, default=1) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final" + " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" + " training using `--resume_from_checkpoint`." 
+ ), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' + ), + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-6, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of accelerators, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--lr_num_cycles", + type=int, + default=1, + help="Number of hard resets of the lr in cosine_with_restarts scheduler.", + ) + parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." 
+ ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' 
+ ), + ) + parser.add_argument( + "--wandb_key", + type=str, + default=None, + help=("If report to option is set to wandb, api-key for wandb used for login to wandb "), + ) + parser.add_argument( + "--wandb_project_name", + type=str, + default=None, + help=("If report to option is set to wandb, project name in wandb for log tracking "), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU or Intel XPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--prior_generation_precision", + type=str, + default=None, + choices=["no", "fp32", "fp16", "bf16"], + help=( + "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU or Intel XPU. Default to fp16 if a GPU is available else fp32." + ), + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
# This context manager is used to track the peak memory usage of the process
class TorchTracemalloc:
    """Context manager recording device and CPU memory deltas for the wrapped code.

    On exit the following attributes are available (sizes converted with ``b2mb``):

    * ``used`` / ``peaked``: change in allocated device memory, and peak
      allocated device memory relative to the value at entry.
    * ``cpu_used`` / ``cpu_peaked``: change in the process's resident set size,
      and the peak RSS sampled by a background thread, relative to entry.
    """

    def __enter__(self):
        gc.collect()
        self.device_type = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
        # Fall back to torch.cuda when torch has no module named after the device type.
        self.device_module = getattr(torch, self.device_type, torch.cuda)
        self.device_module.empty_cache()
        self.device_module.reset_peak_memory_stats()  # reset the peak gauge to zero
        self.begin = self.device_module.memory_allocated()
        self.process = psutil.Process()

        self.cpu_begin = self.cpu_mem_used()
        # Create the attribute before the thread starts so that __exit__ can
        # never observe a missing `cpu_peak`.
        self.cpu_peak = -1
        self.peak_monitoring = True
        self._peak_monitor_thread = threading.Thread(target=self.peak_monitor_func)
        self._peak_monitor_thread.daemon = True
        self._peak_monitor_thread.start()
        return self

    def cpu_mem_used(self):
        """Return the resident set size (in bytes) of the current process."""
        return self.process.memory_info().rss

    def peak_monitor_func(self):
        """Busy-poll the process RSS and keep the maximum in ``cpu_peak`` until stopped."""
        self.cpu_peak = -1

        while True:
            self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak)

            # Deliberately no sleep here: sleeping between samples would miss
            # short-lived peaks.

            if not self.peak_monitoring:
                break

    def __exit__(self, *exc):
        # Signal the monitor to stop and wait for it, so `cpu_peak` is final
        # and no longer written concurrently when it is read below.
        self.peak_monitoring = False
        self._peak_monitor_thread.join()

        gc.collect()
        self.device_module.empty_cache()
        self.end = self.device_module.memory_allocated()
        self.peak = self.device_module.max_memory_allocated()
        self.used = b2mb(self.end - self.begin)
        self.peaked = b2mb(self.peak - self.begin)

        self.cpu_end = self.cpu_mem_used()
        self.cpu_used = b2mb(self.cpu_end - self.cpu_begin)
        self.cpu_peaked = b2mb(self.cpu_peak - self.cpu_begin)
def collate_fn(examples, with_prior_preservation=False):
    """Collate dataset examples into a single training batch.

    Args:
        examples: Sequence of dicts holding ``instance_prompt_ids`` and
            ``instance_images`` (plus ``class_prompt_ids`` and ``class_images``
            when prior preservation is used).
        with_prior_preservation: When true, the class examples are appended
            after the instance examples so both go through one forward pass.

    Returns:
        Dict with ``input_ids`` (prompt ids concatenated along dim 0) and
        ``pixel_values`` (stacked images as contiguous float tensors).
    """
    # (prompt-id key, image key) pairs to gather, instance data first.
    field_pairs = [("instance_prompt_ids", "instance_images")]
    if with_prior_preservation:
        field_pairs.append(("class_prompt_ids", "class_images"))

    token_chunks = []
    image_chunks = []
    for ids_key, images_key in field_pairs:
        token_chunks.extend(sample[ids_key] for sample in examples)
        image_chunks.extend(sample[images_key] for sample in examples)

    stacked_images = torch.stack(image_chunks)
    stacked_images = stacked_images.to(memory_format=torch.contiguous_format).float()
    joined_ids = torch.cat(token_chunks, dim=0)

    return {
        "input_ids": joined_ids,
        "pixel_values": stacked_images,
    }
+ + def __init__(self, prompt, num_samples): + self.prompt = prompt + self.num_samples = num_samples + + def __len__(self): + return self.num_samples + + def __getitem__(self, index): + example = {} + example["prompt"] = self.prompt + example["index"] = index + return example + + +def main(args): + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_dir=logging_dir, + ) + if args.report_to == "wandb": + import wandb + + wandb.login(key=args.wandb_key) + wandb.init(project=args.wandb_project_name) + # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate + # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models. + # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate. + if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1: + raise ValueError( + "Gradient accumulation is not supported when training the text encoder in distributed training. " + "Please set gradient_accumulation_steps to 1. This feature will be supported in the future." + ) + + # Make one log on every process with the configuration for debugging. 
+ logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Generate class images if prior preservation is enabled. + if args.with_prior_preservation: + class_images_dir = Path(args.class_data_dir) + if not class_images_dir.exists(): + class_images_dir.mkdir(parents=True) + cur_class_images = len(list(class_images_dir.iterdir())) + + if cur_class_images < args.num_class_images: + torch_dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32 + if args.prior_generation_precision == "fp32": + torch_dtype = torch.float32 + elif args.prior_generation_precision == "fp16": + torch_dtype = torch.float16 + elif args.prior_generation_precision == "bf16": + torch_dtype = torch.bfloat16 + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + torch_dtype=torch_dtype, + safety_checker=None, + revision=args.revision, + ) + pipeline.set_progress_bar_config(disable=True) + + num_new_images = args.num_class_images - cur_class_images + logger.info(f"Number of class images to sample: {num_new_images}.") + + sample_dataset = PromptDataset(args.class_prompt, num_new_images) + sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) + + sample_dataloader = accelerator.prepare(sample_dataloader) + pipeline.to(accelerator.device) + + for example in tqdm( + sample_dataloader, desc="Generating class images", 
disable=not accelerator.is_local_main_process + ): + images = pipeline(example["prompt"]).images + + for i, image in enumerate(images): + hash_image = hashlib.sha1(image.tobytes()).hexdigest() + image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" + image.save(image_filename) + + del pipeline + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + api = HfApi(token=args.hub_token) + + # Create repo (repo_name from args or inferred) + repo_name = args.hub_model_id + if repo_name is None: + repo_name = Path(args.output_dir).absolute().name + repo_id = api.create_repo(repo_name, exist_ok=True).repo_id + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + # Load the tokenizer + if args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False) + elif args.pretrained_model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="tokenizer", + revision=args.revision, + use_fast=False, + ) + + # import correct text encoder class + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision) + + # Load scheduler and models + noise_scheduler = DDPMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + num_train_timesteps=1000, + ) # DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + text_encoder = text_encoder_cls.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", 
revision=args.revision + ) + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision) + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision + ) + + if args.use_lora: + config = LoraConfig( + r=args.lora_r, + lora_alpha=args.lora_alpha, + target_modules=UNET_TARGET_MODULES, + lora_dropout=args.lora_dropout, + bias=args.lora_bias, + ) + unet = get_peft_model(unet, config) + unet.print_trainable_parameters() + print(unet) + + vae.requires_grad_(False) + if not args.train_text_encoder: + text_encoder.requires_grad_(False) + elif args.train_text_encoder and args.use_lora: + config = LoraConfig( + r=args.lora_text_encoder_r, + lora_alpha=args.lora_text_encoder_alpha, + target_modules=TEXT_ENCODER_TARGET_MODULES, + lora_dropout=args.lora_text_encoder_dropout, + bias=args.lora_text_encoder_bias, + ) + text_encoder = get_peft_model(text_encoder, config) + text_encoder.print_trainable_parameters() + print(text_encoder) + + if args.enable_xformers_memory_efficient_attention: + if accelerator.device.type == "xpu": + logger.warn("XPU hasn't support xformers yet, ignore it.") + elif is_xformers_available(): + unet.enable_xformers_memory_efficient_attention() + else: + raise ValueError("xformers is not available. 
Make sure it is installed correctly") + + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + # below fails when using lora so commenting it out + if args.train_text_encoder and not args.use_lora: + text_encoder.gradient_checkpointing_enable() + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if args.allow_tf32 and torch.cuda.is_available(): + torch.backends.cuda.matmul.allow_tf32 = True + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB accelerators + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." + ) + + optimizer_class = bnb.optim.AdamW8bit + else: + optimizer_class = torch.optim.AdamW + + # Optimizer creation + params_to_optimize = ( + itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters() + ) + optimizer = optimizer_class( + params_to_optimize, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + # Dataset and DataLoaders creation: + train_dataset = DreamBoothDataset( + instance_data_root=args.instance_data_dir, + instance_prompt=args.instance_prompt, + class_data_root=args.class_data_dir if args.with_prior_preservation else None, + class_prompt=args.class_prompt, + tokenizer=tokenizer, + size=args.resolution, + center_crop=args.center_crop, + ) + + train_dataloader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.train_batch_size, + shuffle=True, + collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation), + 
num_workers=args.num_dataloader_workers, + ) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + num_cycles=args.lr_num_cycles, + power=args.lr_power, + ) + + # Prepare everything with our `accelerator`. + if args.train_text_encoder: + unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, text_encoder, optimizer, train_dataloader, lr_scheduler + ) + else: + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, optimizer, train_dataloader, lr_scheduler + ) + + # For mixed precision training we cast the text_encoder and vae weights to half-precision + # as these models are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Move vae and text_encoder to device and cast to weight_dtype + vae.to(accelerator.device, dtype=weight_dtype) + if not args.train_text_encoder: + text_encoder.to(accelerator.device, dtype=weight_dtype) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. 
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initialize automatically on the main process. + if accelerator.is_main_process: + accelerator.init_trackers("dreambooth", config=vars(args)) + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num batches each epoch = {len(train_dataloader)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w.
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + global_step = 0 + first_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint != "latest": + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + path = dirs[-1] + accelerator.print(f"Resuming from checkpoint {path}") + accelerator.load_state(os.path.join(args.output_dir, path)) + global_step = int(path.split("-")[1]) + + resume_global_step = global_step * args.gradient_accumulation_steps + first_epoch = resume_global_step // num_update_steps_per_epoch + resume_step = resume_global_step % num_update_steps_per_epoch + + # Only show the progress bar once on each machine.
+ progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process) + progress_bar.set_description("Steps") + + for epoch in range(first_epoch, args.num_train_epochs): + unet.train() + if args.train_text_encoder: + text_encoder.train() + with TorchTracemalloc() if not args.no_tracemalloc else nullcontext() as tracemalloc: + for step, batch in enumerate(train_dataloader): + # Skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: + if step % args.gradient_accumulation_steps == 0: + progress_bar.update(1) + if args.report_to == "wandb": + accelerator.print(progress_bar) + continue + + with accelerator.accumulate(unet): + # Convert images to latent space + latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample() + latents = latents * 0.18215 + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device + ) + timesteps = timesteps.long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Get the text embedding for conditioning + encoder_hidden_states = text_encoder(batch["input_ids"])[0] + + # Predict the noise residual + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + # Get the target for loss depending on the prediction type + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + + if 
args.with_prior_preservation: + # Chunk the noise and model_pred into two parts and compute the loss on each part separately. + model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) + target, target_prior = torch.chunk(target, 2, dim=0) + + # Compute instance loss + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + + # Compute prior loss + prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") + + # Add the prior loss to the instance loss. + loss = loss + args.prior_loss_weight * prior_loss + else: + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + + accelerator.backward(loss) + if accelerator.sync_gradients: + params_to_clip = ( + itertools.chain(unet.parameters(), text_encoder.parameters()) + if args.train_text_encoder + else unet.parameters() + ) + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + if args.report_to == "wandb": + accelerator.print(progress_bar) + global_step += 1 + + # if global_step % args.checkpointing_steps == 0: + # if accelerator.is_main_process: + # save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + # accelerator.save_state(save_path) + # logger.info(f"Saved state to {save_path}") + + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + if ( + args.validation_prompt is not None + and (step + num_update_steps_per_epoch * epoch) % args.validation_steps == 0 + ): + logger.info( + f"Running validation... \n Generating {args.num_validation_images} images with prompt:" + f" {args.validation_prompt}." 
+ ) + # create pipeline + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + safety_checker=None, + revision=args.revision, + ) + # set `keep_fp32_wrapper` to True because we do not want to remove + # mixed precision hooks while we are still training + pipeline.unet = accelerator.unwrap_model(unet, keep_fp32_wrapper=True) + pipeline.text_encoder = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + # run inference + if args.seed is not None: + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) + else: + generator = None + images = [] + for _ in range(args.num_validation_images): + image = pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0] + images.append(image) + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + import wandb + + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") + for i, image in enumerate(images) + ] + } + ) + + del pipeline + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() + + if global_step >= args.max_train_steps: + break + + # Printing the accelerator memory usage details such as allocated memory, peak memory, and total memory usage + if not args.no_tracemalloc: + accelerator.print( + f"{accelerator.device.type.upper()} Memory before entering the train : {b2mb(tracemalloc.begin)}" + ) + accelerator.print( + f"{accelerator.device.type.upper()} Memory consumed at the end of the train (end-begin): {tracemalloc.used}" + ) + accelerator.print( + 
f"{accelerator.device.type.upper()} Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}" + ) + accelerator.print( + f"{accelerator.device.type.upper()} Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}" + ) + + accelerator.print(f"CPU Memory before entering the train : {b2mb(tracemalloc.cpu_begin)}") + accelerator.print(f"CPU Memory consumed at the end of the train (end-begin): {tracemalloc.cpu_used}") + accelerator.print(f"CPU Peak Memory consumed during the train (max-begin): {tracemalloc.cpu_peaked}") + accelerator.print( + f"CPU Total Peak Memory consumed during the train (max): {tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin)}" + ) + + # Create the pipeline using using the trained modules and save it. + accelerator.wait_for_everyone() + if accelerator.is_main_process: + if args.use_lora: + unwarpped_unet = accelerator.unwrap_model(unet) + unwarpped_unet.save_pretrained( + os.path.join(args.output_dir, "unet"), state_dict=accelerator.get_state_dict(unet) + ) + if args.train_text_encoder: + unwarpped_text_encoder = accelerator.unwrap_model(text_encoder) + unwarpped_text_encoder.save_pretrained( + os.path.join(args.output_dir, "text_encoder"), + state_dict=accelerator.get_state_dict(text_encoder), + ) + else: + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + unet=accelerator.unwrap_model(unet), + text_encoder=accelerator.unwrap_model(text_encoder), + revision=args.revision, + ) + pipeline.save_pretrained(args.output_dir) + + if args.push_to_hub: + api.upload_folder( + repo_id=repo_id, + folder_path=args.output_dir, + commit_message="End of training", + run_as_future=True, + ) + + accelerator.end_training() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/peft/examples/lorafa_finetune/README.md b/peft/examples/lorafa_finetune/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..432c93ad837c90684c0c5e58b6149372b77c7712 --- /dev/null +++ b/peft/examples/lorafa_finetune/README.md @@ -0,0 +1,121 @@ +# LoRA-FA: Memory-efficient Low-rank Adaptation for Large Language Models Fine-tuning + +## Introduction + +[LoRA-FA](https://huggingface.co/papers/2308.03303) is a novel parameter-efficient fine-tuning method that freezes the down-projection layer (matrix A) during the LoRA training process and thus leads to lower accelerator memory consumption by eliminating the need for storing the activations of input tensors (X). Furthermore, LoRA-FA narrows the gap between the update amount of pre-trained weights when using the low-rank fine-tuning method and the full fine-tuning method. In conclusion, LoRA-FA reduces memory consumption and leads to superior performance compared to vanilla LoRA. + +## Quick start + +```python +import torch +from peft import LoraConfig, get_peft_model +from peft.optimizers import create_lorafa_optimizer +from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer +from datasets import load_dataset + +model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") +dataset = load_dataset("timdettmers/openassistant-guanaco", split="train") + +lora_rank = 16 +lora_alpha = 32 + +lora_config = LoraConfig( + r=lora_rank, + lora_alpha=lora_alpha, + bias="none", +) +peft_model = get_peft_model(model, lora_config) +optimizer = create_lorafa_optimizer( + model=peft_model, + r=lora_rank, + lora_alpha=lora_alpha, + lr=7e-5, +) +# you can also use scheduler, we recommend get_cosine_schedule_with_warmup from transformers +# for better model performance +scheduler = None + +trainer = transformers.Trainer( + model=peft_model, + train_dataset=dataset, + dataset_text_field="text", + max_seq_length=2048, + processing_class=tokenizer, + optimizers=(optimizer, None), +) +trainer.train()
+peft_model.save_pretrained("lorafa-llama-3-8b-inst") +``` + +The only change in your code is to pass the LoRA-FA optimizer to the trainer (if training with `Trainer`). Do not forget `from peft.optimizers import create_lorafa_optimizer`! + +## Example + +This directory also provides a simple example for fine-tuning with the LoRA-FA optimizer. + +### Run on CPU, single-accelerator or multi-accelerator + +By default, the commands below 👇 load the model as a PEFT model with a LoRA config and train it with the LoRA-FA optimizer. + +0. CPU + +You can simply run LoRA-FA as below: + +```bash +python lorafa_finetuning.py --base_model_name_or_path meta-llama/Meta-Llama-3-8B --dataset_name_or_path meta-math/MetaMathQA-40K --output_dir path/to/output --lorafa +``` + +1. Single-accelerator + +Run the finetuning script on 1 accelerator: + +```bash +export CUDA_VISIBLE_DEVICES=0 # force to use CUDA GPU 0 +export ZE_AFFINITY_MASK=0 # force to use Intel XPU 0 + +python lorafa_finetuning.py --base_model_name_or_path meta-llama/Meta-Llama-3-8B --dataset_name_or_path meta-math/MetaMathQA-40K --output_dir path/to/output --lorafa +``` + +2. Multi-accelerator + +LoRA-FA can also be run on multiple accelerators with 🤗 Accelerate: + +```bash +export CUDA_VISIBLE_DEVICES=0,1,2,3 # force to use CUDA GPU 0,1,2,3 +export ZE_AFFINITY_MASK=0,1,2,3 # force to use Intel XPU 0,1,2,3 + +accelerate launch lorafa_finetuning.py --base_model_name_or_path meta-llama/Meta-Llama-3-8B --dataset_name_or_path meta-math/MetaMathQA-40K --output_dir path/to/output --lorafa +``` + +`accelerate launch` will automatically configure multi-accelerator training for you. You can also use `accelerate launch` in a single-accelerator scenario. + +### Use the model from 🤗 +You can load and use the model like any other 🤗 model.
+```python +from transformers import AutoModel +model = AutoModel.from_pretrained("meta-llama/Llama-2-7b-chat-hf") +``` + +## Best practice in fine-tuning Llama using LoRA-FA: the hyper-params + +Sometimes, achieving optimal LoRA fine-tuning can be challenging due to the larger number of hyperparameters to consider compared to full fine-tuning. For instance, not only do we need to adjust the commonly used learning rate, but the ideal LoRA rank may also vary depending on the specific model and task. Additionally, there are other factors to consider, such as LoRA alpha and sequence length. To assist with this, we have created a repository of reproducible best practices in the [LoRA-FA examples](https://github.com/AaronZLT/lorafa) for reference. This resource showcases the optimal LoRA-FA fine-tuning hyperparameters for different models across various datasets. By doing so, we significantly reduce the time and effort spent on hyperparameter tuning, and it may also provide insights for tuning other training hyperparameters. We encourage you to experiment and fine-tune on your own downstream tasks as well. + +## LoRA-FA's advantages and limitations + +By eliminating the need to store the input activations of adapter A, LoRA-FA uses less memory for fine-tuning compared to LoRA. For instance, when fine-tuning Llama-2-7b-chat-hf with a batch size of 8 and a sequence length of 1024, LoRA-FA requires 36GB of memory to store activations. This allows it to run successfully on an 80GB accelerator. In contrast, LoRA requires at least 60GB of memory for activations, leading to an Out of Memory (OOM) error. Additionally, the memory consumption of LoRA-FA is not sensitive to the rank, allowing for performance improvements by increasing the LoRA rank without additional memory usage. LoRA-FA further narrows the performance gap with Full-FT by minimizing the discrepancy between the low-rank gradient and the full gradient, enabling it to achieve performance that is on par with or even superior to vanilla LoRA.
+ +Despite its advantages, LoRA-FA is inherently limited by its low-rank approximation nature and potential issues with catastrophic forgetting. The gradient approximation can impact training throughput. Addressing these limitations, especially in terms of approximation accuracy and forgetting phenomena, presents a promising direction for future research. + +## Citation +``` +@misc{zhang2023lorafamemoryefficientlowrankadaptation, + title={LoRA-FA: Memory-efficient Low-rank Adaptation for Large Language Models Fine-tuning}, + author={Longteng Zhang and Lin Zhang and Shaohuai Shi and Xiaowen Chu and Bo Li}, + year={2023}, + eprint={2308.03303}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://huggingface.co/papers/2308.03303}, +} +``` diff --git a/peft/examples/lorafa_finetune/lorafa_finetuning.py b/peft/examples/lorafa_finetune/lorafa_finetuning.py new file mode 100644 index 0000000000000000000000000000000000000000..19091df167e0f7c451eafe40cf9e4cca24685168 --- /dev/null +++ b/peft/examples/lorafa_finetune/lorafa_finetuning.py @@ -0,0 +1,221 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from typing import Optional + +import torch +from datasets import load_dataset +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + DataCollatorForLanguageModeling, + Trainer, + TrainingArguments, +) + +from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training +from peft.optimizers import create_lorafa_optimizer + + +def train_model( + base_model_name_or_path: str, + dataset_name_or_path: str, + output_dir: str, + batch_size: int, + num_epochs: int, + lr: float, + cutoff_len: int, + quantize: bool, + eval_step: int, + save_step: int, + lora_rank: int, + lora_alpha: int, + lora_dropout: float, + lora_target_modules: Optional[str], + lorafa: bool, +): + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + is_bf16_supported = False + device_map = "cpu" + if torch.cuda.is_available(): + is_bf16_supported = torch.cuda.is_bf16_supported() + device_map = "cuda" + elif torch.xpu.is_available(): + is_bf16_supported = torch.xpu.is_bf16_supported() + device_map = "xpu" + compute_dtype = torch.bfloat16 if is_bf16_supported else torch.float16 + + # load tokenizer + tokenizer = AutoTokenizer.from_pretrained(base_model_name_or_path) + + # load model + if quantize: + model = AutoModelForCausalLM.from_pretrained( + base_model_name_or_path, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=compute_dtype, + bnb_4bit_use_double_quant=False, + bnb_4bit_quant_type="nf4", + ), + torch_dtype=compute_dtype, + device_map=device_map, + ) + # setup for quantized training + model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True) + else: + model = AutoModelForCausalLM.from_pretrained( + base_model_name_or_path, torch_dtype=compute_dtype, device_map=device_map + ) + + # LoRA config for the PEFT model + if lora_target_modules is not None: + if lora_target_modules == "all-linear": + target_modules = "all-linear" + else: + target_modules = 
lora_target_modules.split(",") + else: + target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + + lora_config = LoraConfig( + r=lora_rank, + lora_alpha=lora_alpha, + target_modules=target_modules, + lora_dropout=lora_dropout, + bias="none", + ) + + # get the peft model with LoRA config + model = get_peft_model(model, lora_config) + + tokenizer.pad_token = tokenizer.eos_token + + # Load the dataset + dataset = load_dataset(dataset_name_or_path) + + def tokenize_function(examples): + inputs = tokenizer(examples["query"], padding="max_length", truncation=True, max_length=cutoff_len) + outputs = tokenizer(examples["response"], padding="max_length", truncation=True, max_length=cutoff_len) + inputs["labels"] = outputs["input_ids"].copy() + return inputs + + # Tokenize the dataset and prepare for training + tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names) + dataset = tokenized_datasets["train"].train_test_split(test_size=0.1, shuffle=True, seed=42) + train_dataset = dataset["train"] + eval_dataset = dataset["test"] + + # Data collator to dynamically pad the batched examples + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) + + # Define training arguments + training_args = TrainingArguments( + output_dir=output_dir, + num_train_epochs=num_epochs, + per_device_train_batch_size=batch_size, + per_device_eval_batch_size=batch_size, + warmup_steps=100, + weight_decay=0.01, + logging_dir="./logs", + logging_steps=eval_step, + save_steps=save_step, + save_total_limit=2, + gradient_accumulation_steps=1, + bf16=True if compute_dtype == torch.bfloat16 else False, + fp16=True if compute_dtype == torch.float16 else False, + learning_rate=lr, + ) + + # Here we initialize the LoRA-FA Optimizer + # After this, all adapter A will be fixed, only adapter B will be trainable + if lorafa: + optimizer = create_lorafa_optimizer( + model=model, r=lora_rank, 
lora_alpha=lora_alpha, lr=lr, weight_decay=training_args.weight_decay + ) + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + data_collator=data_collator, + optimizers=(optimizer, None), + ) + else: + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + data_collator=data_collator, + ) + + # Start model training + trainer.train() + + # Save the model and tokenizer locally + model.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Fine-tune Meta-Llama-3-8B-Instruct with LoRA-FA and PEFT") + parser.add_argument( + "--base_model_name_or_path", + type=str, + default="meta-llama/Meta-Llama-3-8B-Instruct", + help="Base model name or path", + ) + parser.add_argument( + "--dataset_name_or_path", type=str, default="meta-math/MetaMathQA-40K", help="Dataset name or path" + ) + parser.add_argument("--output_dir", type=str, help="Output directory for the fine-tuned model") + parser.add_argument("--batch_size", type=int, default=1, help="Batch size") + parser.add_argument("--num_epochs", type=int, default=3, help="Number of training epochs") + parser.add_argument("--lr", type=float, default=7e-5, help="Learning rate") + parser.add_argument("--cutoff_len", type=int, default=1024, help="Cutoff length for tokenization") + parser.add_argument("--quantize", action="store_true", help="Use quantization") + parser.add_argument("--eval_step", type=int, default=10, help="Evaluation step interval") + parser.add_argument("--save_step", type=int, default=100, help="Save step interval") + parser.add_argument("--lora_rank", type=int, default=16, help="LoRA rank") + parser.add_argument("--lora_alpha", type=int, default=32, help="LoRA alpha") + parser.add_argument("--lora_dropout", type=float, default=0.05, help="LoRA dropout rate") + 
parser.add_argument( + "--lora_target_modules", type=str, default=None, help="Comma-separated list of target modules for LoRA" + ) + parser.add_argument("--lorafa", action="store_true", help="Use LoRA-FA Optimizer") + + args = parser.parse_args() + + train_model( + base_model_name_or_path=args.base_model_name_or_path, + dataset_name_or_path=args.dataset_name_or_path, + output_dir=args.output_dir, + batch_size=args.batch_size, + num_epochs=args.num_epochs, + lr=args.lr, + cutoff_len=args.cutoff_len, + quantize=args.quantize, + eval_step=args.eval_step, + save_step=args.save_step, + lora_rank=args.lora_rank, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + lora_target_modules=args.lora_target_modules, + lorafa=args.lorafa, + ) diff --git a/peft/examples/miss_finetuning/README.md b/peft/examples/miss_finetuning/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d08719ca8f0e575478c39a1f7a45f0f2d954f7fe --- /dev/null +++ b/peft/examples/miss_finetuning/README.md @@ -0,0 +1,104 @@ +# MiSS: Balancing LoRA Performance and Efficiency with Simple Shard Sharing +## Introduction ([Paper](https://huggingface.co/papers/2409.15371), [code](https://github.com/JL-er/MiSS)) +MiSS (Matrix Shard Sharing) is a novel PEFT method that adopts a low-rank structure, requires only a single trainable matrix, and introduces a new update mechanism distinct from LoRA, achieving an excellent balance between performance and efficiency. 
+ + +## Quick Start +```python +import torch +from peft import MissConfig, get_peft_model +from transformers import AutoTokenizer, AutoModelForCausalLM +from trl import SFTConfig, SFTTrainer +from datasets import load_dataset + +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") +tokenizer.pad_token_id = tokenizer.eos_token_id + +miss_config = MissConfig( + r = 64 +) +#bat: In this mode, you can enable nonlinear updates across different shards. +# miss_config = MissConfig( +# r = 64, +# init_weights="bat" +# ) + +# mini: In this mode, you can set a smaller rank to use fewer trainable parameters, but it is recommended to keep `out_features % mini_r == 0`. +# miss_config = MissConfig( +# r = 64, +# init_weights="mini", +# mini_r = 8 +# ) +peft_model = get_peft_model(model, miss_config) + +peft_model.print_trainable_parameters() + +dataset = load_dataset("imdb", split="train[:1%]") + +training_args = SFTConfig(dataset_text_field="text", max_seq_length=128) +trainer = SFTTrainer( + model=peft_model, + args=training_args, + train_dataset=dataset, + processing_class=tokenizer, +) +trainer.train() +peft_model.save_pretrained("miss-llama-2-7b") +``` + + +To utilize the fine-tuned MiSS modules, simply run the following command: +```python +import torch +from peft import PeftModel +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto" +) +peft_model = PeftModel.from_pretrained(model, "miss-llama-2-7b") +``` + +## Advanced Usage + +### Fine-tune +```shell +#Bat performs better than MiSS, but it uses more memory and is twice as slow. If you want to use the Bat method, you only need to add the parameter init_weights="bat". 
+python miss_finetuning.py \ + --base_model_name_or_path meta-llama/Llama-2-7b-hf \ + --output_dir output/miss-llama-2-7b-metamath-10k \ + --miss_r 64 \ + --init_weights True \ + --bits bf16 \ + --data_path meta-math/MetaMathQA \ + --dataset_split train[:100000] \ + --dataset_field query response \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size 2 \ + --gradient_accumulation_steps 8 \ + --save_strategy "steps" \ + --save_steps 1000 \ + --save_total_limit 1 \ + --logging_steps 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --tf32 True \ + --report_to none +``` + + + +# Citation +```bib +@misc{kang2025balancingloraperformanceefficiency, + title={Balancing LoRA Performance and Efficiency with Simple Shard Sharing}, + author={Jiale Kang and Qingyu Yin}, + year={2025}, + eprint={2409.15371}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2409.15371}, +} diff --git a/peft/examples/miss_finetuning/miss_finetuning.py b/peft/examples/miss_finetuning/miss_finetuning.py new file mode 100644 index 0000000000000000000000000000000000000000..27d852d071495aa7022e2fe1c3bcaa7402b3a29d --- /dev/null +++ b/peft/examples/miss_finetuning/miss_finetuning.py @@ -0,0 +1,107 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from dataclasses import dataclass, field +from typing import Literal, Optional + +import torch +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser +from trl import SFTConfig, SFTTrainer + +from peft import MissConfig, get_peft_model + + +@dataclass +class ScriptArguments(SFTConfig): + # model configs + base_model_name_or_path: Optional[str] = field( + default=None, metadata={"help": "The name or path of the fp32/16 base model."} + ) + bits: str = field(default="bf16", metadata={"help": "(`['bf16', 'fp16', fp32]`)"}) + init_weights: Literal[True, "bat"] = field( + default=True, + metadata={ + "help": ( + "True -> MiSS efficience and balance; `bat` -> Bat, `mini` -> smaller MiSS efficience and balance" + ), + }, + ) + miss_r: int = field(default=16) + merge_and_save: bool = field(default=False) + # dataset configs + data_path: str = field(default="imdb", metadata={"help": "Path to the training data."}) + dataset_split: str = field(default="train[:1%]", metadata={"help": "(`['train', 'test', 'eval']`):"}) + dataset_field: list[str] = field(default=None, metadata={"help": "Fields of dataset input and output."}) + + +parser = HfArgumentParser(ScriptArguments) +script_args = parser.parse_args_into_dataclasses()[0] +print(script_args) + +print(f"Load pre-processed residual model in {script_args.bits} bits.") +if script_args.bits in ["nf4", "fp4", "int8"]: + print("MiSS currently does not support quantization.") + +elif script_args.base_model_name_or_path is not None: + print(f"No available pre-processed model, manually initialize a MiSS using {script_args.base_model_name_or_path}.") + model = AutoModelForCausalLM.from_pretrained( + script_args.base_model_name_or_path, + torch_dtype=( + torch.float16 + if script_args.bits == "fp16" + else (torch.bfloat16 if script_args.bits == "bf16" else torch.float32) + ), + device_map="auto", + ) + tokenizer = 
AutoTokenizer.from_pretrained(script_args.base_model_name_or_path) + tokenizer.pad_token_id = tokenizer.eos_token_id + miss_config = MissConfig( + r=script_args.miss_r, + target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"], + bias="none", + task_type="CAUSAL_LM", + init_weights=script_args.init_weights, + ) + peft_model = get_peft_model(model, miss_config) + +print(peft_model) +peft_model.print_trainable_parameters() + +print(f"Training MiSS with trl on the {script_args.data_path}[{script_args.dataset_split}] dataset.") +dataset = load_dataset(script_args.data_path, split=script_args.dataset_split) +dataset = dataset.map( + lambda example: { + "text": f"### USER: {example[script_args.dataset_field[0]]}\n### ASSISTANT: {example[script_args.dataset_field[1]]}" + } +) + +trainer = SFTTrainer( + model=peft_model, + args=script_args, + train_dataset=dataset, + processing_class=tokenizer, +) +trainer.train() +trainer.save_state() + +peft_model.save_pretrained( + os.path.join(script_args.output_dir, "miss_ft"), +) + +if script_args.merge_and_save: + model = peft_model.merge_and_unload() + model.save_pretrained(os.path.join(script_args.output_dir, "miss_merged")) + tokenizer.save_pretrained(os.path.join(script_args.output_dir, "miss_merged")) diff --git a/peft/examples/multi_adapter_examples/Lora_Merging.ipynb b/peft/examples/multi_adapter_examples/Lora_Merging.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..7778c1a21a6ec1ab4e7f4aa0a029eb591728054d --- /dev/null +++ b/peft/examples/multi_adapter_examples/Lora_Merging.ipynb @@ -0,0 +1,194 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "id": "db4208b9-5da4-46df-b77a-0f1836c9e4ec", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\" # force using CUDA device 1\n", + "os.environ[\"ZE_AFFINITY_MASK\"] = \"1\" # force using Intel XPU device 1\n", + "from peft import 
PeftConfig, PeftModel\n", + "from peft import PeftModel, PeftConfig\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n", + "from datasets import load_dataset\n", + "import torch\n", + "import random\n", + "\n", + "peft_model_id = \"smangrul/tinyllama_lora_norobots\"\n", + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "config = PeftConfig.from_pretrained(peft_model_id)\n", + "model_kwargs = {\"device_map\": \"auto\"}\n", + "model_kwargs[\"quantization_config\"] = BitsAndBytesConfig(load_in_4bit=True)\n", + "model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, **model_kwargs)\n", + "tokenizer = AutoTokenizer.from_pretrained(peft_model_id)\n", + "model.resize_token_embeddings(len(tokenizer))\n", + "model = PeftModel.from_pretrained(model, peft_model_id, adapter_name=\"norobots\")\n", + "_ = model.load_adapter(\"smangrul/tinyllama_lora_sql\", adapter_name=\"sql\")\n", + "_ = model.load_adapter(\"smangrul/tinyllama_lora_adcopy\", adapter_name=\"adcopy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "541dab43-9675-42a2-8d90-7437df9f0fa0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 17.1 s, sys: 458 ms, total: 17.5 s\n", + "Wall time: 1.94 s\n" + ] + } + ], + "source": [ + "%%time\n", + "# [0.8, 0.1, 0.1] linear #[1.0, 0.2] 0.7 density dare_linear #[1.5, 0.3] 0.5 density ties #[0.8, 0.5] cat\n", + "adapters = [\"norobots\", \"adcopy\", \"sql\"]\n", + "weights = [2.0, 0.3, 0.7]\n", + "adapter_name = \"merge\"\n", + "density = 0.2\n", + "combination_type = \"ties\"\n", + "if adapter_name in model.peft_config:\n", + " model.delete_adapter(adapter_name)\n", + "model.add_weighted_adapter(adapters, weights, adapter_name, combination_type=combination_type, density=density)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": 
"76596671-3677-47f0-9d66-81f40bc4d726", + "metadata": {}, + "outputs": [], + "source": [ + "model.eval()\n", + "model.set_adapter(\"merge\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d59f9f3-6313-43d8-be36-4ca2bbb105b2", + "metadata": {}, + "outputs": [], + "source": [ + "messages = [\n", + " {\"role\": \"user\", \"content\": \"Write an essay about Generative AI.\"},\n", + "]\n", + "text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)\n", + "inputs = tokenizer(text, return_tensors=\"pt\") # , add_special_tokens=False)\n", + "inputs = {k: v.to(device) for k, v in inputs.items()}\n", + "outputs = model.generate(\n", + " **inputs,\n", + " max_new_tokens=256,\n", + " do_sample=True,\n", + " top_p=0.95,\n", + " temperature=0.2,\n", + " repetition_penalty=1.2,\n", + " eos_token_id=tokenizer.eos_token_id,\n", + ")\n", + "print(tokenizer.decode(outputs[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5c1daeb-59c8-41d7-bebb-7abd052ab917", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<|im_start|>system \n", + "Create a text ad given the following product and description.<|im_end|> \n", + "<|im_start|>user \n", + "Product: Sony PS5 PlayStation Console\n", + "Description: The PS5™ console unleashes new gaming possibilities that you never anticipated.<|im_end|> \n", + "<|im_start|>assistant \n", + "Ad Text: Experience the next-gen power of the all-new Sony PS5 with its stunning visuals, innovative gameplay features, and more! 
Get ready to play in style as you experience the future of gaming on your own terms.<|im_end|>\n" + ] + } + ], + "source": [ + "messages = [\n", + " {\"role\": \"system\", \"content\": \"Create a text ad given the following product and description.\"},\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Product: Sony PS5 PlayStation Console\\nDescription: The PS5™ console unleashes new gaming possibilities that you never anticipated.\",\n", + " },\n", + "]\n", + "text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)\n", + "inputs = tokenizer(text, return_tensors=\"pt\") # , add_special_tokens=False)\n", + "inputs = {k: v.to(device) for k, v in inputs.items()}\n", + "outputs = model.generate(\n", + " **inputs,\n", + " max_new_tokens=128,\n", + " do_sample=True,\n", + " top_p=0.95,\n", + " temperature=0.2,\n", + " repetition_penalty=1.2,\n", + " eos_token_id=tokenizer.eos_token_id,\n", + ")\n", + "print(tokenizer.decode(outputs[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5bb08b46-90ae-48a8-8783-ca74b3e26e42", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Table: 2-11365528-2\n", + "Columns: ['Team', 'Head Coach', 'President', 'Home Ground', 'Location']\n", + "Natural Query: Who is the Head Coach of the team whose President is Mario Volarevic?\n", + "SQL Query: SELECT Head Coach FROM 2-11365528-2 WHERE President = Mario Volarevic\n" + ] + } + ], + "source": [ + "text = \"\"\"Table: 2-11365528-2\n", + "Columns: ['Team', 'Head Coach', 'President', 'Home Ground', 'Location']\n", + "Natural Query: Who is the Head Coach of the team whose President is Mario Volarevic?\n", + "SQL Query:\"\"\"\n", + "\n", + "inputs = tokenizer(text, return_tensors=\"pt\") # , add_special_tokens=False)\n", + "inputs = {k: v.to(device) for k, v in inputs.items()}\n", + "outputs = model.generate(\n", + " **inputs, max_new_tokens=64, repetition_penalty=1.1, 
eos_token_id=tokenizer(\"\").input_ids[-1]\n", + ")\n", + "print(tokenizer.decode(outputs[0]))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/multi_adapter_examples/PEFT_Multi_LoRA_Inference.ipynb b/peft/examples/multi_adapter_examples/PEFT_Multi_LoRA_Inference.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..3815a360a779206c18c810197c193523c0f8dcab --- /dev/null +++ b/peft/examples/multi_adapter_examples/PEFT_Multi_LoRA_Inference.ipynb @@ -0,0 +1,367 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "jONLwzXgLg-I", + "metadata": { + "id": "jONLwzXgLg-I" + }, + "outputs": [], + "source": [ + "!pip install -q git+https://github.com/huggingface/transformers.git\n", + "!pip install -q git+https://github.com/huggingface/peft.git\n", + "!pip install -q git+https://github.com/huggingface/accelerate.git@main\n", + "!pip install huggingface_hub\n", + "!pip install bitsandbytes\n", + "!pip install SentencePiece" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36460935", + "metadata": { + "id": "36460935" + }, + "outputs": [], + "source": [ + "import os\n", + "import torch\n", + "\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\" # force using CUDA device 0\n", + "os.environ[\"ZE_AFFINITY_MASK\"] = \"0\" # force using Intel XPU device 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1351e04c", + "metadata": { + "id": "1351e04c" + }, + "outputs": [], + "source": [ + "from huggingface_hub import notebook_login\n", + "\n", + "\n", + "notebook_login()" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "d85af699", + "metadata": { + "id": "d85af699" + }, + "outputs": [], + "source": [ + "from peft import PeftModel\n", + "from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, BitsAndBytesConfig\n", + "\n", + "model_name = \"meta-llama/Llama-2-7b-hf\"\n", + "tokenizer = LlamaTokenizer.from_pretrained(model_name)\n", + "model = LlamaForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True), device_map=\"auto\", use_auth_token=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0f515ed", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "f0f515ed", + "outputId": "312488a5-f4f8-48a4-8c63-7b4a59e80418" + }, + "outputs": [], + "source": [ + "%%time\n", + "model = PeftModel.from_pretrained(model, \"tloen/alpaca-lora-7b\", adapter_name=\"eng_alpaca\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67a0c121", + "metadata": { + "id": "67a0c121" + }, + "outputs": [], + "source": [ + "%%time\n", + "model.load_adapter(\"22h/cabrita-lora-v0-1\", adapter_name=\"portuguese_alpaca\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b655fca", + "metadata": { + "id": "4b655fca" + }, + "outputs": [], + "source": [ + "model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9ebd572", + "metadata": { + "id": "e9ebd572" + }, + "outputs": [], + "source": [ + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "\n", + "model.to(device)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "138805b3", + "metadata": { + "id": "138805b3" + }, + "outputs": [], + "source": [ + "def generate_prompt(instruction, input=None):\n", + " if input:\n", + " return f\"\"\"Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request.\n", + "### Instruction:\n", + "{instruction}\n", + "### Input:\n", + "{input}\n", + "### Response:\"\"\"\n", + " else:\n", + " return f\"\"\"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n", + "### Instruction:\n", + "{instruction}\n", + "### Response:\"\"\"\n", + "\n", + "\n", + "def evaluate(\n", + " instruction,\n", + " input=None,\n", + " temperature=0.1,\n", + " top_p=0.75,\n", + " top_k=40,\n", + " num_beams=4,\n", + " max_new_tokens=256,\n", + " **kwargs,\n", + "):\n", + " prompt = generate_prompt(instruction, input)\n", + " inputs = tokenizer(prompt, return_tensors=\"pt\")\n", + " input_ids = inputs[\"input_ids\"].to(device)\n", + " generation_config = GenerationConfig(\n", + " temperature=temperature,\n", + " top_p=top_p,\n", + " top_k=top_k,\n", + " num_beams=num_beams,\n", + " no_repeat_ngram_size=3,\n", + " **kwargs,\n", + " )\n", + "\n", + " with torch.no_grad():\n", + " generation_output = model.generate(\n", + " input_ids=input_ids,\n", + " generation_config=generation_config,\n", + " return_dict_in_generate=True,\n", + " output_scores=True,\n", + " max_new_tokens=max_new_tokens,\n", + " )\n", + " s = generation_output.sequences[0]\n", + " output = tokenizer.decode(s)\n", + " return output.split(\"### Response:\")[1].strip()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "fd5e6b3b", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fd5e6b3b", + "outputId": "ec72241b-c427-4258-b02f-2101df0d171a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 5.16 ms, sys: 443 μs, total: 5.6 ms\n", + "Wall time: 5.58 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "model.set_adapter(\"eng_alpaca\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "33650851", + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "id": "33650851", + "outputId": "aae24052-0f09-4812-88c3-6fb53dec656c" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n", + "The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The alpaca (Vicugna pacos) is a domesticated species of South American camelid. It resembles a small llama in appearance. It is kept in herds that graze on the level heights of the Andes of southern Peru, southern Bolivia, Ecuador, and northern Chile, at an altitude of about 3,800 m (12,500 ft) to 5,000 meters (16,404 ft). It is bred for its fiber, which is similar to sheep's wool but finer, silkier, and more durable. Alpaca fiber is used for making knitted and woven items, such as sweaters, hats, gloves, scarves, a variety of textiles, rugs, and blankets. The wool can be dyed, and is used to make ponchos, blankets, and sweaters in Peru and other Andean countries. 
The animals are also raised for meat and as a source of dairy products, including milk, butter, and cheese.\n", + "Alpaca fleece comes in 22 natural colors, the most\n" + ] + } + ], + "source": [ + "instruction = \"Tell me about alpacas.\"\n", + "\n", + "print(evaluate(instruction))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "fdc7196e", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fdc7196e", + "outputId": "44cb6742-066b-470e-f507-cbf21e5ae030" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 6 ms, sys: 0 ns, total: 6 ms\n", + "Wall time: 5.86 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "model.set_adapter(\"portuguese_alpaca\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "31997da3", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "31997da3", + "outputId": "8071de75-dc9d-4e89-e85f-674f1de22658" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n", + "The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I'm sorry, but I can't make it to the party. I'm not feeling well. I have a headache and I don't think it's a good idea for me to go out tonight. 
I hope you understand and that you have a great time at the party!\n" + ] + } + ], + "source": [ + "instruction = \"Invente uma desculpa criativa pra dizer que não preciso ir à festa.\"\n", + "\n", + "print(evaluate(instruction))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "8b8e4e9a", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8b8e4e9a", + "outputId": "84226223-e018-4feb-e189-969c344fd940" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n", + "The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Eu não posso ir porque tenho que fazer uma tarefa para o meu professor.\n", + "\n" + ] + } + ], + "source": [ + "with model.disable_adapter():\n", + " instruction = \"Invente uma desculpa criativa pra dizer que não preciso ir à festa.\"\n", + "\n", + " print(evaluate(instruction))" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [] + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/multi_adapter_examples/multi_adapter_weighted_inference_diffusers.ipynb b/peft/examples/multi_adapter_examples/multi_adapter_weighted_inference_diffusers.ipynb new file mode 100644 index 
0000000000000000000000000000000000000000..fec4092f4e6d26a84f6bff0c9f0a736b54ff0614 --- /dev/null +++ b/peft/examples/multi_adapter_examples/multi_adapter_weighted_inference_diffusers.ipynb @@ -0,0 +1,12881 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook shows how to use the adapter merging methods from `peft` and apply them image generation models using `diffusers`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QaEZ3dPgGtza" + }, + "source": [ + "## Turn `diffusers` LoRA checkpoints into `PeftModel`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KwvBjN-e62ts", + "outputId": "19c58dc7-95db-49c6-beb8-3ade1a8fe284" + }, + "outputs": [], + "source": [ + "!pip install diffusers accelerate transformers -U -q\n", + "!pip install git+https://github.com/huggingface/peft -q" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5S64sUQhJqB3" + }, + "outputs": [], + "source": [ + "from google.colab import userdata\n", + "TOKEN = userdata.get(\"HF_TOKEN\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 150, + "referenced_widgets": [ + "c9f764b5036042af9f1505e3729cbc32", + "b999b3e3af3744a79c0c90657ef37d4e", + "f3baef4fbf4b4ec08480522be921f841", + "bd2740e191a74558a77a965b4f2d7f28", + "6f6a6cfd50404f1ea09f83e95b04550a", + "713dec1904ce46f6b2d5a9b7e3e0373a", + "a6bb8206de044c74a03d1a64c801e742", + "f2c67c29e1224df3b2def5a87eb8d368", + "486282a4ead148868005c592d74a4ed4", + "48f51c96b7574946bf9542633eb39135", + "35c810f4bfe741f091f172cede413950", + "67567eea233b423c8acd62773a4adb30", + "5161248cd0384d5887ed231ecd48c82e", + "6f72ea2c284e4e40899375c7b07c517f", + "d99a364420454ba5bfd510d1226b94af", + "d33c90e8ea0945d397659f6a90cf51b6", + "097fa44254ea4ecda7c8db995f370afc", + 
"9fb982789fbc4582bceb356e351db438", + "0204153b17ca4bb0b21fc033393ce9bd", + "f02041b1d5e1485bb2ba02b00fc2c242", + "4d327c9e91b34c7b84cedd8f9660e9fd", + "b44f7154c55146a3bf5f4bd9e438086f", + "b27ac2aaff694dd5999ab2cba91195da", + "556730e12d5d4e0ea51a0dd1b5aac331", + "584dd148b2344bdc92f1d0850399aed7", + "ccc5bdc185a84901994577ff7f1bc962", + "2c75632d8fdc4458814055172d1d72c3", + "2f016774e7854ef589442734d0bb2f08", + "a15c3a32bfd347a98c2a50d27cd5b9f9", + "67ef71b6521b47dd90e3dc0fd03016f2", + "989c54778eb7469fb91e4337d6f49b0b", + "439cf6c6f9e845cea4008e3219454ff4", + "460c3c96d1724713b78bddcfc1f3eb97" + ] + }, + "id": "1YH9xWDcyhaa", + "outputId": "8b7b32f4-4b77-411e-f499-7b2cf7650613" + }, + "outputs": [], + "source": [ + "from diffusers import UNet2DConditionModel\n", + "import torch\n", + "\n", + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "\n", + "model_id = \"stabilityai/stable-diffusion-xl-base-1.0\"\n", + "unet = UNet2DConditionModel.from_pretrained(\n", + " model_id, subfolder=\"unet\", torch_dtype=torch.float16, use_safetensors=True, variant=\"fp16\"\n", + ").to(device)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "-kfTBaLR1Mp-" + }, + "outputs": [], + "source": [ + "# So that we can populate it later.\n", + "import copy\n", + "\n", + "sdxl_unet = copy.deepcopy(unet)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 593, + "referenced_widgets": [ + "0b7ffee735044ece90f010c6771d2d69", + "27671f87d0f3400b89e34bc428daa53c", + "33a018bf2ed547caa69f8ca91dcbb112", + "a91df5ceea8647d98f4d49701bb00969", + "c8555cb30ade40b9bd9a3435d6deeb67", + "b28a2b59aab9478e9ac3ab24a1de6f9a", + "c08fca70486040069d4f8f1df46a1074", + "bab2458ba3f54b229828ad9c8706aba4", + "2a0f746d6eab4680b6c44ec5cdbd8fd9", + "5269d405725946b3a657d3a8d7b25885", + "b4a697d8335c435ab21219716a1da022", + 
"85f22e78d47641d19efb6e6d62f6a014", + "d8bbb7402f3e44b2899fc98f02cee87e", + "8b41763280a048d485f06682ddc12ca2", + "05791b70a24a42138755141602399c47", + "753f64d9069640d985f399a058fc9b5b", + "19679186751b42e3ad2c44ea46c82a9c", + "d4baf7891f854c9c9898314633f97356", + "9c6ab017f5fb46eda8e3c22e9dd2b838", + "10bc970e474a46249e2a5e9e43fec7be", + "5c838962c7854541b61877c6d42481c4", + "4d54c07dd8e94557b82521772c1a2825", + "efd33121bdfc4ce195a07c9ef523a477", + "d2cd2572973249baa8e130b5777f4147", + "4e77d8b6cdb94fc68974d27b24cfb3fd", + "c292a598350d4dd4bb9b70aab1320c29", + "c68a5ad3bb664e8785e724085d208e96", + "b27c74f0c7bf493fa8bcb4c5b9c9c100", + "09638c8da74f4dddaf1d5d94dd8ad885", + "6a8a634cd3844fa081a4622d224ec940", + "2460b9e05b58481e898b139b94532c14", + "cdd8f9b1592842b48e1ffa80f8ec8246", + "ba9b002888b448738ba4b127e1046f5d", + "54ab158c643540abb8b3a96c1ae3ecab", + "55130e37444844f88286a87c8d153eba", + "7e3048e0fac94dfdaca1bc862ecfce15", + "643ba607008547ba9572fa4880a7d0f2", + "5738e5d103254c5485856f97d82954ab", + "beedc3b275c24432a2e959dcf9dd418d", + "10690b309908402289e9891203714199", + "0c68c6c65fb94650b15069a25d9e1699", + "ad233dfd52034b9e8c6c2c5b86995717", + "2ef25bd6dd644347baac12366d7002fd", + "0918b23f41e0404e82fccd08cadc6ccf", + "b25e822b3f77431fb72b4780067c90d9", + "55177afb435f44898df300143703e4d8", + "b79d3f5bd8024451bc7148ba2a5029bd", + "eacd646e2b984e60ab603bfc6d631de8", + "d158a80f4186411a8a9c335a8d5a888e", + "7049676db714446b98bba16b5f1b049e", + "09172ff4be4e433483d27b576464d1df", + "ea257c1c73524141b87ab3c1ef85c908", + "c474ed5e340146baae6c38f62013afe3", + "2ece04bc10934b3cb9d383abfc5ccd6e", + "67a03631ebc54f99928f0feb18ab38af", + "40ff502d2d5c40378c25cf84dd3323c8", + "230bd59922e84b90b0a141b3ac1e681b", + "6d0f946764444df28cb0da0fd0a408eb", + "c995197e66e04874a9f5d34db98b8890", + "f2596717405c40e1a39b721386e7a972", + "f8e6babf4fdd4c8e80d6dd24ff22d464", + "5e030e8a026b4513aa954203169f0a27", + "f9ca5d4810b34938b6f997ff66a8d541", + 
"b418ff1733db4efbab1b00b632b894e2", + "83f2d1dfaba54da38f0421b69930c3c1", + "577cc4b4f27941189c62951046db24ec", + "a16016987b6145b69caaac6712d72835", + "c82cf8cf90ed4e93bccffcf75881a56d", + "5a4c6dc09a1049c0adfca9834b045a25", + "c4d83c2b37504473afe63209a178b4cd", + "fae9d16daace412492b048b012b8d6dc", + "421b59f4021d4c4d930c51b6b4c7071f", + "60cc7415644d4e16b88f8fc5896b4b3b", + "fd13a58d6b444a0f955832647f64df12", + "8d492d53eb7f4225b516a65ef80f24e5", + "70849993c0c94ecf87c56e430f06181d", + "e820c557697648378966ed0a073826c8", + "a9ac3b2188594c19885bcdb9a659ecde", + "a708db5805424449afa269b714ebb3b3", + "6d492ddabcbe4d65b5d311834865ab92", + "e17e3253c16743a29f09b82d23c3b26d", + "e67a69c294334b01974f8bef36f133a0", + "24cbd9338ad94801aa11f4dcb2a867cb", + "c88fcf0d20154b1fb9b7e8a00116b5e6", + "ee8407365f5d42d9b98536152c9efe92", + "5ceb5c369f974b84ba850c9e81730a0e", + "a7767d5a440f4c819cdb87414f2187ff", + "f0bc6b14a299445ca705b888b3047064", + "fd0ff16b68b2488d8c31ebe700dee9c9", + "0e49d820da754da785bec2e5940eb9f6", + "0b53b908088648e3b2beadaeba0f5da1", + "804e7ee768794bba88aec3137f418868", + "0d786d8386ba49d0b53a5452d52e722d", + "33d459c0fbfe45389abe7eb43d2710a1", + "dd42a2ff90854b74ba5fde1de26b4e15", + "a3ce5829f8a640e79a19d737663b8474", + "62afb52a01924566b52f6c2d9ffb76f4", + "7a52c380a91c4f49bbfde658550248d3", + "3e418e46c47841ac9d717a6981807f68", + "8840288388ae4162884feef9c8e776f7", + "67b7783351d340d99c44149635b9be84", + "0f60c0f123954744ad13b670ca6dce77", + "dfabe7aa70024d1aa868ef5e6650dc6d", + "7d282fda276343c3aff99b001253ecc1", + "4cae160456c74a5fa761f026d64b2e35", + "819a26acd18f443882feb129f2c576d6", + "d2ed1988cf8c4bcdb1793b3d5068efed", + "6f113303101b4f448380a878d8900bf6", + "60e8e9f8e6ce40dd910ce1a9410b5e24", + "6703a0417c474db5bf261fe8679051e9", + "cb89ecd5a8c14051985495da1797a202", + "61db57127c5845759360bdf8b29dac2d", + "ab9c869439a94bddbbc0c6098f4c5b2a", + "587bd0b90afa450ba82e49cf86ee135d", + "955ec2551f8b400dbbdba68d7449de76", + 
"f46df85f441e4ada831f0e2b142f296a", + "d4f1dbe4ce244abc987b1089876e080f", + "13ac509e0d5f42e9bbb6deda62f77923", + "4a4a50a17c014f189b52119901104d79", + "cf42fc299989442f94f4a9df63005ab4", + "90b9300ba00e41e58170ec5634622985", + "4fa4f545258b4d6ca014a4c84ec4b24a", + "993e5303f107468f83dbf51026e64301", + "1e6c1c848e364ce4841ddcaa1383cfba", + "d432098a941c463599648b156abea24b", + "917139ad07a64e0cae89f2beeffae956", + "83afce4becac4f37ba916bf1901346ff", + "071e40d45ad14fc19b1480927d15d2ae", + "fe19dcea6d9a44d28f077e065f1671c4", + "08fddacdff4b4ae09adb5440fd86ae86", + "d62465d39d7e4265832901e9b9707993", + "4f56214f69034077bafdfdabc1c2aebf", + "29b67774c6944bc7ab93e7ba0eaf867f", + "0805d34df44f451d9b9910dbe5999245", + "2f2c0ea0fc914e7981e34d01751f74d8", + "5f52ae61812544f29f774f80fcb7a09c", + "ca5dc8342ef946a49d9e67a21f1a67c8", + "babce0e85baf4279ae1d22d64006667f", + "5cf6cd09cda64d688f4a0f7c511533d0", + "f668dd13af6f41d8be358f7db5261c54", + "8a8ef60b3b72452fb9ccc31052ab3b4f", + "7d29b20296aa4e33837b8ad53fc4adba", + "6a94d1d176b844db96de0c0e3cd67701", + "c80901426c87439481078b9da2e0c772", + "416f31db79fd431fb8dc06e994421b70", + "1971fcd35a564c449b4f437dac46058f", + "76a474e4aab14a5987cf25d1391d578c", + "ff3bf3f1873c4b01b0a547dfe02923ce", + "a6295c7e7630444c9b7425b884ad9707", + "3ede828a0374453e9aac9a6695befae3", + "dc4391fe30694a788134fffdd2a23d1a", + "cfc6ec59d45d42d187f42061902abbfb", + "68822f8f861a48859df630fad63f51c1", + "2671327d35e64f0da670a3a611fd0886", + "aae194e7f71f4eba80ffd18f91f083a8", + "9ad4192f6f244264aede1b3c8ac3c57a", + "90558769855c4bb08ff2fe4af940c45b", + "75384779c5d540d28afb53a0e318b674", + "a287f197e4fe4f908e3ee0a0e6cb35cf", + "fdb799739700447d8a5198f1f4f9b17f", + "683db0ff105a47038c350cfc74b88345", + "b79d32186443469d94836b663bf156b5", + "86cb336c1f684867bd13dae0370b4d36", + "7f63b3d80ecb4c3eb837bd0e616e1623", + "4f6e7b21ba0747f19b9d63468861c988", + "766567be45eb4db6b8d7a3364566c1dc", + "3c222a56d4404863a1dac60f1d03835b", + 
"db862ffbb44d450db514173df4c7f301", + "ddbeb13bb8174fc0b7d5543108d1c4f5", + "5b3bbc663d504fb99318ba186e6b8499", + "30bfac68f4224152b048d6ccf6013c5c", + "971f59d5f3e04697b3ab3c39ec0fc667", + "fe958df746be4dc1871bd58628697c3c", + "54417d8b9c5249d0a028d9e831dd8be6", + "9a37ffb9810f482b80f245f64947c371", + "bdfe4e4109a14a41bcd2e1e4242d82bb", + "22e894ba852e4072a1f63a83b3a98b16", + "95e24bbc8397455fabb724ed3c330511", + "d35b3848508c4b6390c243866649439d", + "c7cf10df8d7944aeb93947d5f4156c92", + "8e3b0b9f26a34ab8a932756b32834fd0", + "b1d427082b9f452eb6546c7d55016b36", + "061fcc6e5f3c44c48cde212c1ba515e5", + "d70508c304794bc79e80aab136eaf65a", + "01b20535e40b47068723073ac6c819ee", + "de0b54a59d9f47408d92915ad746cd5e", + "eb9a5f255fa0447eba6a33e1c30ba166", + "d1e4d2fd70e644f986104c998db7e53b", + "f786a0f386f6486083c15e576f6eb3e7", + "6715d91c6b62426aa49c344b65bcd8a2", + "0b0a85e1133c4ca3ba716e2403511703", + "017dc44dbe1c4de491e003a1e279a218", + "e3a1a5e9f29d4d28b0b9496493dafa21", + "c008577d922e436aabb8680ca0d13117", + "bd176c410a4e48a382a1b688e77aafcb", + "6b586ad7a3054c83b14243d69484127f", + "34fcd527a59748cf832c9efdb954fc0e", + "fd672cd5ba0c4695be4240707dcf4bf3" + ] + }, + "id": "EMTVH9cLEZYi", + "outputId": "7a24b4b0-71f6-4c65-a242-fb3d502da6a8" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading pipeline components...: 100%|██████████| 7/7 [00:00<00:00, 10.25it/s]\n" + ] + } + ], + "source": [ + "# Load the pipeline too.\n", + "from diffusers import DiffusionPipeline\n", + "\n", + "pipe = DiffusionPipeline.from_pretrained(\n", + " model_id, variant=\"fp16\", torch_dtype=torch.float16, unet=unet\n", + ").to(device)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49, + "referenced_widgets": [ + "5e677518051c43768dbf06243701c817", + "d727913663634e368ade4a7dc64fe74b", + "eb73095c804a4272856fe348fa3cb1e9", + 
"7e9b46b10fa24dfea489dfbc150d2a2e", + "c8156b0cc68e4b3693dcabc530a4ea9a", + "d987907f09084d44b452f939aadff65e", + "e976b994189343f5ba7d762ef92c79e2", + "9810873713024e79ae6d338dfeae5876", + "2e18ce21c01a4a3ca992622957e7d297", + "2db944a049a04426bba181fddb2801b1", + "683863a313034025ab99fab5810f39c7" + ] + }, + "id": "D5hL5156zPis", + "outputId": "2510b8e7-c030-40f8-dcd2-ef76fc8529c6" + }, + "outputs": [], + "source": [ + "# Only UNet\n", + "pipe.load_lora_weights(\"CiroN2022/toy-face\", weight_name=\"toy_face_sdxl.safetensors\", adapter_name=\"toy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "p-7YWoOs02La" + }, + "outputs": [], + "source": [ + "from peft import get_peft_model, LoraConfig\n", + "\n", + "toy_peft_model = get_peft_model(\n", + " sdxl_unet,\n", + " pipe.unet.peft_config[\"toy\"],\n", + " adapter_name=\"toy\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 101, + "referenced_widgets": [ + "c6bdeef396174d51af9eee277752bec7", + "685797f8907c47ffab4fe7a81ca22e63", + "414f8301f76043758f69bbfb6960072d", + "23c0492f021a4b60b1d84b0b82d15378", + "550b3ad10fcb422eb66f71ad95988616", + "222da37b5af14d60814c65cdd1ea20be", + "6a2b5bbd4afe4e0cabe39e95ccf528be", + "7ba112dcece64386bdbac6837004891b", + "2ffbf400ff1f4cc9b358bb10c6b9d99f", + "5405d8a1aa1d411b895b0523fb8e4ce7", + "04846af1def142ffad328d434ba228fe" + ] + }, + "id": "a_2n4Odz2a0c", + "outputId": "4b3b801b-649f-4c69-b75c-f800ac75c17f" + }, + "outputs": [], + "source": [ + "original_state_dict = {f\"base_model.model.{k}\": v for k, v in pipe.unet.state_dict().items()}\n", + "\n", + "toy_peft_model.load_state_dict(original_state_dict, strict=True)\n", + "toy_peft_model.push_to_hub(\"toy_peft_model-new\", token=TOKEN)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "z1DWL0X12rxD" + }, + "outputs": [], + "source": [ + 
"pipe.delete_adapters(\"toy\")\n", + "sdxl_unet.delete_adapters(\"toy\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49, + "referenced_widgets": [ + "d18c97fe685e4be080125ae770526255", + "e44c018bc76c48cd8738fee5966767ce", + "57b491647f1e49cea7ce34774a963936", + "484f694f734b4a92a14ecc4d048db0af", + "bcb3ec98d25b4c138c5e8f84c1e937c6", + "b0abce2d8a2046dba320e788e33e9d66", + "aef81ec1a7e844f883beba8c5754a8af", + "c040aa1f65514be28f0ca8ecdd1f69e4", + "eec76868d92f45d7a6db2e232a45e0c2", + "157c0c1e85ef40fb99e6cfe0e176be38", + "99717def9b6b4afe8a411a3bb83320c9" + ] + }, + "id": "9PW-SfwH5L7e", + "outputId": "90721ffa-faa5-4628-994a-7b719a4ef02c" + }, + "outputs": [], + "source": [ + "pipe.load_lora_weights(\"nerijs/pixel-art-xl\", weight_name=\"pixel-art-xl.safetensors\", adapter_name=\"pixel\")\n", + "pipe.set_adapters(adapter_names=\"pixel\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 101, + "referenced_widgets": [ + "adb2daf9d62f49f8ab1f0144b717d41e", + "bba38c266ceb4f30bb4bc1eaf5e3aa96", + "6926c8dd4e4e46d089bb387333691df7", + "abf2248c725b4837b5c2babef7f4ff3e", + "39e4bdf8d621451f984f6e7302fd6961", + "4abac6a83641414499f9b6b1514d1695", + "bf6e103b43844f17968eadf223a42acb", + "adf53d97af214cceaae895c8abfbd909", + "9d74948f7bcb498c9295e41e690a0a8d", + "d872c3900b8b4275b2224c9ec5e7d78f", + "4fbaa1bd51bf4b1e90337a435604d2bd" + ] + }, + "id": "jHSb-iIf7IEb", + "outputId": "29124d4c-b58f-4f0e-c59b-d79b44cb162f" + }, + "outputs": [], + "source": [ + "pixel_peft_model = get_peft_model(\n", + " sdxl_unet,\n", + " pipe.unet.peft_config[\"pixel\"],\n", + " adapter_name=\"pixel\"\n", + ")\n", + "\n", + "original_state_dict = {f\"base_model.model.{k}\": v for k, v in pipe.unet.state_dict().items()}\n", + "pixel_peft_model.load_state_dict(original_state_dict, strict=True)\n", 
+ "pixel_peft_model.push_to_hub(\"pixel_peft_model-new\", token=TOKEN)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "yoPzMtyqG2ZO" + }, + "outputs": [], + "source": [ + "del pipe, sdxl_unet, toy_peft_model, pixel_peft_model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Zis3zKZpGy8w" + }, + "source": [ + "## Weighted adapter inference" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 145, + "referenced_widgets": [ + "d74b1c667d42472b865ee1cbefc33a60", + "1074a274530e46c5b5a3e43653da43d0", + "dfde3984062442869bc8091bb94f2c36", + "a6f2ce8830734be4b8efe5ca0e14e990", + "07da49088e8d480fad03a2e828357872", + "5dd09616ba6544c499271054e8d8d2c5", + "fdb1fd279a0241429e5721ae2e92d217", + "7c6c4dff0a814bc6a7ac677980b45add", + "9a9da0d0e3d84a19b5d188c9bd6a83bb", + "de31002ed7fc475c915b4a29253108af", + "fbf3d268f30344b7864ce691d5bcb1f3", + "153a93d930df4ee396e03f1aaa6f04f1", + "c8e40d44aeac47a78f6502771f1471b7", + "270ddad8d7704f929a28c9fbbdabfa26", + "e57d317b3dda43bba13ecd4514f776d3", + "22b55b5ee1af4ea7b1acfe511b194cd8", + "a48d3ad24e9744f7898d1f2c5a696ea2", + "f29673e57d174839a0bde70bfa165715", + "13f5160dd981465890ada8a2cef22d5e", + "cc06ed7338b74ae6a1c563212aeb9f94", + "3a017f1d0ebf4a4aab57ac3eb0788774", + "48643ea67f2f4762bcd27de1d4cf0fd2", + "5c3d142907404cef8fd624839a166530", + "abaecfe7f39a43bb8fbd655d1a3009f4", + "d488374da5e74ec3a5973003590a7d69", + "5d8ae9661dce4a1aac8d418d84f3a209", + "32132ecbe71a4c2c903932513c2a1aa0", + "05700817adac4fdda6c95dd00eaeae38", + "5a7d87338ddc45df84080e0096a31631", + "cb9a536bc56f4d0ebf285e7f73d4730e", + "49bb475a11104e9496f2623e3d5caebd", + "887c5eae5b154eccb4a1caa3deef6e94", + "e87dffe17f1948e9ba794eddb605a908", + "2c352a90375443da835eef55b3e63303", + "db8fd6b2687c449fa0600d3e87c96999", + "17691a346ca5407a99a5e385450c97eb", + "0bca833a6aa74ebaa8f69feb738806bf", + 
"6bdb9b0b68c24b84a748c18ed927a8d3", + "355efc45ddaf42498d72d6134a28c87b", + "8b6464ce614c4aa29ac66ecce29b6cbf", + "bc07cdaad5b64fb3b0e1f8c214bba813", + "9e2e87c131a140a2a37dfdf483d27ced", + "47468a75637d436f849283c295a74ab6", + "af5003cf40ae4dfaa0660f247598856e" + ] + }, + "id": "gEqT1vFtG0_e", + "outputId": "282ce865-c653-4912-e497-ff825c896ae7" + }, + "outputs": [], + "source": [ + "from peft import PeftModel\n", + "\n", + "base_unet = UNet2DConditionModel.from_pretrained(\n", + " model_id, subfolder=\"unet\", torch_dtype=torch.float16, use_safetensors=True, variant=\"fp16\"\n", + ").to(device)\n", + "\n", + "toy_id = \"sayakpaul/toy_peft_model-new\"\n", + "model = PeftModel.from_pretrained(base_unet, toy_id, use_safetensors=True, subfolder=\"toy\", adapter_name=\"toy\")\n", + "model.load_adapter(\"sayakpaul/pixel_peft_model-new\", use_safetensors=True, subfolder=\"pixel\", adapter_name=\"pixel\")\n", + "\n", + "# https://huggingface.co/docs/peft/main/en/package_reference/lora#peft.LoraModel.add_weighted_adapter\n", + "model.add_weighted_adapter(\n", + " adapters=[\"toy\", \"pixel\"],\n", + " weights=[0.7, 0.3],\n", + " combination_type=\"linear\",\n", + " adapter_name=\"toy-pixel\"\n", + ")\n", + "model.set_adapters(\"toy-pixel\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 186 + }, + "id": "QStyurhKsP_g", + "outputId": "5e3a2627-27a2-4771-e62f-1b81ded2b87e" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "diffusers.models.unets.unet_2d_condition.UNet2DConditionModel" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(model.base_model.model)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "28a5059a2cc445d783e94ad5d83a0748", + "faddc146c69545cdaeb81edc8a0cda70", + 
"cd08b99de03c483a965248ff4df752ba", + "d37c6ac25fd34a65bf307896443a5063", + "66da596ae59d474f8a82a600174adff6", + "b600178b161d4a87a0b832a169d7caf2", + "b3eaa188cc2e48d081488eea4ed2971f", + "c05842ed12c848c68c4a69de9aa742a8", + "18e3ba4a61784f04b3238cf273c90c3e", + "42a623abc5d84c0eb7de4e5323bb6546", + "a65484354d254516936fcb425917a4b7", + "64029a1e70e040b49e38d38bd36823fd", + "3e1ec3a51e9b4fbbab489d34640cda90", + "ea0910fc31e44597968b2129272cc94d", + "3a0e9adc345f409cbcd79d1bd19219e6", + "3fe7e7f00ca746cc8cc762da6f365fde", + "6466eff2786241eeb142f17758894bb2", + "c181408b9b2b437ca11d584e3d1e94e7", + "0371aa5607604c06a868deb2a413cb31", + "c212598c5a8747f783a6efc18816e868", + "89ed71090d5c4366b21d25ad102e7da7", + "4ab21d7b956e46348ad6f6542fd92c2d" + ] + }, + "id": "iHwVV8f6s1EC", + "outputId": "47cb80da-266e-40c2-cfc1-3f3e5421b50b" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading pipeline components...: 100%|██████████| 7/7 [00:00<00:00, 21.14it/s]\n", + "Expected types for unet: (,), got .\n", + "100%|██████████| 30/30 [00:09<00:00, 3.19it/s]\n" + ] + }, + { + "data": { + "image/jpeg": "", + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = model.to(dtype=torch.float16, device=device)\n", + "\n", + "pipe = DiffusionPipeline.from_pretrained(\n", + " model_id, unet=model, variant=\"fp16\", torch_dtype=torch.float16,\n", + ").to(device)\n", + "\n", + "prompt = \"toy_face of a hacker with a hoodie, pixel art\"\n", + "image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]\n", + "image" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "id": "adLnc7sMRZlq" + }, + "outputs": [], + "source": [ + "del pipe" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "nIwIQK5zRX25" + }, + "outputs": [], + "source": [ + "base_unet = 
UNet2DConditionModel.from_pretrained(\n", + " model_id, subfolder=\"unet\", torch_dtype=torch.float16, use_safetensors=True, variant=\"fp16\"\n", + ").to(device)\n", + "\n", + "toy_id = \"sayakpaul/toy_peft_model-new\"\n", + "model = PeftModel.from_pretrained(base_unet, toy_id, use_safetensors=True, subfolder=\"toy\", adapter_name=\"toy\")\n", + "model.load_adapter(\"sayakpaul/pixel_peft_model-new\", use_safetensors=True, subfolder=\"pixel\", adapter_name=\"pixel\")\n", + "\n", + "# https://huggingface.co/docs/peft/main/en/package_reference/lora#peft.LoraModel.add_weighted_adapter\n", + "model.add_weighted_adapter(\n", + " adapters=[\"toy\", \"pixel\"],\n", + " weights=[0.5, 0.5],\n", + " combination_type=\"cat\",\n", + " adapter_name=\"toy-pixel\"\n", + ")\n", + "model.set_adapters(\"toy-pixel\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "65bf3df199b44763aa223fee96889e17", + "a32666723b0e4b9883a78f56295c4356", + "c08373a044204308ac882dd8cf9cdd3e", + "fa7876ade8e240fc89a35a1f8c7c7d3c", + "de583920d3b54774a486aef4c052e50d", + "979c68bd2e224a40b939c27f32c25dac", + "0027f3aa006d4276983691ad985ce91b", + "a0f8a1a3512443ac84799b95da70ca26", + "e08d739f59874064994212363a307f6e", + "52c7e22284b0468c8bc0c3b1cad047fb", + "17c29bbcbc0c437c9c8bc83e0b085f1a", + "65b15618abfd4e6f9fafd64813e86ace", + "1e8747251a1f4cca970857911d1c4a98", + "caa46820018f47abab4a962afe51cc34", + "22a2ea45880f4b0da47f1b213882dcb0", + "0cf822f588244e54b5264176f9611164", + "dd9666d76af04b72b08f59023eb04ee3", + "2702528a2ca049fc800ad44c492690ae", + "aed53b6480de4dd4bc7463af04840952", + "447d29db05384c55a57c5fb1bd121af4", + "2d7e7e816a63428b8f30471a12b57bc4", + "e15aab8dd01f4d5582db80e6ad9931fc" + ] + }, + "id": "29iGITdnRhFG", + "outputId": "dc6a1e54-3f76-457e-da1b-e8677be5c31f" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading 
pipeline components...: 100%|██████████| 7/7 [00:00<00:00, 19.85it/s]\n", + "Expected types for unet: (,), got .\n", + "100%|██████████| 30/30 [00:05<00:00, 5.35it/s]\n" + ] + }, + { + "data": { + "image/jpeg": "", + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = model.to(dtype=torch.float16, device=device)\n", + "\n", + "pipe = DiffusionPipeline.from_pretrained(\n", + " model_id, unet=model, variant=\"fp16\", torch_dtype=torch.float16,\n", + ").to(device)\n", + "\n", + "prompt = \"toy_face of a hacker with a hoodie, pixel art\"\n", + "image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]\n", + "image" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "8b88a1a63cf242de8a68962f50498c72", + "777c5583820248838f0c39e82362d9e3", + "491da86a94c44b7c87366b4bb72a3bd1", + "31c3a375a2964a06a2473925fe9b197b", + "0ba0a0fca31c482bb628a6739d341601", + "734e9a834ff74b64b17453013208a116", + "9784d9210d6e4214b78ab5f8c33e8044", + "490be826ebc14d5ab4e9f0e10ec79d5f", + "64ea3c5cc12841f59d1acba12deb6a88", + "fc1391aaeaad4eecad967e800a669ec1", + "272fceb4e9484b389067080872b1abf9", + "5ab5097f19cf4474945f96741c444d71", + "6d8fdd0303774305ae20dd39e2a1706c", + "031a326124f1496abe1f3bf8de720029", + "b9feee6f48bd49209e72e5c5e3136f67", + "a2d497a2ddb04d8fac8a4c0f8aa7f5dc", + "7bb3f4cda33947138c61aac74d952289", + "d3b73a841e68425994632d0f05cf4f16", + "71ad1e3dbe44437d8df985cfae207dcd", + "598fa824f517445394d08c37393f9f3d", + "daad0d12aff8470d990fbbbbe19d5891", + "16e18e872ab64b3f8bfb32f580c3371b" + ] + }, + "id": "sQOnSrteuS-S", + "outputId": "44c5e61a-370b-44bf-a5e5-80b7787088e5" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading pipeline components...: 100%|██████████| 7/7 
[00:00<00:00, 14.10it/s]\n", + "100%|██████████| 30/30 [00:03<00:00, 9.26it/s]\n" + ] + }, + { + "data": { + "image/jpeg": "", + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "del pipe\n", + "\n", + "pipe = DiffusionPipeline.from_pretrained(\n", + " model_id, variant=\"fp16\", torch_dtype=torch.float16,\n", + ").to(device)\n", + "\n", + "prompt = \"toy_face of a hacker with a hoodie, pixel art\"\n", + "image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]\n", + "image" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "A100", + "machine_shape": "hm", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "0027f3aa006d4276983691ad985ce91b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "017dc44dbe1c4de491e003a1e279a218": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "01b20535e40b47068723073ac6c819ee": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0204153b17ca4bb0b21fc033393ce9bd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": 
null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "031a326124f1496abe1f3bf8de720029": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_71ad1e3dbe44437d8df985cfae207dcd", + "max": 30, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_598fa824f517445394d08c37393f9f3d", + "value": 30 + } + }, + "0371aa5607604c06a868deb2a413cb31": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": 
null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "04846af1def142ffad328d434ba228fe": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "05700817adac4fdda6c95dd00eaeae38": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, 
+ "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "05791b70a24a42138755141602399c47": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5c838962c7854541b61877c6d42481c4", + "placeholder": "​", + "style": "IPY_MODEL_4d54c07dd8e94557b82521772c1a2825", + "value": " 17/17 [01:04<00:00, 14.56s/it]" + } + }, + "061fcc6e5f3c44c48cde212c1ba515e5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "071e40d45ad14fc19b1480927d15d2ae": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "07da49088e8d480fad03a2e828357872": { + "model_module": "@jupyter-widgets/base", + 
"model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0805d34df44f451d9b9910dbe5999245": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_babce0e85baf4279ae1d22d64006667f", + "placeholder": "​", + "style": "IPY_MODEL_5cf6cd09cda64d688f4a0f7c511533d0", + "value": "vae/config.json: 100%" + } + }, + "08fddacdff4b4ae09adb5440fd86ae86": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "09172ff4be4e433483d27b576464d1df": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0918b23f41e0404e82fccd08cadc6ccf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "09638c8da74f4dddaf1d5d94dd8ad885": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "097fa44254ea4ecda7c8db995f370afc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": 
null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0b0a85e1133c4ca3ba716e2403511703": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_34fcd527a59748cf832c9efdb954fc0e", + "placeholder": "​", + "style": "IPY_MODEL_fd672cd5ba0c4695be4240707dcf4bf3", + "value": " 7/7 [00:01<00:00,  9.61it/s]" + } + }, + "0b53b908088648e3b2beadaeba0f5da1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a3ce5829f8a640e79a19d737663b8474", + "max": 565, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_62afb52a01924566b52f6c2d9ffb76f4", + "value": 565 + } + }, + "0b7ffee735044ece90f010c6771d2d69": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_27671f87d0f3400b89e34bc428daa53c", + "IPY_MODEL_33a018bf2ed547caa69f8ca91dcbb112", + "IPY_MODEL_a91df5ceea8647d98f4d49701bb00969" + ], + "layout": "IPY_MODEL_c8555cb30ade40b9bd9a3435d6deeb67" + } + }, + "0ba0a0fca31c482bb628a6739d341601": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": 
null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0bca833a6aa74ebaa8f69feb738806bf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_47468a75637d436f849283c295a74ab6", + "placeholder": "​", + "style": "IPY_MODEL_af5003cf40ae4dfaa0660f247598856e", + "value": " 170M/170M [00:07<00:00, 22.4MB/s]" + } + }, + "0c68c6c65fb94650b15069a25d9e1699": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0cf822f588244e54b5264176f9611164": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0d786d8386ba49d0b53a5452d52e722d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, 
+ "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0e49d820da754da785bec2e5940eb9f6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_33d459c0fbfe45389abe7eb43d2710a1", + "placeholder": "​", + "style": "IPY_MODEL_dd42a2ff90854b74ba5fde1de26b4e15", + "value": "text_encoder/config.json: 100%" + } + }, + "0f60c0f123954744ad13b670ca6dce77": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d2ed1988cf8c4bcdb1793b3d5068efed", + "max": 1059962, + 
"min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6f113303101b4f448380a878d8900bf6", + "value": 1059962 + } + }, + "10690b309908402289e9891203714199": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1074a274530e46c5b5a3e43653da43d0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5dd09616ba6544c499271054e8d8d2c5", + "placeholder": "​", + "style": "IPY_MODEL_fdb1fd279a0241429e5721ae2e92d217", + "value": "toy/adapter_config.json: 100%" + } + }, + "10bc970e474a46249e2a5e9e43fec7be": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "13ac509e0d5f42e9bbb6deda62f77923": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "13f5160dd981465890ada8a2cef22d5e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "153a93d930df4ee396e03f1aaa6f04f1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c8e40d44aeac47a78f6502771f1471b7", + "IPY_MODEL_270ddad8d7704f929a28c9fbbdabfa26", + "IPY_MODEL_e57d317b3dda43bba13ecd4514f776d3" + ], + "layout": "IPY_MODEL_22b55b5ee1af4ea7b1acfe511b194cd8" + } + }, + "157c0c1e85ef40fb99e6cfe0e176be38": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "16e18e872ab64b3f8bfb32f580c3371b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "17691a346ca5407a99a5e385450c97eb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bc07cdaad5b64fb3b0e1f8c214bba813", + "max": 170461008, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9e2e87c131a140a2a37dfdf483d27ced", + "value": 170461008 + } + }, + "17c29bbcbc0c437c9c8bc83e0b085f1a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "18e3ba4a61784f04b3238cf273c90c3e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "19679186751b42e3ad2c44ea46c82a9c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1971fcd35a564c449b4f437dac46058f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dc4391fe30694a788134fffdd2a23d1a", + "max": 1389382176, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_cfc6ec59d45d42d187f42061902abbfb", + "value": 1389382176 + } + }, + "1e6c1c848e364ce4841ddcaa1383cfba": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fe19dcea6d9a44d28f077e065f1671c4", + "max": 460, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_08fddacdff4b4ae09adb5440fd86ae86", + "value": 460 + } + }, + "1e8747251a1f4cca970857911d1c4a98": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dd9666d76af04b72b08f59023eb04ee3", + "placeholder": "​", + "style": "IPY_MODEL_2702528a2ca049fc800ad44c492690ae", + "value": "100%" + } + }, + "222da37b5af14d60814c65cdd1ea20be": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "22a2ea45880f4b0da47f1b213882dcb0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2d7e7e816a63428b8f30471a12b57bc4", + "placeholder": "​", + "style": "IPY_MODEL_e15aab8dd01f4d5582db80e6ad9931fc", + "value": " 30/30 [00:07<00:00,  4.07it/s]" + } + }, + "22b55b5ee1af4ea7b1acfe511b194cd8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "22e894ba852e4072a1f63a83b3a98b16": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_95e24bbc8397455fabb724ed3c330511", + "IPY_MODEL_d35b3848508c4b6390c243866649439d", + "IPY_MODEL_c7cf10df8d7944aeb93947d5f4156c92" + ], + "layout": "IPY_MODEL_8e3b0b9f26a34ab8a932756b32834fd0" + } + }, + "230bd59922e84b90b0a141b3ac1e681b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f8e6babf4fdd4c8e80d6dd24ff22d464", + "placeholder": "​", + "style": "IPY_MODEL_5e030e8a026b4513aa954203169f0a27", + "value": "text_encoder_2/config.json: 100%" + } + }, + "23c0492f021a4b60b1d84b0b82d15378": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5405d8a1aa1d411b895b0523fb8e4ce7", + "placeholder": "​", + "style": "IPY_MODEL_04846af1def142ffad328d434ba228fe", + "value": " 5.12k/5.12k [00:00<00:00, 358kB/s]" + } + }, + "2460b9e05b58481e898b139b94532c14": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "24cbd9338ad94801aa11f4dcb2a867cb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + 
"flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2671327d35e64f0da670a3a611fd0886": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2702528a2ca049fc800ad44c492690ae": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "270ddad8d7704f929a28c9fbbdabfa26": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_13f5160dd981465890ada8a2cef22d5e", + "max": 170461008, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_cc06ed7338b74ae6a1c563212aeb9f94", + "value": 170461008 + } + }, + "272fceb4e9484b389067080872b1abf9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "27671f87d0f3400b89e34bc428daa53c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b28a2b59aab9478e9ac3ab24a1de6f9a", + "placeholder": "​", + "style": "IPY_MODEL_c08fca70486040069d4f8f1df46a1074", + "value": "model_index.json: 100%" + } + }, + "28a5059a2cc445d783e94ad5d83a0748": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ 
+ "IPY_MODEL_faddc146c69545cdaeb81edc8a0cda70", + "IPY_MODEL_cd08b99de03c483a965248ff4df752ba", + "IPY_MODEL_d37c6ac25fd34a65bf307896443a5063" + ], + "layout": "IPY_MODEL_66da596ae59d474f8a82a600174adff6" + } + }, + "29b67774c6944bc7ab93e7ba0eaf867f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0805d34df44f451d9b9910dbe5999245", + "IPY_MODEL_2f2c0ea0fc914e7981e34d01751f74d8", + "IPY_MODEL_5f52ae61812544f29f774f80fcb7a09c" + ], + "layout": "IPY_MODEL_ca5dc8342ef946a49d9e67a21f1a67c8" + } + }, + "2a0f746d6eab4680b6c44ec5cdbd8fd9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2c352a90375443da835eef55b3e63303": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_db8fd6b2687c449fa0600d3e87c96999", + "IPY_MODEL_17691a346ca5407a99a5e385450c97eb", + "IPY_MODEL_0bca833a6aa74ebaa8f69feb738806bf" + ], + 
"layout": "IPY_MODEL_6bdb9b0b68c24b84a748c18ed927a8d3" + } + }, + "2c75632d8fdc4458814055172d1d72c3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2d7e7e816a63428b8f30471a12b57bc4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, 
+ "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2db944a049a04426bba181fddb2801b1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2e18ce21c01a4a3ca992622957e7d297": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2ece04bc10934b3cb9d383abfc5ccd6e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2ef25bd6dd644347baac12366d7002fd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + 
"align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2f016774e7854ef589442734d0bb2f08": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "2f2c0ea0fc914e7981e34d01751f74d8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f668dd13af6f41d8be358f7db5261c54", + "max": 642, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8a8ef60b3b72452fb9ccc31052ab3b4f", + "value": 642 + } + }, + "2ffbf400ff1f4cc9b358bb10c6b9d99f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "30bfac68f4224152b048d6ccf6013c5c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "31c3a375a2964a06a2473925fe9b197b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fc1391aaeaad4eecad967e800a669ec1", + "placeholder": "​", + "style": "IPY_MODEL_272fceb4e9484b389067080872b1abf9", + "value": " 7/7 [00:01<00:00,  4.28it/s]" + } + }, + "32132ecbe71a4c2c903932513c2a1aa0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "33a018bf2ed547caa69f8ca91dcbb112": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bab2458ba3f54b229828ad9c8706aba4", + "max": 609, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2a0f746d6eab4680b6c44ec5cdbd8fd9", + "value": 609 + } + }, + "33d459c0fbfe45389abe7eb43d2710a1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": 
null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "34fcd527a59748cf832c9efdb954fc0e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "355efc45ddaf42498d72d6134a28c87b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "35c810f4bfe741f091f172cede413950": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "39e4bdf8d621451f984f6e7302fd6961": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3a017f1d0ebf4a4aab57ac3eb0788774": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3a0e9adc345f409cbcd79d1bd19219e6": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_89ed71090d5c4366b21d25ad102e7da7", + "placeholder": "​", + "style": "IPY_MODEL_4ab21d7b956e46348ad6f6542fd92c2d", + "value": " 30/30 [00:08<00:00,  4.13it/s]" + } + }, + "3c222a56d4404863a1dac60f1d03835b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_30bfac68f4224152b048d6ccf6013c5c", + "placeholder": "​", + "style": "IPY_MODEL_971f59d5f3e04697b3ab3c39ec0fc667", + "value": "tokenizer_2/vocab.json: 100%" + } + }, + "3e1ec3a51e9b4fbbab489d34640cda90": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6466eff2786241eeb142f17758894bb2", + "placeholder": "​", + "style": "IPY_MODEL_c181408b9b2b437ca11d584e3d1e94e7", + "value": "100%" + } + }, + "3e418e46c47841ac9d717a6981807f68": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3ede828a0374453e9aac9a6695befae3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3fe7e7f00ca746cc8cc762da6f365fde": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + 
"padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "40ff502d2d5c40378c25cf84dd3323c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_230bd59922e84b90b0a141b3ac1e681b", + "IPY_MODEL_6d0f946764444df28cb0da0fd0a408eb", + "IPY_MODEL_c995197e66e04874a9f5d34db98b8890" + ], + "layout": "IPY_MODEL_f2596717405c40e1a39b721386e7a972" + } + }, + "414f8301f76043758f69bbfb6960072d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7ba112dcece64386bdbac6837004891b", + "max": 5116, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2ffbf400ff1f4cc9b358bb10c6b9d99f", + "value": 5116 + } + }, + "416f31db79fd431fb8dc06e994421b70": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + 
"layout": "IPY_MODEL_a6295c7e7630444c9b7425b884ad9707", + "placeholder": "​", + "style": "IPY_MODEL_3ede828a0374453e9aac9a6695befae3", + "value": "model.fp16.safetensors: 100%" + } + }, + "421b59f4021d4c4d930c51b6b4c7071f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "42a623abc5d84c0eb7de4e5323bb6546": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + 
"grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "439cf6c6f9e845cea4008e3219454ff4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "447d29db05384c55a57c5fb1bd121af4": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "460c3c96d1724713b78bddcfc1f3eb97": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "47468a75637d436f849283c295a74ab6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + 
"overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "484f694f734b4a92a14ecc4d048db0af": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_157c0c1e85ef40fb99e6cfe0e176be38", + "placeholder": "​", + "style": "IPY_MODEL_99717def9b6b4afe8a411a3bb83320c9", + "value": " 171M/171M [00:00<00:00, 331MB/s]" + } + }, + "486282a4ead148868005c592d74a4ed4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "48643ea67f2f4762bcd27de1d4cf0fd2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "48f51c96b7574946bf9542633eb39135": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": 
"1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "490be826ebc14d5ab4e9f0e10ec79d5f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": 
null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "491da86a94c44b7c87366b4bb72a3bd1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_490be826ebc14d5ab4e9f0e10ec79d5f", + "max": 7, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_64ea3c5cc12841f59d1acba12deb6a88", + "value": 7 + } + }, + "49bb475a11104e9496f2623e3d5caebd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4a4a50a17c014f189b52119901104d79": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4ab21d7b956e46348ad6f6542fd92c2d": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4abac6a83641414499f9b6b1514d1695": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4cae160456c74a5fa761f026d64b2e35": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4d327c9e91b34c7b84cedd8f9660e9fd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4d54c07dd8e94557b82521772c1a2825": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4e77d8b6cdb94fc68974d27b24cfb3fd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6a8a634cd3844fa081a4622d224ec940", + "max": 246144152, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2460b9e05b58481e898b139b94532c14", + "value": 246144152 + } + }, + "4f56214f69034077bafdfdabc1c2aebf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4f6e7b21ba0747f19b9d63468861c988": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4fa4f545258b4d6ca014a4c84ec4b24a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_993e5303f107468f83dbf51026e64301", + "IPY_MODEL_1e6c1c848e364ce4841ddcaa1383cfba", + "IPY_MODEL_d432098a941c463599648b156abea24b" + ], + "layout": "IPY_MODEL_917139ad07a64e0cae89f2beeffae956" + } + }, + "4fbaa1bd51bf4b1e90337a435604d2bd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5161248cd0384d5887ed231ecd48c82e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_097fa44254ea4ecda7c8db995f370afc", + "placeholder": "​", + "style": "IPY_MODEL_9fb982789fbc4582bceb356e351db438", + "value": "unet/config.json: 100%" + } + }, + "5269d405725946b3a657d3a8d7b25885": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "52c7e22284b0468c8bc0c3b1cad047fb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": 
null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5405d8a1aa1d411b895b0523fb8e4ce7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "54417d8b9c5249d0a028d9e831dd8be6": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "54ab158c643540abb8b3a96c1ae3ecab": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_55130e37444844f88286a87c8d153eba", + "IPY_MODEL_7e3048e0fac94dfdaca1bc862ecfce15", + "IPY_MODEL_643ba607008547ba9572fa4880a7d0f2" + ], + "layout": "IPY_MODEL_5738e5d103254c5485856f97d82954ab" + } + }, + "550b3ad10fcb422eb66f71ad95988616": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": 
null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "55130e37444844f88286a87c8d153eba": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_beedc3b275c24432a2e959dcf9dd418d", + "placeholder": "​", + "style": "IPY_MODEL_10690b309908402289e9891203714199", + "value": "scheduler/scheduler_config.json: 100%" + } + }, + "55177afb435f44898df300143703e4d8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7049676db714446b98bba16b5f1b049e", + "placeholder": "​", + "style": "IPY_MODEL_09172ff4be4e433483d27b576464d1df", + "value": "tokenizer/tokenizer_config.json: 100%" + } + }, + "556730e12d5d4e0ea51a0dd1b5aac331": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2f016774e7854ef589442734d0bb2f08", + "placeholder": "​", + "style": "IPY_MODEL_a15c3a32bfd347a98c2a50d27cd5b9f9", + "value": "diffusion_pytorch_model.fp16.safetensors: 100%" + } + }, + "5738e5d103254c5485856f97d82954ab": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "577cc4b4f27941189c62951046db24ec": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "57b491647f1e49cea7ce34774a963936": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c040aa1f65514be28f0ca8ecdd1f69e4", + "max": 170543052, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_eec76868d92f45d7a6db2e232a45e0c2", + "value": 170543052 + } + }, + "584dd148b2344bdc92f1d0850399aed7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_67ef71b6521b47dd90e3dc0fd03016f2", + "max": 5135149760, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_989c54778eb7469fb91e4337d6f49b0b", + "value": 5135149760 + } + }, + "587bd0b90afa450ba82e49cf86ee135d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cf42fc299989442f94f4a9df63005ab4", + "placeholder": "​", + "style": "IPY_MODEL_90b9300ba00e41e58170ec5634622985", + "value": " 725/725 [00:00<00:00, 55.6kB/s]" + } + }, + "598fa824f517445394d08c37393f9f3d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "5a4c6dc09a1049c0adfca9834b045a25": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fd13a58d6b444a0f955832647f64df12", + "max": 472, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8d492d53eb7f4225b516a65ef80f24e5", + "value": 472 + } + }, + "5a7d87338ddc45df84080e0096a31631": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + 
"5ab5097f19cf4474945f96741c444d71": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6d8fdd0303774305ae20dd39e2a1706c", + "IPY_MODEL_031a326124f1496abe1f3bf8de720029", + "IPY_MODEL_b9feee6f48bd49209e72e5c5e3136f67" + ], + "layout": "IPY_MODEL_a2d497a2ddb04d8fac8a4c0f8aa7f5dc" + } + }, + "5b3bbc663d504fb99318ba186e6b8499": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5c3d142907404cef8fd624839a166530": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_abaecfe7f39a43bb8fbd655d1a3009f4", + "IPY_MODEL_d488374da5e74ec3a5973003590a7d69", + "IPY_MODEL_5d8ae9661dce4a1aac8d418d84f3a209" + ], + "layout": "IPY_MODEL_32132ecbe71a4c2c903932513c2a1aa0" + } + }, + "5c838962c7854541b61877c6d42481c4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5ceb5c369f974b84ba850c9e81730a0e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "5cf6cd09cda64d688f4a0f7c511533d0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5d8ae9661dce4a1aac8d418d84f3a209": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_887c5eae5b154eccb4a1caa3deef6e94", + "placeholder": "​", + "style": "IPY_MODEL_e87dffe17f1948e9ba794eddb605a908", + "value": " 47.3k/47.3k [00:00<00:00, 3.39MB/s]" + } + }, + "5dd09616ba6544c499271054e8d8d2c5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, 
+ "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5e030e8a026b4513aa954203169f0a27": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5e677518051c43768dbf06243701c817": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d727913663634e368ade4a7dc64fe74b", + "IPY_MODEL_eb73095c804a4272856fe348fa3cb1e9", + "IPY_MODEL_7e9b46b10fa24dfea489dfbc150d2a2e" + ], + "layout": "IPY_MODEL_c8156b0cc68e4b3693dcabc530a4ea9a" + } + }, + "5f52ae61812544f29f774f80fcb7a09c": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7d29b20296aa4e33837b8ad53fc4adba", + "placeholder": "​", + "style": "IPY_MODEL_6a94d1d176b844db96de0c0e3cd67701", + "value": " 642/642 [00:00<00:00, 35.8kB/s]" + } + }, + "60cc7415644d4e16b88f8fc5896b4b3b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "60e8e9f8e6ce40dd910ce1a9410b5e24": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + 
"max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "61db57127c5845759360bdf8b29dac2d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f46df85f441e4ada831f0e2b142f296a", + "placeholder": "​", + "style": "IPY_MODEL_d4f1dbe4ce244abc987b1089876e080f", + "value": "tokenizer_2/tokenizer_config.json: 100%" + } + }, + "62afb52a01924566b52f6c2d9ffb76f4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "64029a1e70e040b49e38d38bd36823fd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3e1ec3a51e9b4fbbab489d34640cda90", + 
"IPY_MODEL_ea0910fc31e44597968b2129272cc94d", + "IPY_MODEL_3a0e9adc345f409cbcd79d1bd19219e6" + ], + "layout": "IPY_MODEL_3fe7e7f00ca746cc8cc762da6f365fde" + } + }, + "643ba607008547ba9572fa4880a7d0f2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2ef25bd6dd644347baac12366d7002fd", + "placeholder": "​", + "style": "IPY_MODEL_0918b23f41e0404e82fccd08cadc6ccf", + "value": " 479/479 [00:00<00:00, 40.5kB/s]" + } + }, + "6466eff2786241eeb142f17758894bb2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": 
null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "64ea3c5cc12841f59d1acba12deb6a88": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "65b15618abfd4e6f9fafd64813e86ace": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1e8747251a1f4cca970857911d1c4a98", + "IPY_MODEL_caa46820018f47abab4a962afe51cc34", + "IPY_MODEL_22a2ea45880f4b0da47f1b213882dcb0" + ], + "layout": "IPY_MODEL_0cf822f588244e54b5264176f9611164" + } + }, + "65bf3df199b44763aa223fee96889e17": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a32666723b0e4b9883a78f56295c4356", + "IPY_MODEL_c08373a044204308ac882dd8cf9cdd3e", + "IPY_MODEL_fa7876ade8e240fc89a35a1f8c7c7d3c" + ], + "layout": "IPY_MODEL_de583920d3b54774a486aef4c052e50d" + } + }, + "66da596ae59d474f8a82a600174adff6": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6703a0417c474db5bf261fe8679051e9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6715d91c6b62426aa49c344b65bcd8a2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bd176c410a4e48a382a1b688e77aafcb", + "max": 7, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6b586ad7a3054c83b14243d69484127f", + "value": 7 + } + }, + "67567eea233b423c8acd62773a4adb30": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5161248cd0384d5887ed231ecd48c82e", + "IPY_MODEL_6f72ea2c284e4e40899375c7b07c517f", + "IPY_MODEL_d99a364420454ba5bfd510d1226b94af" + ], + "layout": "IPY_MODEL_d33c90e8ea0945d397659f6a90cf51b6" + } + }, + "67a03631ebc54f99928f0feb18ab38af": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "67b7783351d340d99c44149635b9be84": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_4cae160456c74a5fa761f026d64b2e35", + "placeholder": "​", + "style": "IPY_MODEL_819a26acd18f443882feb129f2c576d6", + "value": "tokenizer/vocab.json: 100%" + } + }, + "67ef71b6521b47dd90e3dc0fd03016f2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "683863a313034025ab99fab5810f39c7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "683db0ff105a47038c350cfc74b88345": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "685797f8907c47ffab4fe7a81ca22e63": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_222da37b5af14d60814c65cdd1ea20be", + "placeholder": "​", + "style": "IPY_MODEL_6a2b5bbd4afe4e0cabe39e95ccf528be", + "value": "README.md: 100%" + } + }, + "68822f8f861a48859df630fad63f51c1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, 
+ "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6926c8dd4e4e46d089bb387333691df7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_adf53d97af214cceaae895c8abfbd909", + "max": 170461008, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9d74948f7bcb498c9295e41e690a0a8d", + "value": 170461008 + } + }, + "6a2b5bbd4afe4e0cabe39e95ccf528be": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6a8a634cd3844fa081a4622d224ec940": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + 
"border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6a94d1d176b844db96de0c0e3cd67701": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6b586ad7a3054c83b14243d69484127f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "6bdb9b0b68c24b84a748c18ed927a8d3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6d0f946764444df28cb0da0fd0a408eb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f9ca5d4810b34938b6f997ff66a8d541", + "max": 575, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_b418ff1733db4efbab1b00b632b894e2", + "value": 575 + } + }, + "6d492ddabcbe4d65b5d311834865ab92": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ee8407365f5d42d9b98536152c9efe92", + "max": 524619, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_5ceb5c369f974b84ba850c9e81730a0e", + "value": 524619 + } + }, + "6d8fdd0303774305ae20dd39e2a1706c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7bb3f4cda33947138c61aac74d952289", + "placeholder": "​", + "style": "IPY_MODEL_d3b73a841e68425994632d0f05cf4f16", + "value": "100%" + } + }, + "6f113303101b4f448380a878d8900bf6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "6f6a6cfd50404f1ea09f83e95b04550a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, 
+ "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6f72ea2c284e4e40899375c7b07c517f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0204153b17ca4bb0b21fc033393ce9bd", + "max": 1680, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f02041b1d5e1485bb2ba02b00fc2c242", + "value": 1680 + } + }, + "7049676db714446b98bba16b5f1b049e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + 
"align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "70849993c0c94ecf87c56e430f06181d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "713dec1904ce46f6b2d5a9b7e3e0373a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "71ad1e3dbe44437d8df985cfae207dcd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": 
null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "734e9a834ff74b64b17453013208a116": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "75384779c5d540d28afb53a0e318b674": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7f63b3d80ecb4c3eb837bd0e616e1623", + "placeholder": "​", + "style": "IPY_MODEL_4f6e7b21ba0747f19b9d63468861c988", + "value": " 167M/167M [00:07<00:00, 24.0MB/s]" + } + }, + "753f64d9069640d985f399a058fc9b5b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "766567be45eb4db6b8d7a3364566c1dc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3c222a56d4404863a1dac60f1d03835b", + "IPY_MODEL_db862ffbb44d450db514173df4c7f301", + "IPY_MODEL_ddbeb13bb8174fc0b7d5543108d1c4f5" + ], + "layout": "IPY_MODEL_5b3bbc663d504fb99318ba186e6b8499" + } + }, + "76a474e4aab14a5987cf25d1391d578c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_68822f8f861a48859df630fad63f51c1", + "placeholder": "​", + "style": "IPY_MODEL_2671327d35e64f0da670a3a611fd0886", + "value": " 1.39G/1.39G [01:02<00:00, 24.3MB/s]" + } + }, + "777c5583820248838f0c39e82362d9e3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_734e9a834ff74b64b17453013208a116", + "placeholder": "​", + "style": "IPY_MODEL_9784d9210d6e4214b78ab5f8c33e8044", + "value": "Loading pipeline components...: 100%" + } + }, + "7a52c380a91c4f49bbfde658550248d3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7ba112dcece64386bdbac6837004891b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + 
"max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7bb3f4cda33947138c61aac74d952289": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7c6c4dff0a814bc6a7ac677980b45add": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + 
"align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7d282fda276343c3aff99b001253ecc1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "7d29b20296aa4e33837b8ad53fc4adba": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7e3048e0fac94dfdaca1bc862ecfce15": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0c68c6c65fb94650b15069a25d9e1699", + "max": 479, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ad233dfd52034b9e8c6c2c5b86995717", + 
"value": 479 + } + }, + "7e9b46b10fa24dfea489dfbc150d2a2e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2db944a049a04426bba181fddb2801b1", + "placeholder": "​", + "style": "IPY_MODEL_683863a313034025ab99fab5810f39c7", + "value": " 171M/171M [00:00<00:00, 302MB/s]" + } + }, + "7f63b3d80ecb4c3eb837bd0e616e1623": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "804e7ee768794bba88aec3137f418868": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7a52c380a91c4f49bbfde658550248d3", + "placeholder": "​", + "style": "IPY_MODEL_3e418e46c47841ac9d717a6981807f68", + "value": " 565/565 [00:00<00:00, 16.6kB/s]" + } + }, + "819a26acd18f443882feb129f2c576d6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "83afce4becac4f37ba916bf1901346ff": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + 
"left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "83f2d1dfaba54da38f0421b69930c3c1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "85f22e78d47641d19efb6e6d62f6a014": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + 
"_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d8bbb7402f3e44b2899fc98f02cee87e", + "IPY_MODEL_8b41763280a048d485f06682ddc12ca2", + "IPY_MODEL_05791b70a24a42138755141602399c47" + ], + "layout": "IPY_MODEL_753f64d9069640d985f399a058fc9b5b" + } + }, + "86cb336c1f684867bd13dae0370b4d36": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8840288388ae4162884feef9c8e776f7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_67b7783351d340d99c44149635b9be84", + "IPY_MODEL_0f60c0f123954744ad13b670ca6dce77", + "IPY_MODEL_dfabe7aa70024d1aa868ef5e6650dc6d" + ], + "layout": "IPY_MODEL_7d282fda276343c3aff99b001253ecc1" + } + }, + "887c5eae5b154eccb4a1caa3deef6e94": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": 
null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "89ed71090d5c4366b21d25ad102e7da7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8a8ef60b3b72452fb9ccc31052ab3b4f": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8b41763280a048d485f06682ddc12ca2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9c6ab017f5fb46eda8e3c22e9dd2b838", + "max": 17, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_10bc970e474a46249e2a5e9e43fec7be", + "value": 17 + } + }, + "8b6464ce614c4aa29ac66ecce29b6cbf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8b88a1a63cf242de8a68962f50498c72": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_777c5583820248838f0c39e82362d9e3", + "IPY_MODEL_491da86a94c44b7c87366b4bb72a3bd1", + "IPY_MODEL_31c3a375a2964a06a2473925fe9b197b" + ], + "layout": "IPY_MODEL_0ba0a0fca31c482bb628a6739d341601" + } + }, + "8d492d53eb7f4225b516a65ef80f24e5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8e3b0b9f26a34ab8a932756b32834fd0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "90558769855c4bb08ff2fe4af940c45b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b79d32186443469d94836b663bf156b5", + "max": 167335342, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_86cb336c1f684867bd13dae0370b4d36", + "value": 167335342 + } + }, + "90b9300ba00e41e58170ec5634622985": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "917139ad07a64e0cae89f2beeffae956": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "955ec2551f8b400dbbdba68d7449de76": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "95e24bbc8397455fabb724ed3c330511": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b1d427082b9f452eb6546c7d55016b36", + "placeholder": "​", + "style": "IPY_MODEL_061fcc6e5f3c44c48cde212c1ba515e5", + "value": "diffusion_pytorch_model.fp16.safetensors: 100%" + } + }, + "971f59d5f3e04697b3ab3c39ec0fc667": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9784d9210d6e4214b78ab5f8c33e8044": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "979c68bd2e224a40b939c27f32c25dac": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": 
null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9810873713024e79ae6d338dfeae5876": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "989c54778eb7469fb91e4337d6f49b0b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", 
+ "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "993e5303f107468f83dbf51026e64301": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_83afce4becac4f37ba916bf1901346ff", + "placeholder": "​", + "style": "IPY_MODEL_071e40d45ad14fc19b1480927d15d2ae", + "value": "tokenizer_2/special_tokens_map.json: 100%" + } + }, + "99717def9b6b4afe8a411a3bb83320c9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9a37ffb9810f482b80f245f64947c371": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": 
null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9a9da0d0e3d84a19b5d188c9bd6a83bb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9ad4192f6f244264aede1b3c8ac3c57a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fdb799739700447d8a5198f1f4f9b17f", + "placeholder": "​", + "style": "IPY_MODEL_683db0ff105a47038c350cfc74b88345", + "value": "diffusion_pytorch_model.fp16.safetensors: 100%" + } + }, + "9c6ab017f5fb46eda8e3c22e9dd2b838": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9d74948f7bcb498c9295e41e690a0a8d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9e2e87c131a140a2a37dfdf483d27ced": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9fb982789fbc4582bceb356e351db438": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a0f8a1a3512443ac84799b95da70ca26": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a15c3a32bfd347a98c2a50d27cd5b9f9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a16016987b6145b69caaac6712d72835": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c82cf8cf90ed4e93bccffcf75881a56d", + "IPY_MODEL_5a4c6dc09a1049c0adfca9834b045a25", + "IPY_MODEL_c4d83c2b37504473afe63209a178b4cd" + ], + "layout": "IPY_MODEL_fae9d16daace412492b048b012b8d6dc" + } + }, + "a287f197e4fe4f908e3ee0a0e6cb35cf": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + 
"min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a2d497a2ddb04d8fac8a4c0f8aa7f5dc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a32666723b0e4b9883a78f56295c4356": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + 
"layout": "IPY_MODEL_979c68bd2e224a40b939c27f32c25dac", + "placeholder": "​", + "style": "IPY_MODEL_0027f3aa006d4276983691ad985ce91b", + "value": "Loading pipeline components...: 100%" + } + }, + "a3ce5829f8a640e79a19d737663b8474": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a48d3ad24e9744f7898d1f2c5a696ea2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": 
null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a6295c7e7630444c9b7425b884ad9707": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a65484354d254516936fcb425917a4b7": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a6bb8206de044c74a03d1a64c801e742": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a6f2ce8830734be4b8efe5ca0e14e990": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_de31002ed7fc475c915b4a29253108af", + "placeholder": "​", + "style": "IPY_MODEL_fbf3d268f30344b7864ce691d5bcb1f3", + "value": " 47.3k/47.3k [00:00<00:00, 3.74MB/s]" + } + }, + "a708db5805424449afa269b714ebb3b3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + 
"description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_24cbd9338ad94801aa11f4dcb2a867cb", + "placeholder": "​", + "style": "IPY_MODEL_c88fcf0d20154b1fb9b7e8a00116b5e6", + "value": "tokenizer/merges.txt: 100%" + } + }, + "a7767d5a440f4c819cdb87414f2187ff": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a91df5ceea8647d98f4d49701bb00969": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_5269d405725946b3a657d3a8d7b25885", + "placeholder": "​", + "style": "IPY_MODEL_b4a697d8335c435ab21219716a1da022", + "value": " 609/609 [00:00<00:00, 49.0kB/s]" + } + }, + "a9ac3b2188594c19885bcdb9a659ecde": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a708db5805424449afa269b714ebb3b3", + "IPY_MODEL_6d492ddabcbe4d65b5d311834865ab92", + "IPY_MODEL_e17e3253c16743a29f09b82d23c3b26d" + ], + "layout": "IPY_MODEL_e67a69c294334b01974f8bef36f133a0" + } + }, + "aae194e7f71f4eba80ffd18f91f083a8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9ad4192f6f244264aede1b3c8ac3c57a", + "IPY_MODEL_90558769855c4bb08ff2fe4af940c45b", + "IPY_MODEL_75384779c5d540d28afb53a0e318b674" + ], + "layout": "IPY_MODEL_a287f197e4fe4f908e3ee0a0e6cb35cf" + } + }, + "ab9c869439a94bddbbc0c6098f4c5b2a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", 
+ "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_13ac509e0d5f42e9bbb6deda62f77923", + "max": 725, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4a4a50a17c014f189b52119901104d79", + "value": 725 + } + }, + "abaecfe7f39a43bb8fbd655d1a3009f4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_05700817adac4fdda6c95dd00eaeae38", + "placeholder": "​", + "style": "IPY_MODEL_5a7d87338ddc45df84080e0096a31631", + "value": "pixel/adapter_config.json: 100%" + } + }, + "abf2248c725b4837b5c2babef7f4ff3e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d872c3900b8b4275b2224c9ec5e7d78f", + "placeholder": "​", + "style": "IPY_MODEL_4fbaa1bd51bf4b1e90337a435604d2bd", + "value": " 170M/170M [00:12<00:00, 18.9MB/s]" + } + }, + "ad233dfd52034b9e8c6c2c5b86995717": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "adb2daf9d62f49f8ab1f0144b717d41e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_bba38c266ceb4f30bb4bc1eaf5e3aa96", + "IPY_MODEL_6926c8dd4e4e46d089bb387333691df7", + "IPY_MODEL_abf2248c725b4837b5c2babef7f4ff3e" + ], + "layout": "IPY_MODEL_39e4bdf8d621451f984f6e7302fd6961" + } + }, + "adf53d97af214cceaae895c8abfbd909": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "aed53b6480de4dd4bc7463af04840952": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "aef81ec1a7e844f883beba8c5754a8af": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "af5003cf40ae4dfaa0660f247598856e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b0abce2d8a2046dba320e788e33e9d66": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b1d427082b9f452eb6546c7d55016b36": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + 
"display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b25e822b3f77431fb72b4780067c90d9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_55177afb435f44898df300143703e4d8", + "IPY_MODEL_b79d3f5bd8024451bc7148ba2a5029bd", + "IPY_MODEL_eacd646e2b984e60ab603bfc6d631de8" + ], + "layout": "IPY_MODEL_d158a80f4186411a8a9c335a8d5a888e" + } + }, + "b27ac2aaff694dd5999ab2cba91195da": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_556730e12d5d4e0ea51a0dd1b5aac331", + "IPY_MODEL_584dd148b2344bdc92f1d0850399aed7", + 
"IPY_MODEL_ccc5bdc185a84901994577ff7f1bc962" + ], + "layout": "IPY_MODEL_2c75632d8fdc4458814055172d1d72c3" + } + }, + "b27c74f0c7bf493fa8bcb4c5b9c9c100": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b28a2b59aab9478e9ac3ab24a1de6f9a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": 
null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b3eaa188cc2e48d081488eea4ed2971f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b418ff1733db4efbab1b00b632b894e2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "b44f7154c55146a3bf5f4bd9e438086f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + 
"description_width": "" + } + }, + "b4a697d8335c435ab21219716a1da022": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b600178b161d4a87a0b832a169d7caf2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b79d32186443469d94836b663bf156b5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b79d3f5bd8024451bc7148ba2a5029bd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ea257c1c73524141b87ab3c1ef85c908", + "max": 737, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_c474ed5e340146baae6c38f62013afe3", + "value": 737 + } + }, + "b999b3e3af3744a79c0c90657ef37d4e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", 
+ "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_713dec1904ce46f6b2d5a9b7e3e0373a", + "placeholder": "​", + "style": "IPY_MODEL_a6bb8206de044c74a03d1a64c801e742", + "value": "" + } + }, + "b9feee6f48bd49209e72e5c5e3136f67": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_daad0d12aff8470d990fbbbbe19d5891", + "placeholder": "​", + "style": "IPY_MODEL_16e18e872ab64b3f8bfb32f580c3371b", + "value": " 30/30 [00:03<00:00,  8.40it/s]" + } + }, + "ba9b002888b448738ba4b127e1046f5d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bab2458ba3f54b229828ad9c8706aba4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + 
"border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "babce0e85baf4279ae1d22d64006667f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": 
null + } + }, + "bba38c266ceb4f30bb4bc1eaf5e3aa96": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4abac6a83641414499f9b6b1514d1695", + "placeholder": "​", + "style": "IPY_MODEL_bf6e103b43844f17968eadf223a42acb", + "value": "adapter_model.safetensors: 100%" + } + }, + "bc07cdaad5b64fb3b0e1f8c214bba813": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bcb3ec98d25b4c138c5e8f84c1e937c6": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bd176c410a4e48a382a1b688e77aafcb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bd2740e191a74558a77a965b4f2d7f28": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_48f51c96b7574946bf9542633eb39135", + "placeholder": "​", + "style": "IPY_MODEL_35c810f4bfe741f091f172cede413950", + "value": " 0/0 [00:00<?, ?it/s]" + } + }, + "bdfe4e4109a14a41bcd2e1e4242d82bb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "beedc3b275c24432a2e959dcf9dd418d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bf6e103b43844f17968eadf223a42acb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c008577d922e436aabb8680ca0d13117": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c040aa1f65514be28f0ca8ecdd1f69e4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c05842ed12c848c68c4a69de9aa742a8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + 
"max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c08373a044204308ac882dd8cf9cdd3e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a0f8a1a3512443ac84799b95da70ca26", + "max": 7, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e08d739f59874064994212363a307f6e", + "value": 7 + } + }, + "c08fca70486040069d4f8f1df46a1074": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c181408b9b2b437ca11d584e3d1e94e7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c212598c5a8747f783a6efc18816e868": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c292a598350d4dd4bb9b70aab1320c29": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cdd8f9b1592842b48e1ffa80f8ec8246", + "placeholder": "​", + "style": "IPY_MODEL_ba9b002888b448738ba4b127e1046f5d", + "value": " 246M/246M [00:00<00:00, 345MB/s]" + } + }, + "c474ed5e340146baae6c38f62013afe3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c4d83c2b37504473afe63209a178b4cd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": 
"1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_70849993c0c94ecf87c56e430f06181d", + "placeholder": "​", + "style": "IPY_MODEL_e820c557697648378966ed0a073826c8", + "value": " 472/472 [00:00<00:00, 10.4kB/s]" + } + }, + "c68a5ad3bb664e8785e724085d208e96": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c6bdeef396174d51af9eee277752bec7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + 
"IPY_MODEL_685797f8907c47ffab4fe7a81ca22e63", + "IPY_MODEL_414f8301f76043758f69bbfb6960072d", + "IPY_MODEL_23c0492f021a4b60b1d84b0b82d15378" + ], + "layout": "IPY_MODEL_550b3ad10fcb422eb66f71ad95988616" + } + }, + "c7cf10df8d7944aeb93947d5f4156c92": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_de0b54a59d9f47408d92915ad746cd5e", + "placeholder": "​", + "style": "IPY_MODEL_eb9a5f255fa0447eba6a33e1c30ba166", + "value": " 167M/167M [00:08<00:00, 18.9MB/s]" + } + }, + "c80901426c87439481078b9da2e0c772": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_416f31db79fd431fb8dc06e994421b70", + "IPY_MODEL_1971fcd35a564c449b4f437dac46058f", + "IPY_MODEL_76a474e4aab14a5987cf25d1391d578c" + ], + "layout": "IPY_MODEL_ff3bf3f1873c4b01b0a547dfe02923ce" + } + }, + "c8156b0cc68e4b3693dcabc530a4ea9a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": 
null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c82cf8cf90ed4e93bccffcf75881a56d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_421b59f4021d4c4d930c51b6b4c7071f", + "placeholder": "​", + "style": "IPY_MODEL_60cc7415644d4e16b88f8fc5896b4b3b", + "value": "tokenizer/special_tokens_map.json: 100%" + } + }, + "c8555cb30ade40b9bd9a3435d6deeb67": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + 
"bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c88fcf0d20154b1fb9b7e8a00116b5e6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c8e40d44aeac47a78f6502771f1471b7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a48d3ad24e9744f7898d1f2c5a696ea2", + "placeholder": "​", + "style": "IPY_MODEL_f29673e57d174839a0bde70bfa165715", + "value": "adapter_model.safetensors: 100%" + } + }, + "c995197e66e04874a9f5d34db98b8890": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_83f2d1dfaba54da38f0421b69930c3c1", + "placeholder": "​", + "style": "IPY_MODEL_577cc4b4f27941189c62951046db24ec", + "value": " 575/575 [00:00<00:00, 11.1kB/s]" + } + }, + "c9f764b5036042af9f1505e3729cbc32": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_b999b3e3af3744a79c0c90657ef37d4e", + "IPY_MODEL_f3baef4fbf4b4ec08480522be921f841", + "IPY_MODEL_bd2740e191a74558a77a965b4f2d7f28" + ], + "layout": "IPY_MODEL_6f6a6cfd50404f1ea09f83e95b04550a" + } + }, + "ca5dc8342ef946a49d9e67a21f1a67c8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": 
null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "caa46820018f47abab4a962afe51cc34": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_aed53b6480de4dd4bc7463af04840952", + "max": 30, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_447d29db05384c55a57c5fb1bd121af4", + "value": 30 + } + }, + "cb89ecd5a8c14051985495da1797a202": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_61db57127c5845759360bdf8b29dac2d", + "IPY_MODEL_ab9c869439a94bddbbc0c6098f4c5b2a", + "IPY_MODEL_587bd0b90afa450ba82e49cf86ee135d" + ], + "layout": "IPY_MODEL_955ec2551f8b400dbbdba68d7449de76" + } + }, + "cb9a536bc56f4d0ebf285e7f73d4730e": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cc06ed7338b74ae6a1c563212aeb9f94": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ccc5bdc185a84901994577ff7f1bc962": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_439cf6c6f9e845cea4008e3219454ff4", + "placeholder": "​", + "style": "IPY_MODEL_460c3c96d1724713b78bddcfc1f3eb97", + "value": " 5.14G/5.14G [00:14<00:00, 393MB/s]" + } + }, + "cd08b99de03c483a965248ff4df752ba": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c05842ed12c848c68c4a69de9aa742a8", + "max": 7, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_18e3ba4a61784f04b3238cf273c90c3e", + "value": 7 + } + }, + "cdd8f9b1592842b48e1ffa80f8ec8246": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + 
"margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cf42fc299989442f94f4a9df63005ab4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cfc6ec59d45d42d187f42061902abbfb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", 
+ "bar_color": null, + "description_width": "" + } + }, + "d158a80f4186411a8a9c335a8d5a888e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d18c97fe685e4be080125ae770526255": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e44c018bc76c48cd8738fee5966767ce", + "IPY_MODEL_57b491647f1e49cea7ce34774a963936", + "IPY_MODEL_484f694f734b4a92a14ecc4d048db0af" + ], + "layout": "IPY_MODEL_bcb3ec98d25b4c138c5e8f84c1e937c6" + } + }, + 
"d1e4d2fd70e644f986104c998db7e53b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f786a0f386f6486083c15e576f6eb3e7", + "IPY_MODEL_6715d91c6b62426aa49c344b65bcd8a2", + "IPY_MODEL_0b0a85e1133c4ca3ba716e2403511703" + ], + "layout": "IPY_MODEL_017dc44dbe1c4de491e003a1e279a218" + } + }, + "d2cd2572973249baa8e130b5777f4147": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b27c74f0c7bf493fa8bcb4c5b9c9c100", + "placeholder": "​", + "style": "IPY_MODEL_09638c8da74f4dddaf1d5d94dd8ad885", + "value": "model.fp16.safetensors: 100%" + } + }, + "d2ed1988cf8c4bcdb1793b3d5068efed": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d33c90e8ea0945d397659f6a90cf51b6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d35b3848508c4b6390c243866649439d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d70508c304794bc79e80aab136eaf65a", + "max": 167335342, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_01b20535e40b47068723073ac6c819ee", + "value": 167335342 + } + }, + "d37c6ac25fd34a65bf307896443a5063": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_42a623abc5d84c0eb7de4e5323bb6546", + "placeholder": "​", + "style": "IPY_MODEL_a65484354d254516936fcb425917a4b7", + "value": " 7/7 [00:01<00:00,  6.68it/s]" + } + }, + "d3b73a841e68425994632d0f05cf4f16": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d432098a941c463599648b156abea24b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d62465d39d7e4265832901e9b9707993", + "placeholder": "​", + "style": "IPY_MODEL_4f56214f69034077bafdfdabc1c2aebf", + "value": " 460/460 [00:00<00:00, 33.3kB/s]" + } + }, + "d488374da5e74ec3a5973003590a7d69": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cb9a536bc56f4d0ebf285e7f73d4730e", + "max": 47271, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_49bb475a11104e9496f2623e3d5caebd", + "value": 47271 + } + }, + "d4baf7891f854c9c9898314633f97356": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d4f1dbe4ce244abc987b1089876e080f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d62465d39d7e4265832901e9b9707993": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d70508c304794bc79e80aab136eaf65a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + 
"grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d727913663634e368ade4a7dc64fe74b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d987907f09084d44b452f939aadff65e", + "placeholder": "​", + "style": "IPY_MODEL_e976b994189343f5ba7d762ef92c79e2", + "value": "toy_face_sdxl.safetensors: 100%" + } + }, + "d74b1c667d42472b865ee1cbefc33a60": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1074a274530e46c5b5a3e43653da43d0", + "IPY_MODEL_dfde3984062442869bc8091bb94f2c36", + "IPY_MODEL_a6f2ce8830734be4b8efe5ca0e14e990" + ], + "layout": "IPY_MODEL_07da49088e8d480fad03a2e828357872" + } + }, + 
"d872c3900b8b4275b2224c9ec5e7d78f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d8bbb7402f3e44b2899fc98f02cee87e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_19679186751b42e3ad2c44ea46c82a9c", + "placeholder": "​", + "style": "IPY_MODEL_d4baf7891f854c9c9898314633f97356", + "value": "Fetching 17 files: 100%" + } + }, + "d987907f09084d44b452f939aadff65e": { + "model_module": "@jupyter-widgets/base", + 
"model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d99a364420454ba5bfd510d1226b94af": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4d327c9e91b34c7b84cedd8f9660e9fd", + "placeholder": "​", + "style": "IPY_MODEL_b44f7154c55146a3bf5f4bd9e438086f", + "value": " 1.68k/1.68k [00:00<00:00, 126kB/s]" + } + }, + "daad0d12aff8470d990fbbbbe19d5891": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": 
{ + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "db862ffbb44d450db514173df4c7f301": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fe958df746be4dc1871bd58628697c3c", + "max": 1059962, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_54417d8b9c5249d0a028d9e831dd8be6", + "value": 1059962 + } + }, + "db8fd6b2687c449fa0600d3e87c96999": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + 
"_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_355efc45ddaf42498d72d6134a28c87b", + "placeholder": "​", + "style": "IPY_MODEL_8b6464ce614c4aa29ac66ecce29b6cbf", + "value": "adapter_model.safetensors: 100%" + } + }, + "dc4391fe30694a788134fffdd2a23d1a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dd42a2ff90854b74ba5fde1de26b4e15": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dd9666d76af04b72b08f59023eb04ee3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ddbeb13bb8174fc0b7d5543108d1c4f5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_9a37ffb9810f482b80f245f64947c371", + "placeholder": "​", + "style": "IPY_MODEL_bdfe4e4109a14a41bcd2e1e4242d82bb", + "value": " 1.06M/1.06M [00:00<00:00, 42.3MB/s]" + } + }, + "de0b54a59d9f47408d92915ad746cd5e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "de31002ed7fc475c915b4a29253108af": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + 
"grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "de583920d3b54774a486aef4c052e50d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dfabe7aa70024d1aa868ef5e6650dc6d": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_60e8e9f8e6ce40dd910ce1a9410b5e24", + "placeholder": "​", + "style": "IPY_MODEL_6703a0417c474db5bf261fe8679051e9", + "value": " 1.06M/1.06M [00:00<00:00, 1.49MB/s]" + } + }, + "dfde3984062442869bc8091bb94f2c36": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7c6c4dff0a814bc6a7ac677980b45add", + "max": 47271, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9a9da0d0e3d84a19b5d188c9bd6a83bb", + "value": 47271 + } + }, + "e08d739f59874064994212363a307f6e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e15aab8dd01f4d5582db80e6ad9931fc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e17e3253c16743a29f09b82d23c3b26d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a7767d5a440f4c819cdb87414f2187ff", + "placeholder": "​", + "style": "IPY_MODEL_f0bc6b14a299445ca705b888b3047064", + "value": " 525k/525k [00:00<00:00, 9.01MB/s]" + } + }, + "e3a1a5e9f29d4d28b0b9496493dafa21": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": 
null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e44c018bc76c48cd8738fee5966767ce": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b0abce2d8a2046dba320e788e33e9d66", + "placeholder": "​", + "style": "IPY_MODEL_aef81ec1a7e844f883beba8c5754a8af", + "value": "pixel-art-xl.safetensors: 100%" + } + }, + "e57d317b3dda43bba13ecd4514f776d3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3a017f1d0ebf4a4aab57ac3eb0788774", + "placeholder": "​", + "style": "IPY_MODEL_48643ea67f2f4762bcd27de1d4cf0fd2", + "value": " 170M/170M [00:07<00:00, 24.6MB/s]" + } + }, + "e67a69c294334b01974f8bef36f133a0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e820c557697648378966ed0a073826c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e87dffe17f1948e9ba794eddb605a908": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e976b994189343f5ba7d762ef92c79e2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ea0910fc31e44597968b2129272cc94d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0371aa5607604c06a868deb2a413cb31", + "max": 30, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_c212598c5a8747f783a6efc18816e868", + "value": 30 + } + }, + "ea257c1c73524141b87ab3c1ef85c908": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "eacd646e2b984e60ab603bfc6d631de8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2ece04bc10934b3cb9d383abfc5ccd6e", + "placeholder": "​", + "style": "IPY_MODEL_67a03631ebc54f99928f0feb18ab38af", + "value": " 737/737 [00:00<00:00, 41.5kB/s]" + } + }, + "eb73095c804a4272856fe348fa3cb1e9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9810873713024e79ae6d338dfeae5876", + "max": 170543292, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2e18ce21c01a4a3ca992622957e7d297", + "value": 170543292 + } + }, + "eb9a5f255fa0447eba6a33e1c30ba166": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ee8407365f5d42d9b98536152c9efe92": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "eec76868d92f45d7a6db2e232a45e0c2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "efd33121bdfc4ce195a07c9ef523a477": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d2cd2572973249baa8e130b5777f4147", + "IPY_MODEL_4e77d8b6cdb94fc68974d27b24cfb3fd", + "IPY_MODEL_c292a598350d4dd4bb9b70aab1320c29" + ], + "layout": "IPY_MODEL_c68a5ad3bb664e8785e724085d208e96" + } + }, + "f02041b1d5e1485bb2ba02b00fc2c242": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f0bc6b14a299445ca705b888b3047064": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f2596717405c40e1a39b721386e7a972": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, 
+ "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f29673e57d174839a0bde70bfa165715": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f2c67c29e1224df3b2def5a87eb8d368": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } + }, + "f3baef4fbf4b4ec08480522be921f841": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f2c67c29e1224df3b2def5a87eb8d368", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_486282a4ead148868005c592d74a4ed4", + "value": 0 + } + }, + "f46df85f441e4ada831f0e2b142f296a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": 
null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f668dd13af6f41d8be358f7db5261c54": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f786a0f386f6486083c15e576f6eb3e7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e3a1a5e9f29d4d28b0b9496493dafa21", + "placeholder": "​", + "style": "IPY_MODEL_c008577d922e436aabb8680ca0d13117", + "value": "Loading pipeline components...: 100%" + } + }, + "f8e6babf4fdd4c8e80d6dd24ff22d464": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f9ca5d4810b34938b6f997ff66a8d541": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fa7876ade8e240fc89a35a1f8c7c7d3c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_52c7e22284b0468c8bc0c3b1cad047fb", + "placeholder": "​", + "style": "IPY_MODEL_17c29bbcbc0c437c9c8bc83e0b085f1a", + "value": " 7/7 [00:01<00:00,  6.49it/s]" + } + }, + "faddc146c69545cdaeb81edc8a0cda70": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_b600178b161d4a87a0b832a169d7caf2", + "placeholder": "​", + "style": "IPY_MODEL_b3eaa188cc2e48d081488eea4ed2971f", + "value": "Loading pipeline components...: 100%" + } + }, + "fae9d16daace412492b048b012b8d6dc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fbf3d268f30344b7864ce691d5bcb1f3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fc1391aaeaad4eecad967e800a669ec1": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fd0ff16b68b2488d8c31ebe700dee9c9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0e49d820da754da785bec2e5940eb9f6", + "IPY_MODEL_0b53b908088648e3b2beadaeba0f5da1", + "IPY_MODEL_804e7ee768794bba88aec3137f418868" + ], + "layout": "IPY_MODEL_0d786d8386ba49d0b53a5452d52e722d" + } + }, + "fd13a58d6b444a0f955832647f64df12": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + 
"model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fd672cd5ba0c4695be4240707dcf4bf3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fdb1fd279a0241429e5721ae2e92d217": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "StyleView", + "description_width": "" + } + }, + "fdb799739700447d8a5198f1f4f9b17f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fe19dcea6d9a44d28f077e065f1671c4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + 
"grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fe958df746be4dc1871bd58628697c3c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ff3bf3f1873c4b01b0a547dfe02923ce": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", 
+ "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/peft/examples/multilayer_perceptron/README.md b/peft/examples/multilayer_perceptron/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fa3b05e2cb5487791ddbea5a3945ef6a0923c04e --- /dev/null +++ b/peft/examples/multilayer_perceptron/README.md @@ -0,0 +1,5 @@ +# Fine-tuning a multilayer perceptron using LoRA and 🤗 PEFT + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/peft/blob/main/examples/multilayer_perceptron/multilayer_perceptron_lora.ipynb) + +PEFT supports fine-tuning any type of model as long as the layers being used are supported. The model does not have to be a transformers model, for instance. 
To demonstrate this, the accompanying notebook `multilayer_perceptron_lora.ipynb` shows how to apply LoRA to a simple multilayer perceptron and use it to train a model to perform a classification task. diff --git a/peft/examples/multilayer_perceptron/multilayer_perceptron_lora.ipynb b/peft/examples/multilayer_perceptron/multilayer_perceptron_lora.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..d5ce302e0754a5cf283471b299ec563a05a8a086 --- /dev/null +++ b/peft/examples/multilayer_perceptron/multilayer_perceptron_lora.ipynb @@ -0,0 +1,752 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8e8743c8", + "metadata": {}, + "source": [ + "# Using PEFT with custom models" + ] + }, + { + "cell_type": "markdown", + "id": "c42c67e1", + "metadata": {}, + "source": [ + "`peft` allows us to fine-tune models efficiently with LoRA. In this short notebook, we will demonstrate how to train a simple multilayer perceptron (MLP) using `peft`." + ] + }, + { + "cell_type": "markdown", + "id": "ce314af5", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "markdown", + "id": "b28b214d", + "metadata": {}, + "source": [ + "Make sure that you have the latest version of `peft` installed. To ensure that, run this in your Python environment:\n", + " \n", + " python -m pip install --upgrade peft" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4d9da3d9", + "metadata": {}, + "outputs": [], + "source": [ + "import copy\n", + "import os\n", + "\n", + "# ignore bnb warnings\n", + "os.environ[\"BITSANDBYTES_NOWELCOME\"] = \"1\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "44075f54", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import peft\n", + "import torch\n", + "from torch import nn\n", + "import torch.nn.functional as F" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f72acdfb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.manual_seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "2b127a78", + "metadata": {}, + "source": [ + "## Data" + ] + }, + { + "cell_type": "markdown", + "id": "f265da76", + "metadata": {}, + "source": [ + "We will create a toy dataset consisting of random data for a classification task. There is a little bit of signal in the data, so we should expect that the loss of the model can improve during training." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b355567e", + "metadata": {}, + "outputs": [], + "source": [ + "X = torch.rand((1000, 20))\n", + "y = (X.sum(1) > 10).long()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a60a869d", + "metadata": {}, + "outputs": [], + "source": [ + "n_train = 800\n", + "batch_size = 64" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8859572e", + "metadata": {}, + "outputs": [], + "source": [ + "train_dataloader = torch.utils.data.DataLoader(\n", + " torch.utils.data.TensorDataset(X[:n_train], y[:n_train]),\n", + " batch_size=batch_size,\n", + " shuffle=True,\n", + ")\n", + "eval_dataloader = torch.utils.data.DataLoader(\n", + " torch.utils.data.TensorDataset(X[n_train:], y[n_train:]),\n", + " batch_size=batch_size,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "97bddd2c", + "metadata": {}, + "source": [ + "## Model" + ] + }, + { + "cell_type": "markdown", + "id": "db694a58", + "metadata": {}, + "source": [ + "As a model, we use a 
simple multilayer perceptron (MLP). For demonstration purposes, we use a very large number of hidden units. This is totally overkill for this task but it helps to demonstrate the advantages of `peft`. In more realistic settings, models will also be quite large on average, so this is not far-fetched." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1b43cd8f", + "metadata": {}, + "outputs": [], + "source": [ + "class MLP(nn.Module):\n", + " def __init__(self, num_units_hidden=2000):\n", + " super().__init__()\n", + " self.seq = nn.Sequential(\n", + " nn.Linear(20, num_units_hidden),\n", + " nn.ReLU(),\n", + " nn.Linear(num_units_hidden, num_units_hidden),\n", + " nn.ReLU(),\n", + " nn.Linear(num_units_hidden, 2),\n", + " nn.LogSoftmax(dim=-1),\n", + " )\n", + "\n", + " def forward(self, X):\n", + " return self.seq(X)" + ] + }, + { + "cell_type": "markdown", + "id": "1277bf00", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "markdown", + "id": "02caf26a", + "metadata": {}, + "source": [ + "Here are just a few training hyper-parameters and a simple function that performs the training and evaluation loop." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5d14c0c4", + "metadata": {}, + "outputs": [], + "source": [ + "lr = 0.002\n", + "batch_size = 64\n", + "max_epochs = 30\n", + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "657d6b3e", + "metadata": {}, + "outputs": [], + "source": [ + "def train(model, optimizer, criterion, train_dataloader, eval_dataloader, epochs):\n", + " for epoch in range(epochs):\n", + " model.train()\n", + " train_loss = 0\n", + " for xb, yb in train_dataloader:\n", + " xb = xb.to(device)\n", + " yb = yb.to(device)\n", + " outputs = model(xb)\n", + " loss = criterion(outputs, yb)\n", + " train_loss += loss.detach().float()\n", + " loss.backward()\n", + " optimizer.step()\n", + " optimizer.zero_grad()\n", + "\n", + " model.eval()\n", + " eval_loss = 0\n", + " for xb, yb in eval_dataloader:\n", + " xb = xb.to(device)\n", + " yb = yb.to(device)\n", + " with torch.no_grad():\n", + " outputs = model(xb)\n", + " loss = criterion(outputs, yb)\n", + " eval_loss += loss.detach().float()\n", + "\n", + " eval_loss_total = (eval_loss / len(eval_dataloader)).item()\n", + " train_loss_total = (train_loss / len(train_dataloader)).item()\n", + " print(f\"{epoch=:<2} {train_loss_total=:.4f} {eval_loss_total=:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "b382dcbe", + "metadata": {}, + "source": [ + "### Training without peft" + ] + }, + { + "cell_type": "markdown", + "id": "b40d4873", + "metadata": {}, + "source": [ + "Let's start without using `peft` to see what we can expect from the model training." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f059ced4", + "metadata": {}, + "outputs": [], + "source": [ + "module = MLP().to(device)\n", + "optimizer = torch.optim.Adam(module.parameters(), lr=lr)\n", + "criterion = nn.CrossEntropyLoss()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "17698863", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=0 train_loss_total=0.7970 eval_loss_total=0.6472\n", + "epoch=1 train_loss_total=0.5597 eval_loss_total=0.4898\n", + "epoch=2 train_loss_total=0.3696 eval_loss_total=0.3323\n", + "epoch=3 train_loss_total=0.2364 eval_loss_total=0.5454\n", + "epoch=4 train_loss_total=0.2428 eval_loss_total=0.2843\n", + "epoch=5 train_loss_total=0.1251 eval_loss_total=0.2514\n", + "epoch=6 train_loss_total=0.0952 eval_loss_total=0.2068\n", + "epoch=7 train_loss_total=0.0831 eval_loss_total=0.2395\n", + "epoch=8 train_loss_total=0.0655 eval_loss_total=0.2524\n", + "epoch=9 train_loss_total=0.0380 eval_loss_total=0.3650\n", + "epoch=10 train_loss_total=0.0363 eval_loss_total=0.3495\n", + "epoch=11 train_loss_total=0.0231 eval_loss_total=0.2360\n", + "epoch=12 train_loss_total=0.0162 eval_loss_total=0.2276\n", + "epoch=13 train_loss_total=0.0094 eval_loss_total=0.2716\n", + "epoch=14 train_loss_total=0.0065 eval_loss_total=0.2237\n", + "epoch=15 train_loss_total=0.0054 eval_loss_total=0.2366\n", + "epoch=16 train_loss_total=0.0035 eval_loss_total=0.2673\n", + "epoch=17 train_loss_total=0.0028 eval_loss_total=0.2630\n", + "epoch=18 train_loss_total=0.0023 eval_loss_total=0.2835\n", + "epoch=19 train_loss_total=0.0021 eval_loss_total=0.2727\n", + "epoch=20 train_loss_total=0.0018 eval_loss_total=0.2597\n", + "epoch=21 train_loss_total=0.0016 eval_loss_total=0.2553\n", + "epoch=22 train_loss_total=0.0014 eval_loss_total=0.2712\n", + "epoch=23 train_loss_total=0.0013 eval_loss_total=0.2637\n", + "epoch=24 train_loss_total=0.0012 
eval_loss_total=0.2733\n", + "epoch=25 train_loss_total=0.0011 eval_loss_total=0.2738\n", + "epoch=26 train_loss_total=0.0010 eval_loss_total=0.2477\n", + "epoch=27 train_loss_total=0.0010 eval_loss_total=0.2584\n", + "epoch=28 train_loss_total=0.0009 eval_loss_total=0.2844\n", + "epoch=29 train_loss_total=0.0008 eval_loss_total=0.2633\n", + "CPU times: user 1.31 s, sys: 236 ms, total: 1.54 s\n", + "Wall time: 1.56 s\n" + ] + } + ], + "source": [ + "%time train(module, optimizer, criterion, train_dataloader, eval_dataloader, epochs=max_epochs)" + ] + }, + { + "cell_type": "markdown", + "id": "4cef0029", + "metadata": {}, + "source": [ + "Okay, so we got an eval loss of ~0.26, which is much better than random." + ] + }, + { + "cell_type": "markdown", + "id": "4f106078", + "metadata": {}, + "source": [ + "### Training with peft" + ] + }, + { + "cell_type": "markdown", + "id": "8dd47aa4", + "metadata": {}, + "source": [ + "Now let's train with `peft`. First we check the names of the modules, so that we can configure `peft` to fine-tune the right modules." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "922db29b", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('', __main__.MLP),\n", + " ('seq', torch.nn.modules.container.Sequential),\n", + " ('seq.0', torch.nn.modules.linear.Linear),\n", + " ('seq.1', torch.nn.modules.activation.ReLU),\n", + " ('seq.2', torch.nn.modules.linear.Linear),\n", + " ('seq.3', torch.nn.modules.activation.ReLU),\n", + " ('seq.4', torch.nn.modules.linear.Linear),\n", + " ('seq.5', torch.nn.modules.activation.LogSoftmax)]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[(n, type(m)) for n, m in MLP().named_modules()]" + ] + }, + { + "cell_type": "markdown", + "id": "5efb275d", + "metadata": {}, + "source": [ + "Next we can define the LoRA config. There is nothing special going on here. 
We set the LoRA rank to 8 and select the layers `seq.0` and `seq.2` to be used for LoRA fine-tuning. As for `seq.4`, which is the output layer, we set it as `modules_to_save`, which means it is also trained but no LoRA is applied." + ] + }, + { + "cell_type": "markdown", + "id": "cf2c608d", + "metadata": {}, + "source": [ + "*Note: Not all layer types can be fine-tuned with LoRA. At the moment, linear layers, embeddings, `Conv2D` and `transformers.pytorch_utils.Conv1D` are supported.*" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "b342438f", + "metadata": {}, + "outputs": [], + "source": [ + "config = peft.LoraConfig(\n", + " r=8,\n", + " target_modules=[\"seq.0\", \"seq.2\"],\n", + " modules_to_save=[\"seq.4\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "829b4e2d", + "metadata": {}, + "source": [ + "Now let's create the `peft` model by passing our initial MLP, as well as the config we just defined, to `get_peft_model`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "602b6658", + "metadata": {}, + "outputs": [], + "source": [ + "module = MLP().to(device)\n", + "module_copy = copy.deepcopy(module) # we keep a copy of the original model for later\n", + "peft_model = peft.get_peft_model(module, config)\n", + "optimizer = torch.optim.Adam(peft_model.parameters(), lr=lr)\n", + "criterion = nn.CrossEntropyLoss()\n", + "peft_model.print_trainable_parameters()" + ] + }, + { + "cell_type": "markdown", + "id": "2103737d", + "metadata": {}, + "source": [ + "Checking the numbers, we see that only ~1% of parameters are actually trained, which is what we like to see.\n", + "\n", + "Now let's start the training:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "9200cbc6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "epoch=0 train_loss_total=0.6695 eval_loss_total=0.6388\n", + "epoch=1 train_loss_total=0.5614 eval_loss_total=0.5456\n", +
"epoch=2 train_loss_total=0.3897 eval_loss_total=0.3035\n", + "epoch=3 train_loss_total=0.2529 eval_loss_total=0.2510\n", + "epoch=4 train_loss_total=0.1914 eval_loss_total=0.2191\n", + "epoch=5 train_loss_total=0.1236 eval_loss_total=0.2586\n", + "epoch=6 train_loss_total=0.1076 eval_loss_total=0.3205\n", + "epoch=7 train_loss_total=0.1834 eval_loss_total=0.3951\n", + "epoch=8 train_loss_total=0.1037 eval_loss_total=0.1646\n", + "epoch=9 train_loss_total=0.0724 eval_loss_total=0.1409\n", + "epoch=10 train_loss_total=0.0691 eval_loss_total=0.1725\n", + "epoch=11 train_loss_total=0.0641 eval_loss_total=0.1423\n", + "epoch=12 train_loss_total=0.0382 eval_loss_total=0.1490\n", + "epoch=13 train_loss_total=0.0214 eval_loss_total=0.1517\n", + "epoch=14 train_loss_total=0.0119 eval_loss_total=0.1717\n", + "epoch=15 train_loss_total=0.0060 eval_loss_total=0.2366\n", + "epoch=16 train_loss_total=0.0029 eval_loss_total=0.2069\n", + "epoch=17 train_loss_total=0.0021 eval_loss_total=0.2082\n", + "epoch=18 train_loss_total=0.0016 eval_loss_total=0.2119\n", + "epoch=19 train_loss_total=0.0011 eval_loss_total=0.1984\n", + "epoch=20 train_loss_total=0.0010 eval_loss_total=0.1821\n", + "epoch=21 train_loss_total=0.0009 eval_loss_total=0.1892\n", + "epoch=22 train_loss_total=0.0007 eval_loss_total=0.2062\n", + "epoch=23 train_loss_total=0.0006 eval_loss_total=0.2408\n", + "epoch=24 train_loss_total=0.0006 eval_loss_total=0.2038\n", + "epoch=25 train_loss_total=0.0005 eval_loss_total=0.2374\n", + "epoch=26 train_loss_total=0.0004 eval_loss_total=0.2139\n", + "epoch=27 train_loss_total=0.0004 eval_loss_total=0.2085\n", + "epoch=28 train_loss_total=0.0004 eval_loss_total=0.2395\n", + "epoch=29 train_loss_total=0.0003 eval_loss_total=0.2100\n", + "CPU times: user 1.41 s, sys: 48.9 ms, total: 1.46 s\n", + "Wall time: 1.46 s\n" + ] + } + ], + "source": [ + "%time train(peft_model, optimizer, criterion, train_dataloader, eval_dataloader, epochs=max_epochs)" + ] + }, + { + "cell_type": 
"markdown", + "id": "20f6f452", + "metadata": {}, + "source": [ + "In the end, we see that the eval loss is very similar to the one we saw earlier when we trained without `peft`. This is quite nice to see, given that we are training a much smaller number of parameters." + ] + }, + { + "cell_type": "markdown", + "id": "fa55d1d4", + "metadata": {}, + "source": [ + "#### Check which parameters were updated" + ] + }, + { + "cell_type": "markdown", + "id": "a6e2146b", + "metadata": {}, + "source": [ + "Finally, just to check that LoRA was applied as expected, we check which original weights were updated and which weights stayed the same." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "c7dcde21", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "New parameter model.seq.0.lora_A.default.weight | 160 parameters | updated\n", + "New parameter model.seq.0.lora_B.default.weight | 16000 parameters | updated\n", + "New parameter model.seq.2.lora_A.default.weight | 16000 parameters | updated\n", + "New parameter model.seq.2.lora_B.default.weight | 16000 parameters | updated\n" + ] + } + ], + "source": [ + "for name, param in peft_model.base_model.named_parameters():\n", + " if \"lora\" not in name:\n", + " continue\n", + "\n", + " print(f\"New parameter {name:<13} | {param.numel():>5} parameters | updated\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "022e6c41", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parameter seq.0.weight | 40000 parameters | not updated\n", + "Parameter seq.0.bias | 2000 parameters | not updated\n", + "Parameter seq.2.weight | 4000000 parameters | not updated\n", + "Parameter seq.2.bias | 2000 parameters | not updated\n", + "Parameter seq.4.weight | 4000 parameters | not updated\n", + "Parameter seq.4.bias | 2 parameters | not updated\n", + "Parameter seq.4.weight | 4000 parameters | updated\n", + "Parameter
seq.4.bias | 2 parameters | updated\n" + ] + } + ], + "source": [ + "params_before = dict(module_copy.named_parameters())\n", + "for name, param in peft_model.base_model.named_parameters():\n", + " if \"lora\" in name:\n", + " continue\n", + "\n", + " name_before = (\n", + " name.partition(\".\")[-1].replace(\"base_layer.\", \"\").replace(\"original_\", \"\").replace(\"module.\", \"\").replace(\"modules_to_save.default.\", \"\")\n", + " )\n", + " param_before = params_before[name_before]\n", + " if torch.allclose(param, param_before):\n", + " print(f\"Parameter {name_before:<13} | {param.numel():>7} parameters | not updated\")\n", + " else:\n", + " print(f\"Parameter {name_before:<13} | {param.numel():>7} parameters | updated\")" + ] + }, + { + "cell_type": "markdown", + "id": "4c09b43d", + "metadata": {}, + "source": [ + "So we can see that apart from the new LoRA weights that were added, only the last layer was updated. Since the LoRA weights and the last layer have comparatively few parameters, this gives us a big boost in efficiency." + ] + }, + { + "cell_type": "markdown", + "id": "b46c6198", + "metadata": {}, + "source": [ + "## Sharing the model through Hugging Face Hub" + ] + }, + { + "cell_type": "markdown", + "id": "6289e647", + "metadata": {}, + "source": [ + "### Pushing the model to HF Hub" + ] + }, + { + "cell_type": "markdown", + "id": "06dcdfa0", + "metadata": {}, + "source": [ + "With the `peft` model, it is also very easy to push a model to the Hugging Face Hub. Below, we demonstrate how it works. 
It is assumed that you have a valid Hugging Face account and are logged in:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "1b91a0af", + "metadata": {}, + "outputs": [], + "source": [ + "user = \"BenjaminB\" # put your user name here\n", + "model_name = \"peft-lora-with-custom-model\"\n", + "model_id = f\"{user}/{model_name}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1430fffd", + "metadata": {}, + "outputs": [], + "source": [ + "peft_model.push_to_hub(model_id);" + ] + }, + { + "cell_type": "markdown", + "id": "632bd799", + "metadata": {}, + "source": [ + "As we can see, the adapter size is only 211 kB." + ] + }, + { + "cell_type": "markdown", + "id": "4ff78c0c", + "metadata": {}, + "source": [ + "### Loading the model from HF Hub" + ] + }, + { + "cell_type": "markdown", + "id": "e5c7e87f", + "metadata": {}, + "source": [ + "Now, it only takes one step to load the model from HF Hub. To do this, we can use `PeftModel.from_pretrained`, passing our base model and the model ID:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce0fcced", + "metadata": {}, + "outputs": [], + "source": [ + "loaded = peft.PeftModel.from_pretrained(module_copy, model_id)\n", + "type(loaded)" + ] + }, + { + "cell_type": "markdown", + "id": "cd4b4eac", + "metadata": {}, + "source": [ + "Let's check that the two models produce the same output:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2cf6ac4", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "y_peft = peft_model(X.to(device))\n", + "y_loaded = loaded(X.to(device))\n", + "torch.allclose(y_peft, y_loaded)" + ] + }, + { + "cell_type": "markdown", + "id": "eeeb653f", + "metadata": {}, + "source": [ + "### Clean up" + ] + }, + { + "cell_type": "markdown", + "id": "61c60355", + "metadata": {}, + "source": [ + "Finally, as a clean up step, you may want to delete the repo." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "b747038f", + "metadata": {}, + "outputs": [], + "source": [ + "from huggingface_hub import delete_repo" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "7e5ab237", + "metadata": {}, + "outputs": [], + "source": [ + "delete_repo(model_id)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/peft/examples/oft_dreambooth/oft_dreambooth_inference.ipynb b/peft/examples/oft_dreambooth/oft_dreambooth_inference.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..24ed24218c66609b6db8a838415c9641257eb831 --- /dev/null +++ b/peft/examples/oft_dreambooth/oft_dreambooth_inference.ipynb @@ -0,0 +1,116 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "acd7b15e", + "metadata": {}, + "source": [ + "# Dreambooth with OFT\n", + "This Notebook assumes that you already ran the train_dreambooth.py script to create your own adapter." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acab479f", + "metadata": {}, + "outputs": [], + "source": [ + "from diffusers import DiffusionPipeline\n", + "from diffusers.utils import check_min_version, get_logger\n", + "from peft import PeftModel\n", + "\n", + "# Will error if the minimal version of diffusers is not installed. 
Remove at your own risks.\n", + "check_min_version(\"0.10.0.dev0\")\n", + "\n", + "logger = get_logger(__name__)\n", + "\n", + "BASE_MODEL_NAME = \"stabilityai/stable-diffusion-2-1-base\"\n", + "ADAPTER_MODEL_PATH = \"INSERT MODEL PATH HERE\"" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading pipeline components...: 100%|██████████| 6/6 [00:00<00:00, 24.13it/s]\n" + ] + } + ], + "source": [ + "import torch\n", + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "pipe = DiffusionPipeline.from_pretrained(\n", + " BASE_MODEL_NAME,\n", + ")\n", + "pipe.to(device)\n", + "pipe.unet = PeftModel.from_pretrained(pipe.unet, ADAPTER_MODEL_PATH + \"/unet\", adapter_name=\"default\")\n", + "pipe.text_encoder = PeftModel.from_pretrained(\n", + " pipe.text_encoder, ADAPTER_MODEL_PATH + \"/text_encoder\", adapter_name=\"default\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 50/50 [00:11<00:00, 4.46it/s]\n" + ] + }, + { + "data": { + "image/jpeg": "", + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt = \"A photo of a sks dog\"\n", + "image = pipe(\n", + " prompt,\n", + " num_inference_steps=50,\n", + " height=512,\n", + " width=512,\n", + ").images[0]\n", + "image" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 
4, + "nbformat_minor": 5 +} diff --git a/peft/examples/oft_dreambooth/train_dreambooth.py b/peft/examples/oft_dreambooth/train_dreambooth.py new file mode 100644 index 0000000000000000000000000000000000000000..60c87cb036d1c50ea9f8b81d2de1841807ca09e0 --- /dev/null +++ b/peft/examples/oft_dreambooth/train_dreambooth.py @@ -0,0 +1,1115 @@ +import argparse +import gc +import hashlib +import itertools +import logging +import math +import os +import threading +import warnings +from contextlib import nullcontext +from pathlib import Path + +import datasets +import diffusers +import numpy as np +import psutil +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import set_seed +from diffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + UNet2DConditionModel, +) +from diffusers.optimization import get_scheduler +from diffusers.utils import check_min_version +from diffusers.utils.import_utils import is_xformers_available +from huggingface_hub import HfApi +from PIL import Image +from torch.utils.data import Dataset +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PretrainedConfig + +from peft import get_peft_model +from peft.tuners.oft.config import OFTConfig + + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
+check_min_version("0.10.0.dev0") + +logger = get_logger(__name__) + +UNET_TARGET_MODULES = ["to_q", "to_v", "query", "value"] # , "ff.net.0.proj"] +TEXT_ENCODER_TARGET_MODULES = ["q_proj", "v_proj"] + + +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): + text_encoder_config = PretrainedConfig.from_pretrained( + pretrained_model_name_or_path, + subfolder="text_encoder", + revision=revision, + ) + model_class = text_encoder_config.architectures[0] + + if model_class == "CLIPTextModel": + from transformers import CLIPTextModel + + return CLIPTextModel + elif model_class == "RobertaSeriesModelWithTransformation": + from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation + + return RobertaSeriesModelWithTransformation + else: + raise ValueError(f"{model_class} is not supported.") + + +def parse_args(input_args=None): + parser = argparse.ArgumentParser(description="Simple example of a training script.") + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--instance_data_dir", + type=str, + default=None, + required=True, + help="A folder containing the training data of instance images.", + ) + parser.add_argument( + "--class_data_dir", + type=str, + default=None, + required=False, + help="A folder containing the training data of class images.", + ) + parser.add_argument( + "--instance_prompt", + type=str, + default=None, + required=True, + help="The prompt with identifier specifying the instance", 
+ ) + parser.add_argument( + "--class_prompt", + type=str, + default=None, + help="The prompt to specify images in the same class as provided instance images.", + ) + parser.add_argument( + "--with_prior_preservation", + default=False, + action="store_true", + help="Flag to add prior preservation loss.", + ) + parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") + parser.add_argument( + "--num_class_images", + type=int, + default=100, + help=( + "Minimal class images for prior preservation loss. If there are not enough images already present in" + " class_data_dir, additional images will be sampled with class_prompt." + ), + ) + parser.add_argument( + "--validation_prompt", + type=str, + default=None, + help="A prompt that is used during validation to verify that the model is learning.", + ) + parser.add_argument( + "--num_validation_images", + type=int, + default=4, + help="Number of images that should be generated during validation with `validation_prompt`.", + ) + parser.add_argument( + "--validation_steps", + type=int, + default=100, + help=( + "Run dreambooth validation every X steps. Dreambooth validation consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`." 
+ ), + ) + parser.add_argument( + "--output_dir", + type=str, + default="text-inversion-model", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution" + ) + parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder") + + # oft args + parser.add_argument("--use_oft", action="store_true", help="Whether to use OFT for parameter efficient tuning") + parser.add_argument("--oft_r", type=int, default=0, help="OFT rank, only used if use_oft is True") + parser.add_argument("--oft_block_size", type=int, default=32, help="OFT block size, only used if use_oft is True") + parser.add_argument("--oft_dropout", type=float, default=0.0, help="OFT dropout, only used if use_oft is True") + parser.add_argument( + "--oft_use_coft", action="store_true", help="Using constrained OFT, only used if use_oft is True" + ) + parser.add_argument( + "--oft_eps", + type=float, + default=0.0, + help="The control strength of COFT. 
Only has an effect if `oft_use_coft` is set to True.", + ) + + parser.add_argument( + "--oft_text_encoder_r", + type=int, + default=0, + help="OFT rank for text encoder, only used if `use_oft` and `train_text_encoder` are True", + ) + parser.add_argument( + "--oft_text_encoder_block_size", + type=int, + default=32, + help="OFT block size for text encoder, only used if `use_oft` and `train_text_encoder` are True", + ) + parser.add_argument( + "--oft_text_encoder_dropout", + type=float, + default=0.0, + help="OFT dropout for text encoder, only used if `use_oft` and `train_text_encoder` are True", + ) + parser.add_argument( + "--oft_text_encoder_use_coft", + action="store_true", + help="Using constrained OFT on the text encoder, only used if use_oft is True", + ) + parser.add_argument( + "--oft_text_encoder_eps", + type=float, + default=0.0, + help="The control strength of COFT on the text encoder. Only has an effect if `oft_text_encoder_use_coft` is set to True.", + ) + + parser.add_argument( + "--num_dataloader_workers", type=int, default=1, help="Num of workers for the training dataloader." + ) + + parser.add_argument( + "--no_tracemalloc", + default=False, + action="store_true", + help="Flag to stop memory allocation tracing during training. This could speed up training on Windows.", + ) + + parser.add_argument( + "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument( + "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." + ) + parser.add_argument("--num_train_epochs", type=int, default=1) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. 
These checkpoints can be used both as final" + " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" + " training using `--resume_from_checkpoint`." + ), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' + ), + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-6, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of accelerators, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--lr_num_cycles", + type=int, + default=1, + help="Number of hard resets of the lr in cosine_with_restarts scheduler.", + ) + parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." 
+ ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' 
+ ), + ) + parser.add_argument( + "--wandb_key", + type=str, + default=None, + help=("If report to option is set to wandb, api-key for wandb used for login to wandb "), + ) + parser.add_argument( + "--wandb_project_name", + type=str, + default=None, + help=("If report to option is set to wandb, project name in wandb for log tracking "), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU or Intel XPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--prior_generation_precision", + type=str, + default=None, + choices=["no", "fp32", "fp16", "bf16"], + help=( + "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU or Intel XPU. Default to fp16 if a GPU/XPU is available else fp32." + ), + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
+ ) + + if input_args is not None: + args = parser.parse_args(input_args) + else: + args = parser.parse_args() + + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + if args.with_prior_preservation: + if args.class_data_dir is None: + raise ValueError("You must specify a data directory for class images.") + if args.class_prompt is None: + raise ValueError("You must specify prompt for class images.") + else: + # logger is not available yet + if args.class_data_dir is not None: + warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") + if args.class_prompt is not None: + warnings.warn("You need not use --class_prompt without --with_prior_preservation.") + + return args + + +# Converting Bytes to Megabytes +def b2mb(x): + return int(x / 2**20) + + +# This context manager is used to track the peak memory usage of the process +class TorchTracemalloc: + def __enter__(self): + self.device_type = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + self.device_module = getattr(torch, self.device_type, torch.cuda) + gc.collect() + self.device_module.empty_cache() + self.device_module.reset_peak_memory_stats() # reset the peak gauge to zero + self.begin = self.device_module.memory_allocated() + self.process = psutil.Process() + + self.cpu_begin = self.cpu_mem_used() + self.peak_monitoring = True + peak_monitor_thread = threading.Thread(target=self.peak_monitor_func) + peak_monitor_thread.daemon = True + peak_monitor_thread.start() + return self + + def cpu_mem_used(self): + """get resident set size memory for the current process""" + return self.process.memory_info().rss + + def peak_monitor_func(self): + self.cpu_peak = -1 + + while True: + self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak) + + # can't sleep or will not catch the peak right (this comment is here on purpose) + # 
time.sleep(0.001) # 1msec + + if not self.peak_monitoring: + break + + def __exit__(self, *exc): + self.peak_monitoring = False + + gc.collect() + self.device_module.empty_cache() + self.end = self.device_module.memory_allocated() + self.peak = self.device_module.max_memory_allocated() + self.used = b2mb(self.end - self.begin) + self.peaked = b2mb(self.peak - self.begin) + + self.cpu_end = self.cpu_mem_used() + self.cpu_used = b2mb(self.cpu_end - self.cpu_begin) + self.cpu_peaked = b2mb(self.cpu_peak - self.cpu_begin) + # print(f"delta used/peak {self.used:4d}/{self.peaked:4d}") + + +class DreamBoothDataset(Dataset): + """ + A dataset to prepare the instance and class images with the prompts for fine-tuning the model. + It pre-processes the images and the tokenizes prompts. + """ + + def __init__( + self, + instance_data_root, + instance_prompt, + tokenizer, + class_data_root=None, + class_prompt=None, + size=512, + center_crop=False, + ): + self.size = size + self.center_crop = center_crop + self.tokenizer = tokenizer + + self.instance_data_root = Path(instance_data_root) + if not self.instance_data_root.exists(): + raise ValueError("Instance images root doesn't exists.") + + self.instance_images_path = list(Path(instance_data_root).iterdir()) + self.num_instance_images = len(self.instance_images_path) + self.instance_prompt = instance_prompt + self._length = self.num_instance_images + + if class_data_root is not None: + self.class_data_root = Path(class_data_root) + self.class_data_root.mkdir(parents=True, exist_ok=True) + self.class_images_path = list(self.class_data_root.iterdir()) + self.num_class_images = len(self.class_images_path) + self._length = max(self.num_class_images, self.num_instance_images) + self.class_prompt = class_prompt + else: + self.class_data_root = None + + self.image_transforms = transforms.Compose( + [ + transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(size) if center_crop else 
transforms.RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def __len__(self): + return self._length + + def __getitem__(self, index): + example = {} + instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + example["instance_images"] = self.image_transforms(instance_image) + example["instance_prompt_ids"] = self.tokenizer( + self.instance_prompt, + truncation=True, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids + + if self.class_data_root: + class_image = Image.open(self.class_images_path[index % self.num_class_images]) + if not class_image.mode == "RGB": + class_image = class_image.convert("RGB") + example["class_images"] = self.image_transforms(class_image) + example["class_prompt_ids"] = self.tokenizer( + self.class_prompt, + truncation=True, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids + + return example + + +def collate_fn(examples, with_prior_preservation=False): + input_ids = [example["instance_prompt_ids"] for example in examples] + pixel_values = [example["instance_images"] for example in examples] + + # Concat class and instance examples for prior preservation. + # We do this to avoid doing two forward passes. + if with_prior_preservation: + input_ids += [example["class_prompt_ids"] for example in examples] + pixel_values += [example["class_images"] for example in examples] + + pixel_values = torch.stack(pixel_values) + pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() + + input_ids = torch.cat(input_ids, dim=0) + + batch = { + "input_ids": input_ids, + "pixel_values": pixel_values, + } + return batch + + +class PromptDataset(Dataset): + "A simple dataset to prepare the prompts to generate class images on multiple accelerators." 
+ + def __init__(self, prompt, num_samples): + self.prompt = prompt + self.num_samples = num_samples + + def __len__(self): + return self.num_samples + + def __getitem__(self, index): + example = {} + example["prompt"] = self.prompt + example["index"] = index + return example + + +def main(args): + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_dir=logging_dir, + ) + if args.report_to == "wandb": + import wandb + + wandb.login(key=args.wandb_key) + wandb.init(project=args.wandb_project_name) + # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate + # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models. + # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate. + if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1: + raise ValueError( + "Gradient accumulation is not supported when training the text encoder in distributed training. " + "Please set gradient_accumulation_steps to 1. This feature will be supported in the future." + ) + + # Make one log on every process with the configuration for debugging. 
+ logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Generate class images if prior preservation is enabled. + if args.with_prior_preservation: + class_images_dir = Path(args.class_data_dir) + if not class_images_dir.exists(): + class_images_dir.mkdir(parents=True) + cur_class_images = len(list(class_images_dir.iterdir())) + + if cur_class_images < args.num_class_images: + torch_dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32 + if args.prior_generation_precision == "fp32": + torch_dtype = torch.float32 + elif args.prior_generation_precision == "fp16": + torch_dtype = torch.float16 + elif args.prior_generation_precision == "bf16": + torch_dtype = torch.bfloat16 + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + torch_dtype=torch_dtype, + safety_checker=None, + revision=args.revision, + ) + pipeline.set_progress_bar_config(disable=True) + + num_new_images = args.num_class_images - cur_class_images + logger.info(f"Number of class images to sample: {num_new_images}.") + + sample_dataset = PromptDataset(args.class_prompt, num_new_images) + sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size) + + sample_dataloader = accelerator.prepare(sample_dataloader) + pipeline.to(accelerator.device) + + for example in tqdm( + sample_dataloader, desc="Generating class images", 
disable=not accelerator.is_local_main_process + ): + images = pipeline(example["prompt"]).images + + for i, image in enumerate(images): + hash_image = hashlib.sha1(image.tobytes()).hexdigest() + image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" + image.save(image_filename) + + del pipeline + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + api = HfApi(token=args.hub_token) + + # Create repo (repo_name from args or inferred) + repo_name = args.hub_model_id + if repo_name is None: + repo_name = Path(args.output_dir).absolute().name + repo_id = api.create_repo(repo_name, exist_ok=True).repo_id + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + # Load the tokenizer + if args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False) + elif args.pretrained_model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="tokenizer", + revision=args.revision, + use_fast=False, + ) + + # import correct text encoder class + text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision) + + # Load scheduler and models + noise_scheduler = DDPMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + num_train_timesteps=1000, + ) # DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + text_encoder = text_encoder_cls.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", 
revision=args.revision + ) + vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision) + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision + ) + + if args.use_oft: + config = OFTConfig( + r=args.oft_r, + oft_block_size=args.oft_block_size, + target_modules=UNET_TARGET_MODULES, + module_dropout=args.oft_dropout, + init_weights=True, + coft=args.oft_use_coft, + eps=args.oft_eps, + ) + unet = get_peft_model(unet, config) + unet.print_trainable_parameters() + print(unet) + + vae.requires_grad_(False) + if not args.train_text_encoder: + text_encoder.requires_grad_(False) + elif args.train_text_encoder and args.use_oft: + config = OFTConfig( + r=args.oft_text_encoder_r, + oft_block_size=args.oft_text_encoder_block_size, + target_modules=TEXT_ENCODER_TARGET_MODULES, + module_dropout=args.oft_text_encoder_dropout, + init_weights=True, + coft=args.oft_text_encoder_use_coft, + eps=args.oft_text_encoder_eps, + ) + text_encoder = get_peft_model(text_encoder, config) + text_encoder.print_trainable_parameters() + print(text_encoder) + + if args.enable_xformers_memory_efficient_attention: + if accelerator.device.type == "xpu": + logger.warn("XPU hasn't support xformers yet, ignore it.") + elif is_xformers_available(): + unet.enable_xformers_memory_efficient_attention() + else: + raise ValueError("xformers is not available. 
Make sure it is installed correctly") + + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + # below fails when using oft so commenting it out + if args.train_text_encoder and not args.use_oft: + text_encoder.gradient_checkpointing_enable() + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if args.allow_tf32 and torch.cuda.is_available(): + torch.backends.cuda.matmul.allow_tf32 = True + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB accelerators + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." + ) + + optimizer_class = bnb.optim.AdamW8bit + else: + optimizer_class = torch.optim.AdamW + + # Optimizer creation + params_to_optimize = ( + itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters() + ) + optimizer = optimizer_class( + params_to_optimize, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + # Dataset and DataLoaders creation: + train_dataset = DreamBoothDataset( + instance_data_root=args.instance_data_dir, + instance_prompt=args.instance_prompt, + class_data_root=args.class_data_dir if args.with_prior_preservation else None, + class_prompt=args.class_prompt, + tokenizer=tokenizer, + size=args.resolution, + center_crop=args.center_crop, + ) + + train_dataloader = torch.utils.data.DataLoader( + train_dataset, + batch_size=args.train_batch_size, + shuffle=True, + collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation), + 
num_workers=args.num_dataloader_workers, + ) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + num_cycles=args.lr_num_cycles, + power=args.lr_power, + ) + + # Prepare everything with our `accelerator`. + if args.train_text_encoder: + unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, text_encoder, optimizer, train_dataloader, lr_scheduler + ) + else: + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, optimizer, train_dataloader, lr_scheduler + ) + + # For mixed precision training we cast the text_encoder and vae weights to half-precision + # as these models are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Move vae and text_encoder to device and cast to weight_dtype + vae.to(accelerator.device, dtype=weight_dtype) + if not args.train_text_encoder: + text_encoder.to(accelerator.device, dtype=weight_dtype) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. 
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + accelerator.init_trackers("dreambooth", config=vars(args)) + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num batches each epoch = {len(train_dataloader)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + global_step = 0 + first_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint != "latest": + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + path = dirs[-1] + accelerator.print(f"Resuming from checkpoint {path}") + accelerator.load_state(os.path.join(args.output_dir, path)) + global_step = int(path.split("-")[1]) + + resume_global_step = global_step * args.gradient_accumulation_steps + first_epoch = resume_global_step // num_update_steps_per_epoch + resume_step = resume_global_step % num_update_steps_per_epoch + + # Only show the progress bar once on each machine. 
+ progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process) + progress_bar.set_description("Steps") + + for epoch in range(first_epoch, args.num_train_epochs): + unet.train() + if args.train_text_encoder: + text_encoder.train() + with TorchTracemalloc() if not args.no_tracemalloc else nullcontext() as tracemalloc: + for step, batch in enumerate(train_dataloader): + # Skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: + if step % args.gradient_accumulation_steps == 0: + progress_bar.update(1) + if args.report_to == "wandb": + accelerator.print(progress_bar) + continue + + with accelerator.accumulate(unet): + # Convert images to latent space + latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample() + latents = latents * 0.18215 + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device + ) + timesteps = timesteps.long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Get the text embedding for conditioning + encoder_hidden_states = text_encoder(batch["input_ids"])[0] + + # Predict the noise residual + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + # Get the target for loss depending on the prediction type + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + + if 
args.with_prior_preservation: + # Chunk the noise and model_pred into two parts and compute the loss on each part separately. + model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0) + target, target_prior = torch.chunk(target, 2, dim=0) + + # Compute instance loss + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + + # Compute prior loss + prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean") + + # Add the prior loss to the instance loss. + loss = loss + args.prior_loss_weight * prior_loss + else: + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + + accelerator.backward(loss) + if accelerator.sync_gradients: + params_to_clip = ( + itertools.chain(unet.parameters(), text_encoder.parameters()) + if args.train_text_encoder + else unet.parameters() + ) + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + if args.report_to == "wandb": + accelerator.print(progress_bar) + global_step += 1 + + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + if ( + args.validation_prompt is not None + and (step + num_update_steps_per_epoch * epoch) % args.validation_steps == 0 + ): + logger.info( + f"Running validation... \n Generating {args.num_validation_images} images with prompt:" + f" {args.validation_prompt}." 
+ ) + # create pipeline + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + safety_checker=None, + revision=args.revision, + ) + # set `keep_fp32_wrapper` to True because we do not want to remove + # mixed precision hooks while we are still training + pipeline.unet = accelerator.unwrap_model(unet, keep_fp32_wrapper=True) + pipeline.text_encoder = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + # run inference + if args.seed is not None: + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) + else: + generator = None + images = [] + for _ in range(args.num_validation_images): + image = pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0] + images.append(image) + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + import wandb + + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") + for i, image in enumerate(images) + ] + } + ) + + del pipeline + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() + + if global_step >= args.max_train_steps: + break + + # Printing the accelerator memory usage details such as allocated memory, peak memory, and total memory usage + if not args.no_tracemalloc: + accelerator.print( + f"{accelerator.device.type.upper()} Memory before entering the train : {b2mb(tracemalloc.begin)}" + ) + accelerator.print( + f"{accelerator.device.type.upper()} Memory consumed at the end of the train (end-begin): {tracemalloc.used}" + ) + accelerator.print( + 
f"{accelerator.device.type.upper()} Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}" + ) + accelerator.print( + f"{accelerator.device.type.upper()} Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}" + ) + + accelerator.print(f"CPU Memory before entering the train : {b2mb(tracemalloc.cpu_begin)}") + accelerator.print(f"CPU Memory consumed at the end of the train (end-begin): {tracemalloc.cpu_used}") + accelerator.print(f"CPU Peak Memory consumed during the train (max-begin): {tracemalloc.cpu_peaked}") + accelerator.print( + f"CPU Total Peak Memory consumed during the train (max): {tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin)}" + ) + + # Create the pipeline using using the trained modules and save it. + accelerator.wait_for_everyone() + if accelerator.is_main_process: + if args.use_oft: + unwarpped_unet = accelerator.unwrap_model(unet) + unwarpped_unet.save_pretrained( + os.path.join(args.output_dir, "unet"), state_dict=accelerator.get_state_dict(unet) + ) + if args.train_text_encoder: + unwarpped_text_encoder = accelerator.unwrap_model(text_encoder) + unwarpped_text_encoder.save_pretrained( + os.path.join(args.output_dir, "text_encoder"), + state_dict=accelerator.get_state_dict(text_encoder), + ) + else: + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + unet=accelerator.unwrap_model(unet), + text_encoder=accelerator.unwrap_model(text_encoder), + revision=args.revision, + ) + pipeline.save_pretrained(args.output_dir) + + if args.push_to_hub: + api.upload_folder( + repo_id=repo_id, + folder_path=args.output_dir, + commit_message="End of training", + run_as_future=True, + ) + + accelerator.end_training() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/peft/examples/olora_finetuning/README.md b/peft/examples/olora_finetuning/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..5e5e9b197ca4ddb186c21b84a432e1ea3083022c --- /dev/null +++ b/peft/examples/olora_finetuning/README.md @@ -0,0 +1,97 @@ +# OLoRA: Orthonormal Low Rank Adaptation of Large Language Models + +## Introduction +[OLoRA](https://huggingface.co/papers/2406.01775) is a novel approach that leverages orthonormal low rank adaptation through QR decomposition. Unlike the default LoRA implementation, OLoRA decomposes original weights into their $\mathbf{Q}$ and $\mathbf{R}$ parts, and then uses the first `rank` rows of $\mathbf{R}$ and the first `rank` columns of $\mathbf{Q}$ to initialize $\mathbf{A}$ and $\mathbf{B}$, respectively. This results in significantly faster convergence, more stable training, and superior performance. + +## Quick start +```python +import torch +from peft import LoraConfig, get_peft_model +from transformers import AutoTokenizer, AutoModelForCausalLM +from trl import SFTConfig, SFTTrainer +from datasets import load_dataset + +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.bfloat16, device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +dataset = load_dataset("imdb", split="train[:1%]") +lora_config = LoraConfig( + init_lora_weights="olora" +) +peft_model = get_peft_model(model, lora_config) +training_args = SFTConfig(dataset_text_field="text", max_seq_length=128) +trainer = SFTTrainer( + model=peft_model, + args=training_args, + train_dataset=dataset, + processing_class=tokenizer, +) +trainer.train() +peft_model.save_pretrained("olora-opt-350m") +``` + +No additional change to your standard LoRA procedure is needed, except for specifying the `init_lora_weights="olora"` option in your LoRA configuration. + +Additionally, you can refer to the OLoRA finetuning script. +Run the script with: +```bash +python3 examples/olora_finetuning/olora_finetuning.py --base_model facebook/opt-350m +``` +OLoRA also supports quantization. 
To use 4-bit quantization try: +```bash +python3 examples/olora_finetuning/olora_finetuning.py --base_model facebook/opt-350m --quantize +``` +Alternatively, pass an already-quantized model and omit the `--quantize` flag. + +To run DDP with [accelerate](https://huggingface.co/docs/accelerate/en/index), run `accelerate config` to set up your DDP configuration, then run: +```bash +accelerate launch examples/olora_finetuning/olora_finetuning.py --base_model facebook/opt-350m +``` +Add `--device_map cpu` if you want to fine-tune on CPU. + +If you want to train a quantized model, such as AWQ or GPTQ, that does not support the OLoRA initialization method, pass `--init_lora_weights gaussian`. For example: +```bash +python3 examples/olora_finetuning/olora_finetuning.py --base_model hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4 --init_lora_weights gaussian + +``` + + +## Use the model +You can load and use the model like any other 🤗 PEFT model: +```python +from peft import PeftModel +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m") +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +olora_model = PeftModel.from_pretrained(model, "olora-opt-350m") +``` + +## OLoRA and LoRA +OLoRA differs from LoRA in that it mutates the original weights. To utilize multiple adapters simultaneously, you can leverage the `path_initial_model_for_weight_conversion` option. Below is a simple template illustrating how to convert OLoRA to conventional LoRA: +```python +base_model = AutoModel.from_pretrained("facebook/opt-350m") +olora_config = LoraConfig( + ...
+ init_lora_weights = "olora" # Initialize the model with OLoRA +) +olora_model = get_peft_model(base_model, olora_config) +init_path = "path/to/untrained-olora-model" +olora_model.save_pretrained(init_path) # Save the model *before* performing any training + +# Train the model +train(olora_model) # Your training loop + +# Save the model after training +olora_model.save_pretrained(output_dir, path_initial_model_for_weight_conversion=init_path) +``` +After completing training, you can save and convert your OLoRA model to a conventional LoRA model by setting `path_initial_model_for_weight_conversion` to `init_path`, i.e. the path of your untrained OLoRA model. This conversion enables you to use multiple adapters with your LoRA model. Note that this conversion is not supported if `rslora` is used in combination with `rank_pattern` or `alpha_pattern`. + +## Citation +``` +@misc{büyükakyüz2024olora, + title={OLoRA: Orthonormal Low-Rank Adaptation of Large Language Models}, + author={Kerim Büyükakyüz}, + year={2024}, + eprint={2406.01775}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` diff --git a/peft/examples/olora_finetuning/olora_finetuning.py b/peft/examples/olora_finetuning/olora_finetuning.py new file mode 100644 index 0000000000000000000000000000000000000000..e3d41f3c07b553c6413d2c329961d603d4637316 --- /dev/null +++ b/peft/examples/olora_finetuning/olora_finetuning.py @@ -0,0 +1,199 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ + +import os +from typing import Optional + +import torch +import transformers +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed + +from peft import ( + LoraConfig, + get_peft_model, +) + + +def train( + base_model: str = "path/to/model", + data_path: str = "yahma/alpaca-cleaned", + output_dir: str = "olora", + batch_size: int = 16, + num_epochs: int = 1, + learning_rate: float = 3e-4, + cutoff_len: int = 256, + val_set_size: int = 16, + quantize: bool = False, + eval_step: int = 100, + save_step: int = 100, + device_map: str = "auto", + lora_r: int = 32, + lora_alpha: int = 16, + lora_dropout: float = 0.05, + lora_target_modules: Optional[list[str]] = None, + torch_dtype: str = "float16", + init_lora_weights="olora", + seed: Optional[int] = None, +): + # Set device_map to the right place when enabling DDP. + world_size = int(os.environ.get("WORLD_SIZE", 0)) or int(os.environ.get("PMI_SIZE", 0)) + if world_size > 1 and device_map != "cpu": + from accelerate import Accelerator + + device_map = {"": Accelerator().process_index} # presumably pins the whole model to this process's device (process_index) - confirm + # Set seed + if seed is not None: + set_seed(seed) + model_kwargs = {"torch_dtype": getattr(torch, torch_dtype), "device_map": device_map} # torch_dtype is the name of a torch attribute, e.g. "float16" + if quantize: + model_kwargs["quantization_config"] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ) + model = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs) + + tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True) + # For some tokenizer with no pad token like llama + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token # reuse EOS as the padding token + + def tokenize(prompt, add_eos_token=True): # tokenize one prompt, append EOS when it is missing and there is room below cutoff_len, then copy input_ids to labels + result = tokenizer( + prompt, + truncation=True, + max_length=cutoff_len, + padding=False, + return_tensors=None, + ) + if ( + result["input_ids"][-1] != tokenizer.eos_token_id + and len(result["input_ids"])
< cutoff_len + and add_eos_token + ): + result["input_ids"].append(tokenizer.eos_token_id) + result["attention_mask"].append(1) + + result["labels"] = result["input_ids"].copy() # labels are a copy of the input ids + + return result + + def generate_and_tokenize_prompt(example): + full_prompt = generate_prompt(example) + tokenized_full_prompt = tokenize(full_prompt) + return tokenized_full_prompt + + config = LoraConfig( + r=lora_r, + lora_alpha=lora_alpha, + target_modules=lora_target_modules, + lora_dropout=lora_dropout, + bias="none", + task_type="CAUSAL_LM", + init_lora_weights=init_lora_weights, + ) + model = get_peft_model(model, config) + + data = load_dataset(data_path) + + train_val = data["train"].train_test_split(test_size=val_set_size, shuffle=True, seed=42) # fixed seed, independent of the `seed` argument + train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt) + val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt) + + trainer = transformers.Trainer( + model=model, + train_dataset=train_data, + eval_dataset=val_data, + args=transformers.TrainingArguments( + per_device_train_batch_size=batch_size, + warmup_steps=100, + num_train_epochs=num_epochs, + learning_rate=learning_rate, + logging_steps=100, + optim="adamw_torch", + eval_strategy="steps", + save_strategy="steps", + eval_steps=eval_step, + save_steps=save_step, + output_dir=output_dir, + save_total_limit=3, + load_best_model_at_end=True, + ddp_find_unused_parameters=False if world_size > 1 else None, + ), + data_collator=transformers.DataCollatorForSeq2Seq( + tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True + ), + ) + trainer.train() + model.save_pretrained(output_dir) + + +def generate_prompt(example): # builds the prompt text from example["instruction"] and example["output"] + return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
+ ### Instruction: + {example["instruction"]} + ### Response: + {example["output"]}""" + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--base_model", type=str, default="path/to/model") + parser.add_argument("--data_path", type=str, default="yahma/alpaca-cleaned") + parser.add_argument("--output_dir", type=str, default="olora") + parser.add_argument("--batch_size", type=int, default=16) + parser.add_argument("--num_epochs", type=int, default=1) + parser.add_argument("--learning_rate", type=float, default=3e-4) + parser.add_argument("--cutoff_len", type=int, default=256) + parser.add_argument("--val_set_size", type=int, default=16) + parser.add_argument("--quantize", action="store_true") + parser.add_argument("--eval_step", type=int, default=100) + parser.add_argument("--save_step", type=int, default=100) + parser.add_argument("--device_map", type=str, default="auto") + parser.add_argument("--lora_r", type=int, default=32) + parser.add_argument("--lora_alpha", type=int, default=16) + parser.add_argument("--lora_dropout", type=float, default=0.05) + parser.add_argument("--lora_target_modules", type=str, default=None) # NOTE(review): parsed as one string, while train() annotates a list of str - confirm the intended format + parser.add_argument("--torch_dtype", type=str, default="float16") + parser.add_argument("--init_lora_weights", type=str, default="olora") + parser.add_argument("--seed", type=int, default=None) + + args = parser.parse_args() + + train( + base_model=args.base_model, + data_path=args.data_path, + output_dir=args.output_dir, + batch_size=args.batch_size, + num_epochs=args.num_epochs, + learning_rate=args.learning_rate, + cutoff_len=args.cutoff_len, + val_set_size=args.val_set_size, + quantize=args.quantize, + eval_step=args.eval_step, + save_step=args.save_step, + device_map=args.device_map, + lora_r=args.lora_r, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + lora_target_modules=args.lora_target_modules, + torch_dtype=args.torch_dtype, +
init_lora_weights=args.init_lora_weights, + seed=args.seed, + ) diff --git a/peft/examples/pissa_finetuning/README.md b/peft/examples/pissa_finetuning/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4960dd500dedfc178c711ba87a2747f9220c66a9 --- /dev/null +++ b/peft/examples/pissa_finetuning/README.md @@ -0,0 +1,131 @@ +# PiSSA: Principal Singular values and Singular vectors Adaptation +## Introduction ([Paper](https://huggingface.co/papers/2404.02948), [code](https://github.com/GraphPKU/PiSSA)) +PiSSA represents a matrix $W\in\mathbb{R}^{m\times n}$ within the model by the product of two trainable matrices $A \in \mathbb{R}^{m\times r}$ and $B \in \mathbb{R}^{r\times n}$, where $r \ll \min(m, n)$, plus a residual matrix $W^{res}\in\mathbb{R}^{m\times n}$ for error correction. Singular value decomposition (SVD) is employed to factorize $W$, and the principal singular values and vectors of $W$ are utilized to initialize $A$ and $B$. The residual singular values and vectors initialize the residual matrix $W^{res}$, which is kept frozen during fine-tuning. This straightforward modification allows PiSSA to converge more rapidly than LoRA and ultimately attain superior performance. Moreover, PiSSA reduces the quantization error compared to QLoRA, leading to further enhancements. + +## Quick Start +```python +import torch +from peft import LoraConfig, get_peft_model +from transformers import AutoTokenizer, AutoModelForCausalLM +from trl import SFTConfig, SFTTrainer +from datasets import load_dataset + +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") +tokenizer.pad_token_id = tokenizer.eos_token_id +lora_config = LoraConfig( + # init_lora_weights="pissa", # Configure the initialization method to "pissa", which may take several minutes to execute SVD on the pre-trained model.
+ init_lora_weights="pissa_niter_4", # Initialize the PiSSA with fast SVD, which completes in just a few seconds. +) +peft_model = get_peft_model(model, lora_config) + +peft_model.print_trainable_parameters() + +dataset = load_dataset("imdb", split="train[:1%]") + +training_args = SFTConfig(dataset_text_field="text", max_seq_length=128) +trainer = SFTTrainer( + model=peft_model, + args=training_args, + train_dataset=dataset, + processing_class=tokenizer, +) +trainer.train() +peft_model.save_pretrained("pissa-llama-2-7b") +``` +When utilizing fast SVD, reducing the rank and the number of iterations decreases the time required. However, this approach leads to higher errors in the computed matrices $A$ and $B$. To preserve the model's initial capabilities, we calculate the residual matrix by $W^{res} = W - AB$. Even with potential errors in $A$ and $B$, the sum of $W^{res}$ and $AB$ accurately equals $W$. + + +To utilize the fine-tuned PiSSA modules, load them with the following code: +```python +import torch +from peft import PeftModel +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto" +) +# Performs SVD again to initialize the residual model and loads the state_dict of the fine-tuned PiSSA modules. +peft_model = PeftModel.from_pretrained(model, "pissa-llama-2-7b") +``` + +## Advanced Usage + +### Access the preprocessed models +We recommend downloading decomposed models directly from the [Hugging Face Collections](https://huggingface.co/collections/fxmeng/pissa-661ce700721235e542a5d7a8) instead of performing SVD every time.
+If the existing models do not meet your needs, apply PiSSA initialization to a pre-trained model and store the decomposed model locally: +```bash +python preprocess.py \ + --base_model_name_or_path meta-llama/Llama-2-7b-hf \ + --init_lora_weights pissa \ + --output_dir pissa-llama-2-7b-r32-alpha-32 \ + --lora_r 32 \ + --lora_alpha 32 \ + --lora_dropout 0 \ + --bits bf16 +``` + +### Convert PiSSA to LoRA +The main advantage of PiSSA is concentrated during the training phase. For a trained PiSSA adapter, we recommend converting it to an equivalent LoRA adapter for use and sharing. +```python +# The fine-tuned matrices $A$ and $B$ in the PiSSA adapter are saved and should be combined with the residual model. +peft_model.save_pretrained(output_dir) +# Given the matrices $A_0$ and $B_0$, initialized by PiSSA and untrained, and the trained matrices $A$ and $B$, +# we can convert these to LoRA by setting $\Delta W = A \times B - A_0 \times B_0 = [A \mid A_0] \times [B^T \mid -B_0^T]^T = A'B'$. +peft_model.save_pretrained(output_dir, path_initial_model_for_weight_conversion="pissa_init") + +``` +This conversion enables the loading of LoRA on top of a standard base model: + +```python +import torch +from peft import PeftModel +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", torch_dtype=torch.bfloat16, device_map="auto" +) +# No SVD is performed during this step, and the base model remains unaltered. +peft_model = PeftModel.from_pretrained(model, "pissa-llama-2-7b-lora") +``` +Utilizing the converted LoRA does not require modifying the parameters of the base model. When multiple converted LoRAs are needed simultaneously, each adapter operates independently without interference, allowing for the adapters to be freely deleted or added. + +Note that this conversion is not supported if `rslora` is used in combination with `rank_pattern` or `alpha_pattern`.
+ +### Fine-tune in 4-bit or 8-bit +If quantization fine-tuning is desired, it is necessary to first decompose the original model at full precision and then reload the residual model in either 4-bit or 8-bit configurations. +```shell +python pissa_finetuning.py \ + --residual_model_name_or_path fxmeng/pissa-llama-2-7b-r16-alpha-16 \ + --output_dir output/pissa-llama-2-7b-r16-alpha-16-metamath-10k \ + --bits nf4 \ + --data_path meta-math/MetaMathQA \ + --dataset_split train[:100000] \ + --dataset_field query response \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size 32 \ + --gradient_accumulation_steps 4 \ + --save_strategy "steps" \ + --save_steps 1000 \ + --save_total_limit 1 \ + --logging_steps 1 \ + --learning_rate 2e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --tf32 True \ + --report_to none \ + --convert_pissa_to_lora +``` + +This approach ensures the preservation of high-frequency, out-of-distribution parameters in the low-rank PiSSA modules, resulting in reduced quantization errors during the quantization of the residual model. + +## Citation +``` +@article{meng2024pissa, + title={PiSSA: Principal Singular Values and Singular Vectors Adaptation of Large Language Models}, + author={Meng, Fanxu and Wang, Zhaohui and Zhang, Muhan}, + journal={arXiv preprint arXiv:2404.02948}, + year={2024} +} +``` diff --git a/peft/examples/pissa_finetuning/pissa_finetuning.py b/peft/examples/pissa_finetuning/pissa_finetuning.py new file mode 100644 index 0000000000000000000000000000000000000000..36d4ba04f01b0388cafd6af10ed7cb726eeb2dec --- /dev/null +++ b/peft/examples/pissa_finetuning/pissa_finetuning.py @@ -0,0 +1,150 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from dataclasses import dataclass, field +from typing import Optional + +import torch +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser +from trl import SFTConfig, SFTTrainer + +from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training + + +@dataclass +class ScriptArguments(SFTConfig): + # model configs + base_model_name_or_path: Optional[str] = field( + default=None, metadata={"help": "The name or path of the fp32/16 base model."} + ) + residual_model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": "The name or path of the fp32/16 residual model.
(`['fxmeng/pissa-llama-2-7b-r16-alpha-16']`)" + }, + ) + bits: str = field(default="fp32", metadata={"help": "(`['fp4', 'nf4', 'int8', 'bf16', 'fp16', fp32]`)"}) + init_lora_weights: str = field(default="pissa", metadata={"help": "(`['gaussian', 'pissa', 'pissa_niter_4']`)"}) + lora_r: int = field(default=16) + lora_alpha: int = field(default=16) + lora_dropout: float = field(default=0) + convert_pissa_to_lora: bool = field(default=False) + merge_and_save: bool = field(default=False) + # dataset configs + data_path: str = field(default="imdb", metadata={"help": "Path to the training data."}) + dataset_split: str = field(default="train[:1%]", metadata={"help": "(`['train', 'test', 'eval']`):"}) + dataset_field: list[str] = field(default=None, metadata={"help": "Fields of dataset input and output."}) + + +parser = HfArgumentParser(ScriptArguments) +script_args = parser.parse_args_into_dataclasses()[0] +print(script_args) + +print(f"Load pre-processed residual model in {script_args.bits} bits.") +if script_args.bits in ["nf4", "fp4", "int8"]: # quantized loading; this branch always reads --residual_model_name_or_path + quantization_config = BitsAndBytesConfig( + load_in_4bit=(script_args.bits == "nf4" or script_args.bits == "fp4"), + load_in_8bit=script_args.bits == "int8", + bnb_4bit_quant_type=script_args.bits, + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=torch.bfloat16, + ) + res_model = AutoModelForCausalLM.from_pretrained( + script_args.residual_model_name_or_path, quantization_config=quantization_config, low_cpu_mem_usage=True + ) + res_model = prepare_model_for_kbit_training(res_model) + print("Wrapping the residual model with PiSSA.") + peft_model = PeftModel.from_pretrained( + res_model, script_args.residual_model_name_or_path, subfolder="pissa_init", is_trainable=True + ) + tokenizer = AutoTokenizer.from_pretrained(script_args.residual_model_name_or_path) + +elif script_args.residual_model_name_or_path is not None: + res_model = AutoModelForCausalLM.from_pretrained( + script_args.residual_model_name_or_path, +
torch_dtype=( + torch.float16 + if script_args.bits == "fp16" + else (torch.bfloat16 if script_args.bits == "bf16" else torch.float32) + ), + device_map="auto", + ) + print("Wrapping the residual model with PiSSA.") + peft_model = PeftModel.from_pretrained( + res_model, script_args.residual_model_name_or_path, subfolder="pissa_init", is_trainable=True + ) + tokenizer = AutoTokenizer.from_pretrained(script_args.residual_model_name_or_path) + +elif script_args.base_model_name_or_path is not None: + print( + f"No available pre-processed model, manually initialize a PiSSA using {script_args.base_model_name_or_path}." + ) + model = AutoModelForCausalLM.from_pretrained( + script_args.base_model_name_or_path, + torch_dtype=( + torch.float16 + if script_args.bits == "fp16" + else (torch.bfloat16 if script_args.bits == "bf16" else torch.float32) + ), + device_map="auto", + ) + tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name_or_path) + tokenizer.pad_token_id = tokenizer.eos_token_id # use EOS as the padding token + lora_config = LoraConfig( + r=script_args.lora_r, + lora_alpha=script_args.lora_alpha, + init_lora_weights=script_args.init_lora_weights, + lora_dropout=script_args.lora_dropout, + target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"], + bias="none", + task_type="CAUSAL_LM", + ) + peft_model = get_peft_model(model, lora_config) + +print(peft_model) +peft_model.print_trainable_parameters() + +print(f"Training PiSSA with trl on the {script_args.data_path}[{script_args.dataset_split}] dataset.") +dataset = load_dataset(script_args.data_path, split=script_args.dataset_split) +dataset = dataset.map( # build the "text" column from the two fields named in --dataset_field + lambda example: { + "text": f"### USER: {example[script_args.dataset_field[0]]}\n### ASSISTANT: {example[script_args.dataset_field[1]]}" + } +) + +trainer = SFTTrainer( + model=peft_model, + args=script_args, + train_dataset=dataset, + processing_class=tokenizer, +) +trainer.train() +trainer.save_state() +############################## Upon
training completion, convert and save PiSSA in LoRA format ############################## +if script_args.convert_pissa_to_lora and script_args.residual_model_name_or_path is not None: # conversion reads the "pissa_init" adapter stored under the residual model path; without that path, fall back to saving the PiSSA adapter instead of raising after training + peft_model.save_pretrained( + os.path.join(script_args.output_dir, "pissa_lora"), + path_initial_model_for_weight_conversion=os.path.join(script_args.residual_model_name_or_path, "pissa_init"), + ) +else: + peft_model.save_pretrained( + os.path.join(script_args.output_dir, "pissa_ft"), + ) + +if script_args.merge_and_save: + model = peft_model.merge_and_unload() + model.save_pretrained(os.path.join(script_args.output_dir, "pissa_merged")) + tokenizer.save_pretrained(os.path.join(script_args.output_dir, "pissa_merged")) diff --git a/peft/examples/pissa_finetuning/preprocess.py b/peft/examples/pissa_finetuning/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..57eed4420c68388dd944aaaf99e8354764da9194 --- /dev/null +++ b/peft/examples/pissa_finetuning/preprocess.py @@ -0,0 +1,69 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import argparse +import os + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +from peft import LoraConfig, get_peft_model + + +parser = argparse.ArgumentParser(description="Initialize PiSSA on a base model and save the adapter and residual model") +parser.add_argument( + "--base_model_name_or_path", + required=True, help="The name or path of the fp32/16 base model.", +) +parser.add_argument("--output_dir", type=str, required=True, help="The directory to save the PiSSA model.") +parser.add_argument("--bits", type=str, default="bf16", choices=["bf16", "fp16", "fp32"]) +parser.add_argument( + "--init_lora_weights", type=str, default="pissa", help="(`['pissa', 'pissa_niter_[number of iters]']`)" +) +parser.add_argument("--lora_r", type=int, default=128) +parser.add_argument("--lora_alpha", type=int, default=128) +parser.add_argument("--lora_dropout", type=float, default=0.0) # a dropout probability, so fractional values such as 0.05 must parse +script_args = parser.parse_args() +print(script_args) + +model = AutoModelForCausalLM.from_pretrained( + script_args.base_model_name_or_path, + torch_dtype=( + torch.float16 + if script_args.bits == "fp16" + else (torch.bfloat16 if script_args.bits == "bf16" else torch.float32) + ), + device_map="auto", +) +tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name_or_path) +tokenizer.pad_token_id = tokenizer.eos_token_id # use EOS as the padding token +lora_config = LoraConfig( + r=script_args.lora_r, + lora_alpha=script_args.lora_alpha, + init_lora_weights=script_args.init_lora_weights, + lora_dropout=script_args.lora_dropout, + target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"], + bias="none", + task_type="CAUSAL_LM", +) +peft_model = get_peft_model(model, lora_config) + +# Save PiSSA modules: +peft_model.peft_config["default"].init_lora_weights = True # NOTE(review): presumably so that loading "pissa_init" later does not re-run the PiSSA initialization - confirm +peft_model.save_pretrained(os.path.join(script_args.output_dir, "pissa_init")) +# Save residual model: +peft_model = peft_model.unload() +peft_model.save_pretrained(script_args.output_dir) +# Save the tokenizer: +tokenizer.save_pretrained(script_args.output_dir) diff
--git a/peft/examples/poly/peft_poly_seq2seq_with_generate.ipynb b/peft/examples/poly/peft_poly_seq2seq_with_generate.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..d6768d01fc0730a357a14e4bdce96889efee028f --- /dev/null +++ b/peft/examples/poly/peft_poly_seq2seq_with_generate.ipynb @@ -0,0 +1,14776 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "2edec24d8563b583", + "metadata": { + "collapsed": false, + "execution": { + "shell.execute_reply.end": "2023-12-22T03:34:15.998083Z", + "shell.execute_reply.started": "2023-12-22T03:34:15.994854Z", + "to_execute": "2023-12-22T03:34:15.875Z" + }, + "libroFormatter": "formatter-string" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: CUDA_VISIBLE_DEVICES=0 # force using CUDA GPU device 0\n", + "env: ZE_AFFINITY_MASK=0 # force using Intel XPU device 0\n", + "env: TOKENIZERS_PARALLELISM=false\n" + ] + } + ], + "source": [ + "%env CUDA_VISIBLE_DEVICES=0 # force using CUDA GPU device 0\n", + "%env ZE_AFFINITY_MASK=0 # force using Intel XPU device 0\n", + "%env TOKENIZERS_PARALLELISM=false" + ] + }, + { + "cell_type": "markdown", + "id": "95b4cfd741795038", + "metadata": { + "id": "95b4cfd741795038", + "libroFormatter": "formatter-string" + }, + "source": [ + "## Initialize PolyModel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a5c7a99-5208-4d22-ac15-bacebe1b52f9", + "metadata": { + "execution": { + "shell.execute_reply.end": "2023-12-22T03:34:29.137789Z", + "shell.execute_reply.started": "2023-12-22T03:34:18.146604Z", + "to_execute": "2023-12-22T03:34:18.025Z" + }, + "id": "1a5c7a99-5208-4d22-ac15-bacebe1b52f9", + "libroFormatter": "formatter-string" + }, + "outputs": [], + "source": [ + "import torch\n", + "from transformers import (\n", + " AutoModelForSeq2SeqLM,\n", + " AutoTokenizer,\n", + " default_data_collator,\n", + " Seq2SeqTrainingArguments,\n", + " Seq2SeqTrainer,\n", + ")\n", + "from 
datasets import load_dataset, concatenate_datasets\n", + "from peft import PolyConfig, get_peft_model, TaskType, PeftModel, PeftConfig\n", + "\n", + "model_name_or_path = \"google/flan-t5-xl\"\n", + "\n", + "r = 8 # rank of lora in poly\n", + "n_tasks = 4 # number of tasks\n", + "n_skills = 2 # number of skills (loras)\n", + "n_splits = 4 # number of heads\n", + "\n", + "batch_size = 8\n", + "lr = 5e-5\n", + "num_epochs = 8" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "89a1d2c6-0d35-4254-b9fb-035a426d86ae", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 241, + "referenced_widgets": [ + "dc5d4672fcd149239cfe1a837094ce53", + "eded01d7629e4a4faad592e8e20a3ca3", + "5d1e94d40f514faaa5819096f167d29c", + "f98f73664a974ae7804e494425fbe20d", + "1c0bd751a3294b8ea0cf828866169121", + "6c3ed2de06fe40c09315ff72d43d5c8c", + "2e3d6b5d46db4295829002fc311a9c74", + "5ff0d4da7342457089f0961b189307f4", + "a08c4e6628bd440fb31eebbb2693f327", + "379357ab63f5479fad469c181b054bb0", + "f860e1c3467348f0802b733fbef45c15", + "567e165c27a4494bbf4810ecb7de40cf", + "015fd47fdbdf47c5a619eff218052b45", + "ec33a4325b6f4dcfb8a9fa4c80a5c704", + "3241189c875a471ab0831f0f4411d2d3", + "268fe971a0bc45c6b7c37586e0f9da49", + "8851d4a04cb9410c849b6606a812c52b", + "3b5ab7d9f27944d8ae1b172231c9c6fc", + "85f57b44dbe442a4952c65e1db4c1176", + "3f173a7293cd4ff8a54da8c8174cfb43", + "40ac1e38c100435fbe95b669c69a31c5", + "a634013728be457ba590aa333908addd", + "376242d1cfd74c88aaeaa76a6813d855", + "a17443b5713d4b60aeb85da3adce6cf2", + "91f821fb888046b6a2f8ade2cc58db2d", + "4bea407148e846babefdc88eff8a9131", + "26a75e6f6628472b91f3214505afa935", + "c7c0b0fd45dc448eb9456f58e36fb3bb", + "a14f26db56d04b8b840a9ce366e913e6", + "09fa4b156f174dbcacdf976f2b39a280", + "c9ca89486def4220967599e5b159b980", + "558b98eb76654045a5eae24170a5dc9c", + "7edf2ac4dd264843a7838a0130668757", + "c3aa97f46a60409091dc4d33a946c6d3", + "e6315c5d217b4922b461c9ac22528e62", + 
"04c34c92e4374c50bd0636c72953a8ba", + "32a6ac79c27e47c1a4b32098bfe25807", + "bb99715a25d94422b0048de94f2fe563", + "637bbd213f3345178742523d055993e6", + "6113f1920c5743aa8f2c6cc9739029e1", + "24bbd7b810b34c4c9baeed628961c64b", + "1c63e99470824a3aa0f98a94862733d5", + "98c677014f1a48ac804cec0714a22172", + "a4097270b9b947b0ad0b3b5d217eecc0", + "8eaed8cbbf1943328dc80fc43bd5b97c", + "6b1972a032af41de9bf99a6582c53f39", + "b4eb16a8153048ea9aa5c9d43b44820c", + "9bd66a63faf9416d9e774a5d8221c5f5", + "40b859a2fd68457db691bb5e7eb23591", + "533151b377d64d3484772b3173dab306", + "2cd302d306e3440dac4b70fc46741544", + "41b36d52e98249b1b506d369d2d8e994", + "ac83130fdd374b7c8f41e0f8f011ecae", + "66b0e949143e46faab77458a49a9fe1a", + "abd34fa3e94c49869ea7cf514dba6d1d", + "8ffe87ece7e54294a160540fbbbe124b", + "9c3c68da285449958a3d8745bbc50305", + "ae517eef5a004b16b4ae34cdf2aa851e", + "08a572aefb63488d8125ae3b881c0729", + "f2131e286f704514a61b5af0785dde8b", + "43d9b3de4a6949f787d9733d1ae4d18e", + "06716294f2244cc48f78af918cc063f2", + "e69b0005e91a478297d17e4089cda650", + "f92f9afc2c694f0cbcdf4ebcca98221e", + "7ffe5fd0a64c40cebc784eca83154069", + "d622b006621e4110a157fb4cb43c9762", + "874e05e0b861466ba57a08d8f5a5b7ee", + "8ebe69a07de64c3cb6dfd6433e222186", + "2aceeebfd0dc42fcbbc1b3a7e1f54c56", + "93c6f7c0d1ba49a295ae60a73bf509a9", + "6c3ebb812cfd493bb954a6b1d7455c72", + "a27edbdb4c824979b1b56e8fbd867595", + "5bdf79c178074ebf8757936190bc37b3", + "3c076081fd7942e184f8d4f171a17e1c", + "0a03bee83ddf4ad297bfdc9b4de3b075", + "6f59ae0a20cf4cc5859925e3259291a7", + "49ac9897f49843fd8c5fed4bcdfdbb56" + ] + }, + "execution": { + "shell.execute_reply.end": "2023-12-22T03:35:33.229420Z", + "shell.execute_reply.started": "2023-12-22T03:34:37.266443Z", + "to_execute": "2023-12-22T03:34:37.242Z" + }, + "id": "89a1d2c6-0d35-4254-b9fb-035a426d86ae", + "libroFormatter": "formatter-string", + "outputId": "fc90c2cc-9cab-40ed-bf4a-d76bec85b72f" + }, + "outputs": [ + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 22.43it/s]\n" + ] + } + ], + "source": [ + "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)\n", + "base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, trust_remote_code=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "29d701a4-7a4f-4eae-84bd-9e3a02b7ffca", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "execution": { + "shell.execute_reply.end": "2023-12-22T03:35:33.396336Z", + "shell.execute_reply.started": "2023-12-22T03:35:33.250286Z", + "to_execute": "2023-12-22T03:35:33.272Z" + }, + "id": "29d701a4-7a4f-4eae-84bd-9e3a02b7ffca", + "libroFormatter": "formatter-string", + "outputId": "63898f68-926e-40c4-ca13-ffd1df32fcce" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 9,441,792 || all params: 2,859,198,976 || trainable%: 0.3302\n" + ] + } + ], + "source": [ + "peft_config = PolyConfig(\n", + " task_type=TaskType.SEQ_2_SEQ_LM,\n", + " poly_type=\"poly\",\n", + " r=r,\n", + " n_tasks=n_tasks,\n", + " n_skills=n_skills,\n", + " n_splits=n_splits,\n", + ")\n", + "\n", + "model = get_peft_model(base_model, peft_config)\n", + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "markdown", + "id": "aa695c2d-cf9c-432c-ab74-7e89f816ba13", + "metadata": { + "id": "aa695c2d-cf9c-432c-ab74-7e89f816ba13", + "libroFormatter": "formatter-string" + }, + "source": [ + "## Prepare datasets\n", + "\n", + "For this example, we selected four `SuperGLUE` benchmark datasets: `boolq`, `multirc`, `rte`, and `wic`, each with a training set of 1,000 examples and an evaluation set of 100 examples." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d0b36e7eff50657c", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "d6250bff76d7454a8216572ab28e4a72", + "384d10ea2a354f24bae33c3a1d564b82", + "a2deecc9aa3d42d381d78199f6e29d1c", + "17fc618034bf4aadaef811b0e7c80eed", + "6757bc0834fc4e69b7b588ae6de14ec9", + "b9ec517b4b084d548525ac41381ef69e", + "6f3679fe9b60498da864bda9ba6d899e", + "ef16f8bac38044c3b6a092caf5da320b", + "19e50ecdda3b493184611d97724ac1fc", + "c2ed87d5599a467bba084cddb9e40713", + "2cfc492ab0ed454dbf2c4da18cd24d02", + "91e6e0685a4c4d26b6154d3ed18418eb", + "ba1864322c0d49fd915e9dcc2469ef6f", + "67b108def57749edb2564b3e507959a3", + "c04a17f40c974f378c60858473f49fd0", + "4189c6e9c59e44d3a776b49c38cc8f06", + "7550307b4e894844b8d032df7eea6d82", + "bcf20733fb504a71be5cf0455928b587", + "cc6c1b2d4fcb4ffea016a139738e1ead", + "6bd3da08b5074e81bffbfe6d92b8ce8b", + "03d340641a414362b0356e8178148d9a", + "61fec3b2596c4803924ed1fb087d52d1", + "4fc54c5844aa44f2b335824c3544a334", + "736443c7e26642379ca66ed3e5dd34cb", + "3c3d747638004a08a898cff7c6f59acc", + "1cb2dcb242334f46b7f195929dd1f341", + "f577bbac4eab439b9ccea0a49eb99d86", + "8822d3a8fa794fc0addb5885a862d205", + "10436da727ec45c8a5e8b783696636d5", + "0622e1de75f34da590f241232613cf5e", + "5b868029728541dc9da977312da38cf0", + "33add0384c36462ab44fe3e0b03f63c7", + "e227fb95b00b4af8b82286c75db84611", + "54d4afa42e9346578c0a1a193ee8caea", + "2918f1fd9e104c09967d698e11728785", + "170853712c0c4a8d997696f74090d7c6", + "fcdb61acbf0d470f881ba8f283360e0f", + "4803c9aece3346488295338254217aff", + "d725549ca34a4d54ae684e7e4741be29", + "d94392afd01246ff942af838a995379e", + "7eaee1cd25d0442092846922cdd6c413", + "483b9c219aa94ea1952e3534a02395aa", + "10d7a41588744be1b29678b4a9dfdd27", + "c4fce7e5a2b44835ab8723e0022d1e50", + "ba3a7258734b4edb86b8eef074d65222", + "4b3dc87d00ed42b0956d0bfa39bd466f", + "2c5eaf38e66d492d8661852cacc4e527", + 
"b7f169f931074d1283cbfe912f11ba98", + "e86771ea303a4b1b86ecf5128f3ea421", + "b3a18689eefa4660997034094df0df04", + "91cb4404dd794d22b2bbaf31eee207b5", + "d8594695c03e4fb7965dcbe04074d4eb", + "f972a2e10dde405c8aec8f7cd1be4317", + "6c9a5a39cb4841ba8a0b93283be0cac2", + "2deb046362ed4570a3f550f4f288529e", + "2cdddd0398e045a6a13124bf6fd85506", + "a1f5618e59d148409d6ccf4bfffca2fa", + "c870763add1745d9acfc2762f468c984", + "be70aecc5b294c4c93b0dcc09d6d1cb3", + "37b18b2ae9504b6c91066798a19a1319", + "b17eef10573f44689ce6add6231eaa19", + "3e8f08e000f248b59331d2430bbc8e3c", + "d8886d5af17f4468a26554831c9c05f1", + "b1cad4191755493893bbb46dcf27e03b", + "589120da6f464686bbeff0d44643d17d", + "ea89ea173ff3482c8a9c91dfb15946b2", + "b38693117df44071b7baaf123215ea60", + "9d716c9e43e04a6c9496620633ca28be", + "4de322f2413f44bcb03d41dcc8ff1963", + "3087335b98964b9eb4da474487ca4864", + "c010fad90578489cbfeb0764e3a11286", + "627f32f04b544b4db834b79645f36733", + "7d39222af2474a68b0db99f407ccf380", + "775687d3962242d6aff3feb0627754a9", + "92bb9917ab194a1eb1dc6fc5c4c4195d", + "43717a0ae4c043f9947d8fd844d71997", + "3f0008931054433c838a6633ca1347e6", + "da0dfe11648d4ae8a70852ce1fac87b0", + "29ad37e81b7f44a9aafa982b52f05a7a", + "bb468a6fe3f04692a211d5519aec455a", + "b1ad08dbe61b4064985ecdaa119870d8", + "d91093e80b814a018edaabe49f529ef5", + "1e3f013edc6341a0837af33ff4866d0b", + "1ed6f4595ec540729d776a81db96c403", + "cd7371ff8504454292559c18adb76645", + "d4994c6d7fa240d0ac6bb31f5c835192", + "ee1a9269b6c843e28cc49f3b5f17da96", + "a12a6638b2a54e88a020be42c139646e", + "2f2dc993705b447aa771cf0cc13c3b1d", + "69dffb139cab46b1b93bef960f702655", + "e7dc30a09a64401393e43618b51059de", + "75c397c506a04d0d9ca62e8d7f990813", + "eb7e509acfea4e1bb2f59b2fde11603d", + "b75a3d92aaa64a3098c6e1aabbc50856", + "3b056671c3fd41aeb4d6da821d562b95", + "9ac87f9e5b7847aaa90e3208ad405c23", + "cb21cadacb294854b304af8df2157299", + "30b3667258174beaa01322ffa055759b", + "48cb432d95dc49deae6077fb5c76bec3", + 
"763eec2b33234fd5ac192f25489b2844", + "f7fde1c95e3946659c6208fc52c254e9", + "6e1cdd75ae2246278c80f8e5d4e340b7", + "6af159109303490eab7192815fce0d6b", + "315575253cb9433d81f7d26770907f29", + "daf61ecd65bd41d9829e8a1872b82f33", + "54823295494d441fb9f26a70fb2c3973", + "df13aabfde0140d68db8b5a69759091c", + "cadbbc94bfd24daa933bb7d188dcdf92", + "f271a7dc607b4c05b02f6c5621203bd6", + "4e60ffab53dd4aba9e094098ed5297e6", + "a91dd2c07c51483eb326d010a82e2920", + "4188c097387a408dae680f67bd97752b", + "2a6cf3f2b4c349a9adc26351c2f0b222", + "8efe1cb67f30446a8fdaaf96782e843d", + "bdecbe50f693451b87bd331fd9e684ba", + "ff00f4c2c63a467098b119ae2259f529", + "d6754db1364144e69e3ab320aa1faeb9", + "9f1feddbe0d449a0b93fc5b1027e4319", + "1507ea5f89534b10856b99488ed5da65", + "f69205c589cf42829eea248f378a1436", + "85843fd6df264d25a4642cbeee260459", + "c2a415c28fe0418bb6032d4b91efbb07", + "b02796bf6e5547cf9418d109ad772537", + "de7a527901014a6b8abf6b714bd09535", + "143eaab6e65f4c8eb7a4314cadf323ba", + "e0603f00ceb2468692a36abd7bafaba8", + "8abf122ed835496aa09945ba8edd4688", + "663b9bd2f1af4df4b757624f53c2f2b8", + "c352adea84b043db8d43eb1c36d4bd4f", + "8fda102684834d21b4efbb472823ced9", + "ee7eccaca57b4460becfe0d5d5afb3f9", + "554a1ef44aea4ff484f0944878bb58a2", + "8ae15293ec5e4296a625087e7d965249", + "db067e50373840b19d6925deb950a20a", + "ab409caa3be24c18becaf9146b1ae69c", + "63236709413940f59d2622f2927c8d55", + "3e6d67d246e54fa497f4398e3aeddb00", + "118967bdd4a348858cc7572d36c1b736", + "8cc3c0558720400e9ed89170883f6370", + "59ee159eec154095a368efceb9d1e042", + "f78c80dd733d48a687d8a47bfc792ea4", + "9da9d7eb9a97495194a3ac4a2786e1de", + "3f1d345037604e01911ef344f3b51742", + "eeff19b29a6a4c6d9ae34e365c78c310", + "a098c442cc8149a5aa562c86fc64528e", + "badae7117ab644ffa80d099c17397329", + "13ccc23d506248d5b66ffe7732ead149", + "16938b11881741af8e6633094a4402dc", + "8d796692add94119a4e9fdc6530a6878", + "9202902fd37446cda1678c0d83e0c641", + "2cd168e5e3c4481ba151aa8a655e7ce8", + 
"f773b949a9cf46ca9fd56476398a3191", + "c6a00e1b00684bb7930fb27d6499932e", + "4395227decf642f7b8fbc6616f9ec826", + "205a7844670249bf83d468fe3af0e139", + "28a78fb894a4413e960c1e40d7df8173", + "46bd38ac919e4e66a72226c1f0da67d6", + "9557a05279544cd8a5f2ba4d3429f576", + "2e0e7f437b5d4e4086b38ee6da51dc4f", + "73a0ea365433404babb83a2d1caa9c66", + "855a155046904aaa9b91b01dd6a86088", + "d49e405525ec467bb7a69a9aaedf82d9", + "b8ccdebb7b11490e8222ff79ecfc9a33", + "9edc1d5792644f79bc04d853b13dac46", + "1b78ff7e32254777abe2c802b6879b9c", + "32168837680e41eaad4e5e4cdf09877d", + "319bf5c6332f41958974d9c3af87a382", + "c1e065fa36344f509e7863c3ec0428b8", + "5b9510c694a24afbb1c8318fea1a1bc5", + "f12d118a6b3f46b580fbe2018f4cf5e9", + "726e82f2b6c94e8eb5620c18872aabb6", + "12bb708857e84e5b893ca3e9ff176082", + "53acddb088564a73aef61618797bfe85", + "fb208f3792174ac2bdfb077450b2218f", + "0fa8ddd8da924c089221611c98e7da6e", + "b4a5fdee693d455686507306da804b17", + "045c5ac6981440c996ac7dda054fc112", + "f6a6f03ea7f140189a277742ff7082f8", + "ecff5b6dfb784c96b69d1a39b7acb171", + "43a32f261eaf42af978a6bf98502b1fc", + "8b7bb3fb502f4c3185709d6c40638d70", + "bf1d0b17974049c6ac5653ac18f1169c", + "32672cb3395045bcb9c2d370032356cc", + "4728eddf9fcc49d68d71a30379f08335", + "8676e2232dc242c39d4f19b0eea90dff", + "167e67c018e441d8baab4127b25773c6", + "88405a75b92743e589f424ee8c4d4d79", + "b5e3536d816c45488bb83336eaa5d53f", + "b9bed2c861df4c019ab4fff46b11a1a3", + "35bf380a7c6243859a459560288ffe49", + "b65af12dbf9143778412adb7b4c0bfdd", + "cb2433a0096845468b26c4bbde625ed8", + "8d4df1cf62d2427b8e850b031164ff97", + "0fd8ca256b8b49e1906a2a8e21156164", + "9801a82eda354815b6b3abbc8d1e0140", + "b10768b6bc654da8b822b4878889639b", + "c56ea889f51848e4aed83bcc46c83395", + "c75508086f6f4406a0aa9ce5a391e0ee", + "d92dbe59ace74f598efc7fbedb4c5ee6", + "675c937cf0ef4bf582f4bf90df6fa28e", + "e1cf760846bd4ba988c29665a6593220", + "768735ea9663429ba9f24efd86682f71", + "4d2a10e9307a47f4a1cbf512380c65bc", + 
"e8738d4181b04545a0418c1dd5b6b1b5", + "2aab7297963040f1900f0bc1f24e7b2a", + "8182a23b5be640cc8a48c09a4ed9585c", + "c3e198ae77684d61bf5fc30a35d8fc11", + "aaa71d52156549a4b8d7aad390497ac3", + "6387cbc474144e59aac5e3b42e714887", + "efc4f9ade28a4bb2a67c0ae4ceecbf28", + "7c105cfe1f344bf7896c7ddc0fcdc322", + "c1f71dcbe98f4ee9847af6b800979e06", + "e12dbfa20e9d40448366c9528c1a2c02", + "b3d7aad60442432e96c8c8bd3ead8427", + "0d40464e81fe4c06ac3400204116f243", + "69a0d832b77e47b8a2adcf47efe3f7ab", + "947f13ea22654b4ca6fca7ebed29d64e", + "45becb2c72714dfcb721b3a20a92d28f", + "4ddb5f1d8260448981c67308bcedecbf", + "d43db93804c24908bb6d75f26b640199", + "71d656fe70004c6db5d23d86bc6b108b", + "9f2ecadf1f3f4e399aad1882f2fe9b00", + "8826a0177e334508932a43563d2ae97d", + "f9ecbb00f95548d5b0c5cad345b1e38a", + "ac78d9726bc541df9907425442d3a51a", + "854937001e534695b08ee25f6e443962", + "06b21316e1ef41c9b7c9d943a9ff91ec", + "d38618d4c0e64b7baa61c0eca47427e5", + "86e235a532f347c781d6654c3ac25ba3", + "1727de01c47144b2958babfb91e887cb", + "5ac310c605f64948ad744bc1f196441d", + "32c1cc0d5327462d9175c74b91d67c4d", + "910d8a34abaa4f92a899dd4f5ab03d74", + "fbd92ad5a793482aac5570387e917188", + "ac81634b0e0946d690fb7d8ad7aed911", + "01097fb41b9c4cbf91300e049d9f3617", + "abeae554ecff4bb0ad8c38fa2829f706", + "a2d84cfc801f4657bade62d42be7046f", + "06a2114630234393bc0f07b3a64455f8", + "c6f31bcf48de4d9fbc7f2a2d9984b247", + "d5eace6b280e446ead2d1517801e4612", + "e29da03f6e474f1ca97ecfa6cd09658c", + "35bfe48a1b744262a8ea68bf5b5d495d", + "eed4b0b927824444aa8d875281cca1c4", + "e77316215dbe41daa8e89d8b2cc0f032", + "11c60d67f2204518b007ef47c801fbff", + "da21830428b54f76aa31b03efce202b9", + "a90285006dda4eb8a3e77964294a76ea", + "8164a08b9b3f49288963539305eadabe", + "591e6dc92b5c44418260cf659c5807cf", + "2bd3c665eb784f149dc21853103e8ff0", + "1ff4bcdb97294287ae8f3e9f2dc6bafa", + "bc265b1f822348469e3c8df0ea608abb", + "3635aa8a2cee484b945ddf7379ff4102", + "94ce06b3df27425eb8ae1a0aade4244d", + 
"e8ab0bcb4d7f4a298b2b3555d866a11f", + "5570fad8913d4bb495681b5e1dbe3950", + "47735f6c830149db965660d6b2f200d7", + "61d980530bf0408c8e2ed9a7997ba615", + "8a9f0a924d53496c8a8f228738ec140d", + "60ab33e3e0d84395a1269604f0fae91f", + "64d829532bb94214b805c2de4cf529cc", + "438c02b8134e45d5b2760b2e1f72f004", + "0f09296d37ee44b89871fa22cdd0127f", + "0fd475cb9e064d10a8ed031957cf2044", + "d1d2687d51a4442d8555ed4071837da4", + "0d80cc1f4fcc49d59e3a80862678fd86", + "9e556858d4a44b5bbc3e3af87d138a55", + "d86eb0ba47884ab081128a8761e9654b", + "620e2fa48504435a85d95c2d4b264b6e", + "af9af54477cc4972bf0f0a99c1344974", + "0b5d06cb83334b53a91c929f8e308543", + "8b9f3c47205d474b97efc6e9c6fb5f68", + "b63d8ddcad7745f3b4d7e683d23f393a", + "b83993ef787047cb9a31652fdd7f9ee7", + "f4ee33b4a2d145bab3c5c2e14c73a3f8", + "bebb055c0ac14d59a8b617399d60e602", + "888e04dcb56f4c43954d49d3e392ab25", + "6058ac7bc0b345ee8f5d2f631b7b6940", + "ec94298c63ab4b0e83c074a9d2ed4fc9", + "0d56de85d9244f38a8a0b3d84ee5d7da", + "b0053efc5ecc411f902fcf3b19cd362e", + "1d69552268b74bfb824c4f783e362949", + "de35e43c6aea490b917084a93c4571fb", + "8fdc7615fa0b412283eb3beb36b97872", + "d5542140ebd34dfaa8c66f2f3e48fe92", + "9eaafbfddcec4cdb998770d2cefc8fb7", + "a9e2ae7f987b4d9d9636c3963530d8ed", + "283159e7918540efa39d255f475dd984", + "2f5aa471e247475691da674db1d8514c", + "c67e91be78c74ba9b816e40dd5c181ae", + "d02de13e2d7843298e61b8f47d8dee33", + "76d9a70692e34521a609b337755d9901", + "8a1465ca8728490dba4fd79730ea6a30", + "3d563f4f9d28464788cab663cc814cf4", + "6fe391246afa49e08eec5793a97690db", + "47cc4883459449af8f9b35cd74b84002", + "69da0360a8ae4737b6e1af2e790f2b85", + "d7cc01fb605b4dd58cac287c36b6afea", + "638f9ac5607b42d3a467295cb8f7c50d", + "8e885a254c6f4311a3774d816e5ef5ac", + "580addd60f83499386b626c6440c6fca", + "6bb5cbb9ce7645c4ad45cdf056af0445", + "2e3ea9571d364a40aa9917a6f49b45f7", + "665f8e9a73b94639aa42743d16726a96", + "34a2309f1b78432ab51a87c964946da8", + "078db166712a4daba8e99ccdf44eb16f", + 
"dde4b35063b64a28a2fd5412eb9474f0", + "4d9c96f9caa54ac69dbee2755cfd804d", + "aa9d3e82fed541cca0fffe35b55aaabf", + "3221446fbbc24420a923884c67e0b87c", + "abf0151dabeb49eab089f921c8f364b5", + "488797401b9e419ab393ad5b2438039e", + "4394b231af2f4e1499a308c93b0ff951", + "4f1a59e4dff4470fb123ab315ded6e4f", + "5ead730bc1c34a28b5b046ae270d04e6", + "13b77ec58e65475b95d0041f90639e9a", + "deb2a67f32e64ebf87758c3ace7916e8", + "c594b48ac5b347f78c099d581dc4cd96", + "68fd129101844ab18b2b107778873d54", + "d7e07795e63c4ab78ca92961ba089b07", + "d3450ca5684b4a5680c114d29a7ce8f5", + "61ed075433b94feda586eec035251768", + "c46be587d0fe4ef0b364f822f5ff903d", + "c7f7e4ee797749c0939c9a3926937b41", + "2a4d28248796477994a17db0fb8485dc", + "0dbdea008e964ad887f112336be78449", + "47977de9d961442682805f37f7217387", + "241f6ce34fd242ca9a46f8232a9fb838", + "846eac2286dd4d6991f80b6ae03ce804", + "72858c26d6154ef3b8f90ffb0339781e", + "4085f5c89ca94a31b346452cd8009dad", + "47c175c19d6b4268a2ade2966327de78", + "f3dc7118a9fa4b188cc3a9aaf366b125", + "9f07e155db5a473abdd7b7ae0617e770", + "05feab0298c54df4a2372152d4f3a891", + "abeb4c3be90344469c004e29465e1580", + "540e6158656d412591a5442eb89d1e65", + "51a0e1eb84eb4dd6a95f30268541ccbc", + "fe7d093f30854ee1b66f080c5b8fb68b", + "880e13da115f4e2e9d413b75b0eecdcb" + ] + }, + "execution": { + "shell.execute_reply.end": "2023-12-22T03:35:36.853391Z", + "shell.execute_reply.started": "2023-12-22T03:35:33.398019Z", + "to_execute": "2023-12-22T03:35:33.384Z" + }, + "id": "d0b36e7eff50657c", + "libroFormatter": "formatter-string", + "outputId": "4198784f-15c6-4812-f96a-c3f62914dbbb", + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "boolq example: \n", + "{'input': 'Persian language -- Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. 
It is primarily spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.\\nQuestion: do iran and afghanistan speak the same language\\nA. Yes\\nB. No\\nAnswer:', 'output': 'A', 'task_name': 'boolq'}\n", + "multirc example: \n", + "{'input': 'While this process moved along, diplomacy continued its rounds. Direct pressure on the Taliban had proved unsuccessful. As one NSC staff note put it, \"Under the Taliban, Afghanistan is not so much a state sponsor of terrorism as it is a state sponsored by terrorists.\" In early 2000, the United States began a high-level effort to persuade Pakistan to use its influence over the Taliban. In January 2000, Assistant Secretary of State Karl Inderfurth and the State Department\\'s counterterrorism coordinator, Michael Sheehan, met with General Musharraf in Islamabad, dangling before him the possibility of a presidential visit in March as a reward for Pakistani cooperation. Such a visit was coveted by Musharraf, partly as a sign of his government\\'s legitimacy. He told the two envoys that he would meet with Mullah Omar and press him on Bin Laden. They left, however, reporting to Washington that Pakistan was unlikely in fact to do anything,\" given what it sees as the benefits of Taliban control of Afghanistan.\" President Clinton was scheduled to travel to India. The State Department felt that he should not visit India without also visiting Pakistan. The Secret Service and the CIA, however, warned in the strongest terms that visiting Pakistan would risk the President\\'s life. Counterterrorism officials also argued that Pakistan had not done enough to merit a presidential visit. 
But President Clinton insisted on including Pakistan in the itinerary for his trip to South Asia. His one-day stopover on March 25, 2000, was the first time a U.S. president had been there since 1969. At his meeting with Musharraf and others, President Clinton concentrated on tensions between Pakistan and India and the dangers of nuclear proliferation, but also discussed Bin Laden. President Clinton told us that when he pulled Musharraf aside for a brief, one-on-one meeting, he pleaded with the general for help regarding Bin Laden.\" I offered him the moon when I went to see him, in terms of better relations with the United States, if he\\'d help us get Bin Laden and deal with another issue or two.\" The U.S. effort continued. \\nQuestion: What did the high-level effort to persuade Pakistan include?\\nAnswer: Children, Gerd, or Dorian Popa\\nIs it true?\\nA. Yes\\nB. No\\nAnswer:', 'output': 'B', 'task_name': 'multirc'}\n", + "rte example: \n", + "{'input': 'No Weapons of Mass Destruction Found in Iraq Yet.\\nWeapons of Mass Destruction Found in Iraq.\\nIs the sentence below entailed by the sentence above?\\nA. Yes\\nB. No\\nAnswer:', 'output': 'B', 'task_name': 'rte'}\n", + "wic example: \n", + "{'input': \"Sentence 1: Do you want to come over to my place later?\\nSentence 2: A political system with no place for the less prominent groups.\\nAre 'place' in the above two sentences the same?\\nA. Yes\\nB. No\\nAnswer:\", 'output': 'B', 'task_name': 'wic'}\n" + ] + } + ], + "source": [ + "# boolq\n", + "boolq_dataset = (\n", + " load_dataset(\"super_glue\", \"boolq\")\n", + " .map(\n", + " lambda x: {\n", + " \"input\": f\"{x['passage']}\\nQuestion: {x['question']}\\nA. Yes\\nB. 
No\\nAnswer:\",\n", + " # 0 - False\n", + " # 1 - True\n", + " \"output\": [\"B\", \"A\"][int(x[\"label\"])],\n", + " \"task_name\": \"boolq\",\n", + " }\n", + " )\n", + " .select_columns([\"input\", \"output\", \"task_name\"])\n", + ")\n", + "print(\"boolq example: \")\n", + "print(boolq_dataset[\"train\"][0])\n", + "\n", + "# multirc\n", + "multirc_dataset = (\n", + " load_dataset(\"super_glue\", \"multirc\")\n", + " .map(\n", + " lambda x: {\n", + " \"input\": (\n", + " f\"{x['paragraph']}\\nQuestion: {x['question']}\\nAnswer: {x['answer']}\\nIs it\"\n", + " \" true?\\nA. Yes\\nB. No\\nAnswer:\"\n", + " ),\n", + " # 0 - False\n", + " # 1 - True\n", + " \"output\": [\"B\", \"A\"][int(x[\"label\"])],\n", + " \"task_name\": \"multirc\",\n", + " }\n", + " )\n", + " .select_columns([\"input\", \"output\", \"task_name\"])\n", + ")\n", + "print(\"multirc example: \")\n", + "print(multirc_dataset[\"train\"][0])\n", + "\n", + "# rte\n", + "rte_dataset = (\n", + " load_dataset(\"super_glue\", \"rte\")\n", + " .map(\n", + " lambda x: {\n", + " \"input\": (\n", + " f\"{x['premise']}\\n{x['hypothesis']}\\nIs the sentence below entailed by the\"\n", + " \" sentence above?\\nA. Yes\\nB. No\\nAnswer:\"\n", + " ),\n", + " # 0 - entailment\n", + " # 1 - not_entailment\n", + " \"output\": [\"A\", \"B\"][int(x[\"label\"])],\n", + " \"task_name\": \"rte\",\n", + " }\n", + " )\n", + " .select_columns([\"input\", \"output\", \"task_name\"])\n", + ")\n", + "print(\"rte example: \")\n", + "print(rte_dataset[\"train\"][0])\n", + "\n", + "# wic\n", + "wic_dataset = (\n", + " load_dataset(\"super_glue\", \"wic\")\n", + " .map(\n", + " lambda x: {\n", + " \"input\": (\n", + " f\"Sentence 1: {x['sentence1']}\\nSentence 2: {x['sentence2']}\\nAre '{x['word']}'\"\n", + " \" in the above two sentences the same?\\nA. Yes\\nB. 
No\\nAnswer:\"\n", + " ),\n", + " # 0 - False\n", + " # 1 - True\n", + " \"output\": [\"B\", \"A\"][int(x[\"label\"])],\n", + " \"task_name\": \"wic\",\n", + " }\n", + " )\n", + " .select_columns([\"input\", \"output\", \"task_name\"])\n", + ")\n", + "print(\"wic example: \")\n", + "print(wic_dataset[\"train\"][0])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9fca2225-aaee-47aa-957a-5f8ed3177cdb", + "metadata": { + "execution": { + "shell.execute_reply.end": "2023-12-22T03:35:36.858952Z", + "shell.execute_reply.started": "2023-12-22T03:35:36.855329Z", + "to_execute": "2023-12-22T03:35:36.819Z" + }, + "id": "9fca2225-aaee-47aa-957a-5f8ed3177cdb", + "libroFormatter": "formatter-string" + }, + "outputs": [], + "source": [ + "# define a task2id map\n", + "TASK2ID = {\n", + " \"boolq\": 0,\n", + " \"multirc\": 1,\n", + " \"rte\": 2,\n", + " \"wic\": 3,\n", + "}\n", + "\n", + "\n", + "def tokenize(examples):\n", + " inputs, targets = examples[\"input\"], examples[\"output\"]\n", + " features = tokenizer(inputs, max_length=512, padding=\"max_length\", truncation=True, return_tensors=\"pt\")\n", + " labels = tokenizer(targets, max_length=2, padding=\"max_length\", truncation=True, return_tensors=\"pt\")\n", + " labels = labels[\"input_ids\"]\n", + " labels[labels == tokenizer.pad_token_id] = -100\n", + " features[\"labels\"] = labels\n", + " features[\"task_ids\"] = torch.tensor([[TASK2ID[t]] for t in examples[\"task_name\"]]).long()\n", + " return features" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0bf6c31c-73cd-4eed-931b-0cad5d7290fb", + "metadata": { + "execution": { + "shell.execute_reply.end": "2023-12-22T03:35:36.929414Z", + "shell.execute_reply.started": "2023-12-22T03:35:36.860477Z", + "to_execute": "2023-12-22T03:35:36.849Z" + }, + "id": "0bf6c31c-73cd-4eed-931b-0cad5d7290fb", + "libroFormatter": "formatter-string", + "tags": [] + }, + "outputs": [], + "source": [ + "def get_superglue_dataset(\n", + " 
split=\"train\",\n", + " n_samples=500,\n", + "):\n", + " ds = concatenate_datasets(\n", + " [\n", + " boolq_dataset[split].shuffle().select(range(n_samples)),\n", + " multirc_dataset[split].shuffle().select(range(n_samples)),\n", + " rte_dataset[split].shuffle().select(range(n_samples)),\n", + " wic_dataset[split].shuffle().select(range(n_samples)),\n", + " ]\n", + " )\n", + " ds = ds.map(\n", + " tokenize,\n", + " batched=True,\n", + " remove_columns=[\"input\", \"output\", \"task_name\"],\n", + " load_from_cache_file=False,\n", + " )\n", + " return ds" + ] + }, + { + "cell_type": "markdown", + "id": "oNvh2WGlLo4z", + "metadata": { + "id": "oNvh2WGlLo4z", + "libroFormatter": "formatter-string" + }, + "source": [ + "As a toy example, we only select 1,000 from each subdataset for training and 100 each for eval." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1bf88dd1a6aaa6a5", + "metadata": { + "collapsed": false, + "execution": { + "shell.execute_reply.end": "2023-12-22T03:35:44.953151Z", + "shell.execute_reply.started": "2023-12-22T03:35:37.023791Z", + "to_execute": "2023-12-22T03:35:37.009Z" + }, + "libroFormatter": "formatter-string" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Map: 0%| | 0/4000 [00:00 **Note:** The current implementation differs from the original QA-LoRA paper's approach. + +While the QA-LoRA paper describes a direct weight modification technique using "beta shift" to modify quantized weights without full dequantization, this implementation uses a different approach: + +1. The quantized model is first dequantized to full precision +2. The QALoRA adapter weights are then merged with the dequantized model +3. The merged model must be re-quantized if quantization is still desired + + +### Memory Considerations + +This process requires significant memory (enough to hold the full dequantized model) and additional computation for the re-quantization step. 
For large models, this may not be possible on consumer hardware. + +For most use cases, we recommend keeping the base quantized model and the QALoRA adapter separate, loading them with `PeftModel.from_pretrained()` as shown in the usage example above. This approach maintains the memory efficiency benefits of quantization throughout the deployment pipeline. + + +## Citation +``` +@article{dettmers2023qlora, + title={QLoRA: Efficient Finetuning of Quantized LLMs}, + author={Dettmers, Tim and Pagnoni, Artidoro and Holtzman, Ari and Zettlemoyer, Luke}, + journal={arXiv preprint arXiv:2305.14314}, + year={2023} +} + +@article{xu2023qalora, + title={QA-LoRA: Quantization-Aware Low-Rank Adaptation of Large Language Models}, + author={Xu, Yuhui and Xie, Lingxi and Gu, Xiaotao and Chen, Xin and Chang, Heng and Zhang, Hengheng and Chen, Zhengsu and Zhang, Xiaopeng and Tian, Qi}, + journal={arXiv preprint arXiv:2309.14717}, + year={2023} +} +``` diff --git a/peft/examples/qalora_finetuning/qalora_gptq_finetuning.py b/peft/examples/qalora_finetuning/qalora_gptq_finetuning.py new file mode 100644 index 0000000000000000000000000000000000000000..7a39d264531f99b97c8854466b06a9b2979ce2a9 --- /dev/null +++ b/peft/examples/qalora_finetuning/qalora_gptq_finetuning.py @@ -0,0 +1,357 @@ +#!/usr/bin/env python3 +""" +Training script for fine-tuning language models with QALoRA using GPTQ quantization. +This script supports cached quantization to avoid repeating expensive quantization processes. +""" + +import argparse +import os + +import torch +from datasets import load_dataset +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + DataCollatorForLanguageModeling, + GPTQConfig, + Trainer, + TrainingArguments, +) + +from peft import LoraConfig, get_peft_model + + +def load_or_quantize_model( + base_model: str, tokenizer, bits: int = 4, cache_dir: str = "./quantized_models" +) -> AutoModelForCausalLM: + """ + Load a pre-quantized model from cache or quantize and cache a new one.
+ Automatically detects if the model is already GPTQ-quantized. + + Args: + base_model: Model identifier or path + tokenizer: Tokenizer for the model + bits: Bit-width for quantization (default: 4) + cache_dir: Directory to store quantized models + + Returns: + The loaded (quantized) model + """ + # First, check if the model is already GPTQ-quantized by trying to load it + print(f"Checking if {base_model} is already GPTQ-quantized...") + try: + # Try to load the model and check if it has GPTQ quantization + test_model = AutoModelForCausalLM.from_pretrained( + base_model, + device_map="auto", + torch_dtype=torch.float16, + trust_remote_code=True, # Some GPTQ models might need this + ) + + # Check if the model has GPTQ quantization attributes + has_gptq = False + for module in test_model.modules(): + if hasattr(module, "qweight") or hasattr(module, "qzeros") or "gptq" in str(type(module)).lower(): + has_gptq = True + break + + if has_gptq: + print(f"✅ Model {base_model} is already GPTQ-quantized. Using directly.") + return test_model + else: + print(f"Model {base_model} is not GPTQ-quantized. 
Will quantize it.") + # Clean up the test model to free memory + del test_model + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() + + except Exception as e: + print(f"Could not load model {base_model} directly: {e}") + print("Will attempt to quantize it...") + + # If we get here, the model needs to be quantized + os.makedirs(cache_dir, exist_ok=True) + model_id = base_model.replace("/", "_").replace("\\", "_") # Handle Windows paths too + quantized_model_path = os.path.join(cache_dir, f"{model_id}_gptq_{bits}bit") + + # Check if we already have a cached quantized version + if os.path.exists(quantized_model_path) and os.path.exists(os.path.join(quantized_model_path, "config.json")): + print(f"Loading pre-quantized model from cache: {quantized_model_path}") + return AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") + + print(f"Quantizing model and saving to cache: {quantized_model_path}") + + # Configure GPTQ for first-time quantization + gptq_config = GPTQConfig( + bits=bits, + dataset="c4", + tokenizer=tokenizer, + group_size=128, + desc_act=False, + sym=False, + ) + + # Load and quantize the model + model = AutoModelForCausalLM.from_pretrained( + base_model, device_map="auto", quantization_config=gptq_config, torch_dtype=torch.float16 + ) + + # Save the quantized model to cache + print(f"Saving quantized model to {quantized_model_path}") + model.save_pretrained(quantized_model_path) + tokenizer.save_pretrained(quantized_model_path) + + return model + + +def tokenize_and_preprocess(examples, tokenizer, max_length: int = 128): + """ + Tokenize text data and prepare it for language modeling. 
+ + Args: + examples: Dataset examples with 'text' field + tokenizer: Tokenizer to use + max_length: Maximum sequence length + + Returns: + Processed examples with input_ids and labels + """ + # Tokenize the text with truncation and padding + tokenized_output = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=max_length) + + # Preprocess labels (set pad tokens to -100 for loss masking) + labels = tokenized_output["input_ids"].copy() + labels = [[-100 if token == tokenizer.pad_token_id else token for token in seq] for seq in labels] + tokenized_output["labels"] = labels + + return tokenized_output + + +def train_model( + base_model: str, + data_path: str, + data_split: str, + output_dir: str, + batch_size: int, + num_epochs: int, + learning_rate: float, + cutoff_len: int, + use_qalora: bool, + eval_step: int, + save_step: int, + device: str, + lora_r: int, + lora_alpha: int, + lora_dropout: float, + lora_target_modules: str, + push_to_hub: bool, + qalora_group_size: int, + bits: int, +) -> None: + """ + Train a model with QALoRA and GPTQ quantization. + + Args: + base_model: Base model to fine-tune + data_path: Dataset path + data_split: Dataset configuration name, passed as the second argument to `load_dataset` + output_dir: Directory to save model outputs + batch_size: Training batch size + num_epochs: Number of training epochs + learning_rate: Learning rate + cutoff_len: Maximum sequence length + use_qalora: Whether to use QALoRA + qalora_group_size: Group size forwarded to `LoraConfig(qalora_group_size=...)` + bits: Bit-width forwarded to `load_or_quantize_model` for GPTQ quantization + eval_step: Logging interval in steps (forwarded to `logging_steps`) + save_step: Steps between saving checkpoints + device: Device string passed to `torch.device` (cuda:0, xpu:0, etc.)
+ lora_r: LoRA rank + lora_alpha: LoRA alpha + lora_dropout: LoRA dropout rate + lora_target_modules: Target modules for LoRA + push_to_hub: Whether to push to Hugging Face Hub + """ + os.environ["TOKENIZERS_PARALLELISM"] = "false" + hf_token = os.getenv("HF_TOKEN") + device = torch.device(device) + print(f"Using device: {device}") + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + # Load or quantize model + model = load_or_quantize_model(base_model, tokenizer, bits=bits) + + # Configure LoRA + target_modules = ( + lora_target_modules.split(",") + if lora_target_modules + else ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + ) + + print("use_qalora", use_qalora) + lora_config = LoraConfig( + task_type="CAUSAL_LM", + use_qalora=use_qalora, + qalora_group_size=qalora_group_size, + r=lora_r, + lora_alpha=lora_alpha, + target_modules=target_modules, + lora_dropout=lora_dropout, + bias="none", + ) + + # Get PEFT model with adapters + model = get_peft_model(model, lora_config) + + model.print_trainable_parameters() + + # Move model to device if not already there + if not hasattr(model, "device") or model.device.type != device.type: + model = model.to(device) + + # Load and prepare dataset + dataset = load_dataset(data_path, data_split) + + tokenized_datasets = { + "train": dataset["train"].map( + lambda x: tokenize_and_preprocess(x, tokenizer, max_length=cutoff_len), + batched=True, + remove_columns=["text"], + load_from_cache_file=True, + ), + "test": dataset["test"].map( + lambda x: tokenize_and_preprocess(x, tokenizer, max_length=cutoff_len), + batched=True, + remove_columns=["text"], + load_from_cache_file=True, + ), + } + + # Data collator for language modeling + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) + + # Configure training arguments + training_args = TrainingArguments( + 
output_dir=output_dir, + num_train_epochs=num_epochs, + per_device_train_batch_size=batch_size, + per_device_eval_batch_size=batch_size, + warmup_steps=100, + weight_decay=0.01, + logging_dir="./logs", + logging_steps=eval_step, + save_steps=save_step, + save_total_limit=2, + push_to_hub=push_to_hub, + gradient_accumulation_steps=16, + fp16=True, + learning_rate=learning_rate, + hub_token=hf_token, + label_names=["labels"], + ) + + # Clear accelerator cache to free memory + if torch.cuda.is_available(): + torch.cuda.empty_cache() + elif torch.xpu.is_available(): + torch.xpu.empty_cache() + + # Initialize trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["test"], + data_collator=data_collator, + ) + + # Start training + print("\nStarting training...") + trainer.train() + + # Save the final model + if push_to_hub: + trainer.push_to_hub(commit_message="Fine-tuned model with QALoRA") + + # Always save locally + model.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) + print(f"\nTraining complete. 
Model saved to {output_dir}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Fine-tune LLMs with QALoRA and GPTQ quantization") + + # Model and dataset parameters + parser.add_argument("--base_model", type=str, default="TheBloke/Llama-2-7b-GPTQ", help="Base model path or name") + parser.add_argument( + "--data_path", type=str, default="timdettmers/openassistant-guanaco", help="Dataset path or name" + ) + parser.add_argument("--data_split", type=str, default="", help="Dataset path or name") + + parser.add_argument( + "--output_dir", type=str, default="./qalora_output", help="Output directory for the fine-tuned model" + ) + parser.add_argument("--bits", type=int, default=4, help="Init quantization bits") + + # Training parameters + parser.add_argument("--batch_size", type=int, default=4, help="Batch size") + parser.add_argument("--num_epochs", type=int, default=1, help="Number of training epochs") + parser.add_argument("--learning_rate", type=float, default=3e-4, help="Learning rate") + parser.add_argument("--cutoff_len", type=int, default=128, help="Max sequence length") + + # Adapter configuration + parser.add_argument("--use_qalora", action="store_true", help="Apply QALoRA") + parser.add_argument("--qalora_group_size", type=int, default=32, help="LoRA rank") + parser.add_argument("--lora_r", type=int, default=8, help="LoRA rank") + parser.add_argument("--lora_alpha", type=int, default=16, help="LoRA alpha") + parser.add_argument("--lora_dropout", type=float, default=0.05, help="LoRA dropout rate") + parser.add_argument( + "--lora_target_modules", type=str, default=None, help="Comma-separated list of target modules for LoRA" + ) + + # Training process options + parser.add_argument("--eval_step", type=int, default=100, help="Evaluation step interval") + parser.add_argument("--save_step", type=int, default=500, help="Save step interval") + parser.add_argument("--device", type=str, default="auto", help="Device to use for training") + + 
# Hugging Face Hub options + parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to Hugging Face Hub") + + args = parser.parse_args() + + device = args.device + if args.device == "auto": + device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + + # If use_qalora isn't explicitly set in args but passed to train_model + if not args.use_qalora: + args.use_qalora = True # Default to True as in the original code + + train_model( + base_model=args.base_model, + data_path=args.data_path, + data_split=args.data_split, + output_dir=args.output_dir, + batch_size=args.batch_size, + num_epochs=args.num_epochs, + learning_rate=args.learning_rate, + cutoff_len=args.cutoff_len, + use_qalora=args.use_qalora, + eval_step=args.eval_step, + save_step=args.save_step, + device=device, + lora_r=args.lora_r, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + lora_target_modules=args.lora_target_modules, + push_to_hub=args.push_to_hub, + qalora_group_size=args.qalora_group_size, + bits=args.bits, + ) diff --git a/peft/examples/randlora_finetuning/README.md b/peft/examples/randlora_finetuning/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fa9d2d61de529445a81c5ab7183951d01dbca26e --- /dev/null +++ b/peft/examples/randlora_finetuning/README.md @@ -0,0 +1,112 @@ +# RandLora: Full-rank parameter-efficient fine-tuning of large models + +## Introduction +[RandLora](https://huggingface.co/papers/2502.00987) is a parameter-efficient fine-tuning technique that is similar to LoRA and VeRA but performs full rank updates to improve performance. RandLora can be particularly useful when adapting large models to hard tasks that require complex updates while preserving the parameter efficiency of LoRA. The full rank update of RandLora is achieved by linearly scaling random bases. 
The random bases are a collection of multiple low rank matrices such that the summation of their ranks is greater than or equal to the full rank of the parameter matrices. The trainable parameters of RandLora are two diagonal matrices (vectors) that get multiplied with the right hand low rank random bases, in a similar way to VeRA's update. To maintain low memory usage, RandLora uses a custom function that prevents storing unnecessary bases in memory for backpropagation. + +## Quick start +```python +import torch +from peft import RandLoraConfig, get_peft_model +from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer +from datasets import load_dataset + +model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") +dataset = load_dataset("timdettmers/openassistant-guanaco", split="train") +randlora_config = RandLoraConfig() + +peft_model = get_peft_model(model, randlora_config) +trainer = Trainer( + model=peft_model, + train_dataset=dataset, + dataset_text_field="text", + max_seq_length=2048, + processing_class=tokenizer, +) +trainer.train() +peft_model.save_pretrained("randlora-llama-7b") +``` + +There is no additional change needed to your standard PEFT training procedure; simply swap your `LoraConfig` for a `RandLoraConfig`. Note however that RandLora's trainable parameter count is **inversely proportional** to the rank parameter `r`. Lower `r` to increase the number of trainable parameters of RandLora, and raise `r` to reduce it. + +Run the finetuning script simply by running: +```bash +python examples/randlora_finetuning/randlora_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --data_path timdettmers/openassistant-guanaco +``` +By default, this 👆🏻 will load the model in a PEFT setup with the RandLora config. If you want to quickly compare it with LoRA, all you need to do is add `--use_lora` to the command line and reduce `--randlora_alpha` to 2x the rank. 
So the same example as above would be 👇🏻: + +```bash +python examples/randlora_finetuning/randlora_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --data_path timdettmers/openassistant-guanaco --use_lora --rank 32 --randlora_alpha 64 +``` + +RandLora can be made to use sparse or very sparse random bases. These sparse matrices can help reduce overfitting. Add `--very_sparse` to run with very sparse matrices or `--sparse` for sparse matrices: + +```bash +python examples/randlora_finetuning/randlora_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --sparse +``` + +RandLora also supports quantization. To use 4-bit quantization try: + +```bash +python examples/randlora_finetuning/randlora_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --quantize +``` + +By default, RandLora is applied to the key and value layers of the Llama model. Adding adapters on more layers will increase memory usage. If you wish to choose a different set of layers for RandLora to be applied on, you can simply define it using: +```bash +python examples/randlora_finetuning/randlora_finetuning.py --randlora_target_modules "q_proj,k_proj,v_proj" +``` + +### Full example of the script +```bash +python randlora_finetuning.py \ + --base_model "PATH_TO_MODEL" \ + --data_path "PATH_TO_DATASET" \ + --output_dir "PATH_TO_OUTPUT_DIR" \ + --batch_size 1 \ + --num_epochs 3 \ + --learning_rate 3e-4 \ + --cutoff_len 512 \ + --val_set_size 500 \ + --quantize \ + --eval_step 10 \ + --save_step 100 \ + --device "auto" \ + --rank 32 \ + --randlora_alpha 640 \ + --randlora_dropout 0.05 \ + --randlora_target_modules "k_proj,v_proj" \ + --hub_model_id "YOUR_HF_REPO" \ + --push_to_hub +``` + +## RandLora vs. LoRA +RandLora differs from LoRA and other related low rank approximation algorithms by challenging the low rank paradigm. 
RandLora adapters learn **full-rank** updates, as the [paper](https://huggingface.co/papers/2502.00987) shows that the low rank constraint of LoRA can constrain performance gains as trainable parameters increase (with higher ranks). As a result, using RandLora is specifically recommended for difficult tasks that are underfit by LoRA. However, RandLoRA also often improves performance for common tasks. If increasing LoRA's rank improves performance for your task, RandLora will most likely outperform it. + +RandLora is expected to increase performance over LoRA for equivalent amounts of trainable parameters, mostly for larger equivalent amounts (> LoRA rank 4). + +RandLora's performance increase comes with two limitations: + +1. Performance is dependent on using a large `randlora_alpha` scaling parameter (usually 20x the basis rank). This large parameter can sometimes make training the update unstable; reduce the learning rate or the scaling parameter if this is the case. + +2. Increased training time over LoRA when using very low RandLora basis ranks. + +## RandLora vs. VeRA +RandLora shares similarities with VeRA in that both algorithms use random basis combinations to address some of LoRA's limitations. The limitations addressed by each algorithm are, however, different. +VeRA aims to reduce trainable parameters beyond rank 1 LoRAs while RandLoRA reduces the performance limitation due to the low rank of the update as the trainable parameter count increases. + +RandLora is expected to: + +1. Improve performance over VeRA when more trainable parameters are required (hard tasks) + +2. Reduce memory usage over VeRA thanks to RandLora's random base sharing strategy + + +## Citation +``` +@inproceedings{2025_ICLR_RandLoRA, + title="{RandLoRA: Full rank parameter-efficient fine-tuning of large models}", + author="Albert, Paul and Zhang, Frederic Z. 
and Saratchandran, Hemanth and Rodriguez-Opazo, Cristian and van den Hengel, Anton and Abbasnejad, Ehsan", + booktitle="{International Conference on Learning Representations (ICLR)}", + year="2025" +} +``` diff --git a/peft/examples/randlora_finetuning/qrandlora_finetuning.ipynb b/peft/examples/randlora_finetuning/qrandlora_finetuning.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..887cf1f516975c7cbf2d78e04bd524e83abc111d --- /dev/null +++ b/peft/examples/randlora_finetuning/qrandlora_finetuning.ipynb @@ -0,0 +1,8100 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "CV_gQs58bsvM" + }, + "source": [ + "# Fine-tuning [Llama3-8b](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) Dataset using QRandLora (quantized RandLora) on T4 Free Colab GPU." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FuXIFTFapAMI", + "outputId": "b95d8260-65bd-405f-f1e2-8d353aa46814" + }, + "outputs": [], + "source": [ + "# Install the libraries\n", + "!pip install -q -U bitsandbytes\n", + "!pip install -q -U git+https://github.com/huggingface/transformers.git\n", + "!pip install -q -U git+https://github.com/huggingface/peft.git\n", + "!pip install -q -U git+https://github.com/huggingface/accelerate.git\n", + "!pip install -q datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 145, + "referenced_widgets": [ + "8cc86330c2af436c9af314e8c04c8c2b", + "e25f9ca445b14e3f8397779df071dfb4", + "8365680c634a44aa880317e36fa5e46e", + "0c4ac7c3db0b431397cc812f7c9e785c", + "c84f542c863043dea8a3675fa153e78d", + "b3b3f4ddd4ed4d938c923887939a0440", + "35186465f87341f683affb9399661540", + "791df472db174df69b8c9f0e200af254", + 
"6bb9c7182d2a464ea21809e59043562a", + "31c574113731403b88edc5bb0798bc6d", + "3b8bc5b9392e45758813a1db9db824a9", + "90661b333d6f496ca606b3046622660e", + "5f551f9b217e44cf8b5433f314b3844b", + "d2d81cc8296c4b10bf80b86c0a3302d3", + "7e3a386e672f4748882211227b7721a9", + "57f251691b4c453896b2508c431dfc2f", + "4bdb196cd1494f809829651ec5b6cbf8", + "7cd50bcc8fcc4b83abcda6d3604bd4cc", + "7a00aa4a97a34da39cc052c6926dbe13", + "14c73d88df9e46e3bbb6690fdb48ad07", + "0e2beab611114239b6ee48a3cbb09c49", + "006b78b5191b4fb888d98bdf6c20ec1e", + "5f6ffa1d929443a5bd9c7c550f0690f0", + "668a7f88506148a9ba2b48920afc028f", + "57b0096985ab44aea342e52795c4f999", + "a4c404e420cc4ce781ce569f9ab3f987", + "ee4e4af964ec4dd597cb04a90f0697f9", + "974e3687f18a4e1a975969b880d086aa", + "93a50117ece543d4857ba02505dc4514", + "71a3a56edbdb45669d382fef4b097e1b", + "53f287d4927541d08e2ae7d4d0b3c396", + "afa442ab223b46cb82569438c0047823" + ] + }, + "id": "wAAPv5CRmg7e", + "outputId": "687f979a-04c1-4160-d71c-4de8ecdb07d9" + }, + "outputs": [], + "source": [ + "# Required when training models/data that are gated on HuggingFace, and required for pushing models to HuggingFace\n", + "from huggingface_hub import notebook_login\n", + "\n", + "notebook_login()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MJ-5idQwzvg-" + }, + "source": [ + "### Loading the model and its tokenizer in quantized setup!\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 546, + "referenced_widgets": [ + "5924b266e95a42039634a334ff561a82", + "eadeec171e7b4c0f9e26964f031cfb71", + "feae525923d5407bb69a922954c474f7", + "00371a48e64c45cd97020a78b710e64c", + "156f95b0012449e8a0c604e6e03bf35f", + "de3757d6125a4c07b502dd60816bafec", + "8c4d6f4eea3742289a2604e66b0c6182", + "bac377ed96ae4e8db9b298bb623888ec", + "02d6cc4c2717434c895798601bda7c86", + "3f9fa554747743f8a86b40a4f7530617", + "ffe561df8772443ebf40a3b8b656079f", + 
"800e9453214848b69bc4c6ca2d5e8f79", + "6b0ec8d5f7294d44a5fa15d8ef12471e", + "7d3a7be9ed6f48988a2c4a1a4a2271cf", + "c40f583823574e40b6b29d4914143c0e", + "a4368e6da8f046aaa32f3152b7d333d1", + "8ea89e52123643268857285e0e1db1c0", + "a8514e34378d47a28fbf0831a14ede8f", + "f86b969ef69b48119619e1a424b50460", + "7725e9d443e249ada02e5ac7056d00db", + "f81756eb9e554899b0778311f2c407c4", + "3b614b9712874fac990d2c557b0791a6", + "43d12a98d90a4bf7a96c033172c646e2", + "c35b16156253402f90a432f3f07c2e0a", + "1ae1d2702da5483a85504f59939ffa39", + "1b2abf90003e4165a3293acd6a5ea9ff", + "3e45aea9f7444a4db885c4cca4c9c4ff", + "6b6ed29053ec4aaa8fc5526a35f17c2b", + "e69cd88ccbae4bb7b238fa112a60f0f9", + "8cd63d3908e4411c9fcb42bc32c8dd16", + "64624b26145b42db82f7afc36c32e117", + "cffdf12fbe97462ab74e88ccca943aeb", + "bcaf4c81ba9d437bb6223dbb22d011ed", + "8800c351b6da450eace0c3890d36c8d7", + "7037c32dfce84e70ac86537dbbc6a495", + "31c10fa464e24f97b379675a204a09b5", + "6149752353fe4f9cbb7b26bcc25199a9", + "0b145e421f4840f2872c29256b49f168", + "0f4c664612364dc89acf78eb1c740980", + "0f60f9aa76b941809e013ffcae83604a", + "1d27ab2bc6ae463a806292b68b7891f8", + "2a57bb48e1c6475abba242994a79d44a", + "6cb8065803724d80b82b06dc95ded91e", + "8301c6302df54bbc9f15295f11cec208", + "0aad2d9d1cba40cbb64308ede3242ed7", + "0711e28e06a440c2a241acbc1f90d1e8", + "77704d2e27e94cd3a0c5f6b5ceeffd1c", + "3b82b8d41b134bec9bd77ed8d4f00eb4", + "25fef90e209f4b14a73f3e39d226d913", + "f4ca7b63d7d749ff83a848e250f03ec1", + "8c149bc655a34fe5b91853c66db458a9", + "0f54e8fda93144f6a95493e6ec535e9d", + "880124db7dc04aaea09edd75e1ec7921", + "14bf612f6ad7416c8ddd6085c72eee0e", + "f7e59b47f9b74523843f37268212d566", + "a34b3fd5859a441f89cbe7f6e6df9da9", + "8f5b8c513b164dab9e0892422163c483", + "5b2a671976fa446db408d58a215b8249", + "c934919f617447cfb9226929e7a68d79", + "124a70bfad434c5c946f611c04a91c8f", + "cd11fb7d54bb43ae821f2272d075a1b3", + "fbc6a2834c5442fbb6667f1b3612bb5b", + "63899ac621ff4e9cb8e215d5ab63bef8", + 
"6b2b59d2b62b4f7da8c60ff783138397", + "c27e8ce031884a90b41d8220b1870bc4", + "88024cd312ee42c2925ebfbe52077780", + "cf6d1be81b6c4ffc81ce8fdabfc5ad28", + "07e0aed682fd4cc88fa75c0592dc04a7", + "67b4473eb8a44a96ba34983762ab38fa", + "c7e06fd82f7f4f9fb81c68e8758f2de1", + "3d9d8278667d496aaea1eaaa4d24ae93", + "47944ad8cadf4a57b170193c46d4389c", + "db05b25cb38140bdb21e6f3b7fde7e66", + "6f474268da0f4337a2ccecc1ca2098a1", + "c2ceccfdb59b4336a24003cd6bc2403d", + "a8999d04e4114693bb6be358bdbe9b83", + "2540d57e3bf545e3812da1ee72b85fc8", + "c56d8289513441688f9bc5f4b52d60a0", + "a53b4776f95f4dd38197193e6c5f649e", + "b4ba435f6d1c448f99b533bc6df32e76", + "42eb041021214110a860924d28d73409", + "17c797e08bd2493fa685918129415309", + "aea74071600f483b9e6de1a61743c03a", + "21bf14b771c14d2dab9e98a326302e14", + "35c2c635c2024bcda3265bf95d330f63", + "ec014d847e394a309b6a82c30a6fdfc5", + "4c369386ba5f4862b11a50e50130663b", + "bbdf3bb657e64fc2b0a90e78e8886480", + "d7ef74cf4a914ad38a69c84c34fff393", + "8d0f1d547c384094b10aa00a3ede3c06", + "9bdebf06b6874bbb88404f4ad14e1dbc", + "a2249f364b914662b54045a1f8d6dfd1", + "7504986b8d8d4d0da58ad79e80a81948", + "082b6990ce5e4812adc0ad6a7b376dac", + "610e1ddfb7a44d51a54ebea6dad3a5f0", + "5f60910d1e744432bdf87518f0f45874", + "2ab86b3fbd49488bb02f8205a572e752", + "9f13437a44b8434b9cc3afab998e8d3c", + "8a7c82dcbd414b24b67ccfbc562b2e38", + "d1508f5cde9a43d8abc26dd2d0c34dbd", + "9a0b012915c54abeb100f466fa99d303", + "99529129d7f0435da0fdcfc9803a2f11", + "5b56ac3009714a5a84dd8749db4a7bce", + "546b76a22f1046cd856a8fa2f9ff2d9f", + "fffbf696c07744fc8e3d81ab51dc9c90", + "a153cc3ca0cc45c18a941bd57e363ec3", + "5a8ac674153248999007a713299b2644", + "1f82a5685eef4b47a2dbf7618362907c", + "51b3af446ace409dbcdf5de499552061", + "9a12124915994b70a71ebd64b99e93e9", + "57f87d4780634d36ae8159d987c22993", + "58ea619f81bf42ddb8b166db3deb0e86", + "8bb83ae3229e4f38b1733f92f536fad0", + "5c0104210ee34ca8a072ee5121f424a1", + "34e381adbd9242759b57f2a305c5d2e3", + 
"e4e1a4338c5e46b3ba5a3bb960da7107", + "9a5072b8d16d4a1eb0652da61bda0ac8", + "1f75d85e6c7e4eb6a91b03f0c8adb644", + "9da33f07ea354b5798e85298e132b017", + "604582e8cbff4dc9876551a3307b5b77", + "a98165ee656643ad85ac9ea1447cc775", + "5cf4a57d21a545029b6448258a5ebd84", + "0d2ae3466a3447c58e23ccd2b3733deb", + "ba7f32c41f9247ec9d4c40e6396b55a9", + "ea1bdb5f2da64332960bccd967a84b4a", + "b08631e4cffa445c912da0c8eac2ef23", + "921a1a037f7b47f8b57d1da8192a437a", + "892ff4e2f0e44c23bc5c2be7547cf0bd", + "4c8e98294bd240a6869cb199caee66e1", + "4e1f5423311b4dc0930c21c9ad5a88f5", + "347540dc03d34e65b7ffbb0f5fc569aa", + "7243d8e2e1cc4043a2ee310eabd0ac09" + ] + }, + "id": "E0Nl5mWL0k2T", + "outputId": "a942d9b0-1f38-4a9b-ea20-e55bd7593920" + }, + "outputs": [], + "source": [ + "# setting up the config for 4-bit quantization of QRandLora\n", + "import torch\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n", + "\n", + "model_id = \"meta-llama/Meta-Llama-3-8B\"\n", + "bnb_config = BitsAndBytesConfig(\n", + " load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type=\"nf4\", bnb_4bit_compute_dtype=torch.bfloat16\n", + ")\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", + "model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={\"\": 0})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Xpx2Fq-icX56" + }, + "outputs": [], + "source": [ + "print(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mp2gMi1ZzGET" + }, + "source": [ + "#### Prepare model for PEFT fine-tuning" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "a9EUEDAl0ss3" + }, + "outputs": [], + "source": [ + "from peft import prepare_model_for_kbit_training\n", + "\n", + "model.gradient_checkpointing_enable()\n", + "model = prepare_model_for_kbit_training(model)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 3, + "metadata": { + "id": "gkIcwsSU01EB" + }, + "outputs": [], + "source": [ + "def print_trainable_parameters(model):\n", + " \"\"\"\n", + " Prints the number of trainable parameters in the model.\n", + " \"\"\"\n", + " trainable_params = 0\n", + " all_param = 0\n", + " for _, param in model.named_parameters():\n", + " all_param += param.numel()\n", + " if param.requires_grad:\n", + " trainable_params += param.numel()\n", + " print(\n", + " f\"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HVTAJuKyM0gX" + }, + "source": [ + "### Setup `RandLoraConfig`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ybeyl20n3dYH", + "outputId": "ea35ec70-13e4-4f23-9481-1b33b1b06dec" + }, + "outputs": [], + "source": [ + "from peft import RandLoraConfig, get_peft_model\n", + "\n", + "config = RandLoraConfig(\n", + " r=32,\n", + " randlora_alpha=640,\n", + " target_modules=[\n", + " \"q_proj\",\n", + " \"k_proj\",\n", + " \"v_proj\",\n", + " \"o_proj\",\n", + " \"gate_proj\",\n", + " \"up_proj\",\n", + " \"down_proj\",\n", + " ], # parameters specific to llama\n", + " randlora_dropout=0.05,\n", + " bias=\"none\",\n", + " task_type=\"CAUSAL_LM\",\n", + ")\n", + "\n", + "model = get_peft_model(model, config)\n", + "print_trainable_parameters(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ybeyl20n3dYH", + "outputId": "ea35ec70-13e4-4f23-9481-1b33b1b06dec" + }, + "outputs": [], + "source": [ + "print(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FCc64bfnmd3j" + }, + "source": [ + "## Step 2) Fine-tuning process 💥\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"colab": { + "base_uri": "https://localhost:8080/", + "height": 277, + "referenced_widgets": [ + "b2a19b6092c44b20886987b30f1bf48a", + "1f0efc167b3744b38ff832b71d529318", + "a06b2bd0236249999adffa44e53cf80e", + "23012118a7314a3f838870a2aee9ec90", + "dcff079d850c423a83eb70105b816ee4", + "64911f0e52e74067a1a986c5edfc7f59", + "b4a274fc9e324b80bf559c4dbd05e319", + "5fb4a4ef8afe4ea4af6655faea17f354", + "b1a03a5e9bae46129830daeeb23bf6ff", + "e297072ab5d64815b90bc89d22503378", + "67fbabb9082c4241b8f937b24e0cdd03", + "b1de7b283eeb41828e8093e60c83f2c4", + "bb640a5c858349d29c13ce5629e72f22", + "37523a6cac1047e9a261698212d47737", + "661f76474252493caae8f7d6aa8f99b7", + "849cdc1912aa4df4b0c721a8c63ca0f9", + "5962e77eea5a4d88ba6dbc5e9f51c709", + "f07c8a6ec12f46ea9e32a2208e70bccd", + "191caf3a38eb4191a35f623ce25238f9", + "30b74bd2db8d40d08408013cebcd7661", + "83c355e1418140a5bbad11bf0646b332", + "84d6d2a6afcd423f9b609cbb2d10f00e", + "51180cce01564821a170d1d4b8a9a918", + "03dd6c24f6d94fe7ab85b79d6f6cbeaf", + "f2ab2fa803e94328a237e84cd4ea0027", + "1bd0a270c7ee409c970763398e54fc36", + "e92b30d0b4234af6b5a33bff989b1b45", + "cc5ce633746949ed98418cae9f68afe3", + "4b1f795c4c004cacbf3660d935e52995", + "5690d92586494b9187147f32fa708405", + "317cda72329c4043ab0b224b46b259d3", + "076357d4bb9943bdaa1d6846897786af", + "7b3e136fc9e74a699497a947006f4f1d", + "7e2e097c703a4a0d8556733a0739469c", + "e0fd6d00f0ba4e59bdaa5779556ec4ea", + "cd318c6bfc8e421a9bfcdab16be5eaa7", + "4bc1fd9d480a4799954c69031c071b30", + "25fc6aaf37fc49fa822df29236bf2f90", + "7ac8e88f29f04b859f592a003d39836b", + "0ec2643d9fd44785addb37d9ecd23989", + "d10ba011d05045b18bbfeb9660e4d9d3", + "e56c22f77c884caaacfafd48dfa51a55", + "068eb104d5d346b1897f8cbe9860d267", + "b621c6a8c0e9440fa840d75a1b1b02fc", + "434fe18d50a14920b30fd2d0650297ac", + "353bf45a4bbc46d6a798175f152399cb", + "bfcbfe4184774fd3a8320f4f0e1baf54", + "99c5c846cc5e43429905f071670b4310", + "8d988c86648244788f6dc5aa0fea38fd", + "740604526cc44cd58b811827d4787d96", + 
"2558c2dd7d394ecf9fc67a69ce8fc97a", + "a2a7b715b16a41a288209dee1de5d2d1", + "bf77e5aaab0547f7b2beb015687552ef", + "a2c543008f444cf49972a4f35c32b8e3", + "bb7b8a9e42f6478f851236685a1392d6", + "9cfaf17064bc49a5aded0fc53dd7cd7f", + "ede66e196fa9482498f58dcdffd494a2", + "a26cc7fea1a64d7bac1769d33cc74e28", + "c2f24a8930be4b70b4bbbcf5d908b01d", + "1f59dd66813f419999336e59a3efc56a", + "026072374b7d47c194707a50f5c99099", + "63ac7dafeb27446cb30aaddf4cd27c9f", + "854e35df771f470b82a59f878a2a6a46", + "d1cbe0ab9379453588eb438d13fd272d", + "b0b7457a8b47496483da1506fb2505b3", + "c7dc386d978a44ff885763ecec94dc38", + "4a7c8dfd88db4bc893da2bced0560d47", + "27a587021d854b79a279a510a55f9d73", + "4227474e986546d1a7d31dce35a2410c", + "1561cd47c42e46368677d34e7b7084cd", + "8b9e961c837a464fb7a8c44756dc41e7", + "d4092198673141d3b4a824d629d73f64", + "d3cbfd564fe8485ba7afdb1cc54abed3", + "7e51e8e0612e46b1a3403d448b39aa50", + "061c45266c484ff6807dcaf4722fd73b", + "04188e0cec0542818894ebc6a534fb51", + "4a13203d132b45beadf140c02dc8a566" + ] + }, + "id": "s6f4z8EYmcJ6", + "outputId": "8ece7a1a-cf27-4602-c70b-ce4f3d7e11bf" + }, + "outputs": [], + "source": [ + "# Load the dataset from HF\n", + "from datasets import load_dataset\n", + "\n", + "data = load_dataset(\"timdettmers/openassistant-guanaco\")\n", + "data = data.map(lambda samples: tokenizer(samples[\"text\"]), batched=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_0MOtwf3zdZp" + }, + "source": [ + "## Training\n", + "\n", + "For the sake of the demo, we just ran it for 10 steps just to showcase how to use this integration with existing tools on the HF ecosystem." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 498 + }, + "id": "jq0nX33BmfaC", + "outputId": "94e17005-065b-48ab-9192-e3ab55c0c292" + }, + "outputs": [], + "source": [ + "import transformers\n", + "\n", + "tokenizer.pad_token = tokenizer.eos_token\n", + "\n", + "trainer = transformers.Trainer(\n", + " model=model,\n", + " train_dataset=data[\"train\"],\n", + " args=transformers.TrainingArguments(\n", + " per_device_train_batch_size=1,\n", + " gradient_accumulation_steps=4,\n", + " warmup_steps=2,\n", + " max_steps=10,\n", + " learning_rate=2e-4,\n", + " fp16=True,\n", + " logging_steps=1,\n", + " output_dir=\"path/to/your/HF/repo\", # change it to your desired repo!\n", + " optim=\"paged_adamw_8bit\",\n", + " label_names=[\"labels\"],\n", + " ),\n", + " data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n", + ")\n", + "model.config.use_cache = False # silence the warnings. Please re-enable for inference!\n", + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mr3rLrHwqhf6" + }, + "source": [ + "## Usage Example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9mrOJ9l8SMHv" + }, + "outputs": [], + "source": [ + "model.config.use_cache = True\n", + "model.eval();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 122 + }, + "id": "AM6FNOFzqKfI", + "outputId": "fdbe28b1-e440-45d3-bd6d-c15e744ad23d" + }, + "outputs": [], + "source": [ + "from transformers import GenerationConfig\n", + "\n", + "max_new_tokens = 120\n", + "top_p = 0.9\n", + "temperature = 0.7\n", + "user_question = \"What is the purpose of quantization in LLMs?\"\n", + "\n", + "\n", + "prompt = (\n", + " \"A chat between a curious human and an artificial intelligence assistant. 
\"\n", + " \"The assistant gives helpful, detailed, and polite answers to the user's questions. \"\n", + " \"### Human: {user_question}\"\n", + " \"### Assistant: \"\n", + ")\n", + "\n", + "\n", + "def generate(model, user_question, max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature):\n", + " device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + " inputs = tokenizer(prompt.format(user_question=user_question), return_tensors=\"pt\").to(device)\n", + "\n", + " outputs = model.generate(\n", + " **inputs,\n", + " generation_config=GenerationConfig(\n", + " do_sample=True,\n", + " max_new_tokens=max_new_tokens,\n", + " top_p=top_p,\n", + " temperature=temperature,\n", + " ),\n", + " )\n", + "\n", + " text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + " # print(text)\n", + " return text\n", + "\n", + "\n", + "generate(model, user_question)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "T5t_gl2_f5OO" + }, + "outputs": [], + "source": [ + "# trainer.push_to_hub()" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "00371a48e64c45cd97020a78b710e64c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3f9fa554747743f8a86b40a4f7530617", + "placeholder": "​", + "style": "IPY_MODEL_ffe561df8772443ebf40a3b8b656079f", + "value": " 50.6k/50.6k [00:00<00:00, 3.65MB/s]" + } + }, + "006b78b5191b4fb888d98bdf6c20ec1e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ee4e4af964ec4dd597cb04a90f0697f9", + "placeholder": "​", + "style": "IPY_MODEL_974e3687f18a4e1a975969b880d086aa", + "value": "Your token has been saved in your configured git credential helpers (store)." 
+ } + }, + "026072374b7d47c194707a50f5c99099": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "02d6cc4c2717434c895798601bda7c86": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "03dd6c24f6d94fe7ab85b79d6f6cbeaf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": 
"1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cc5ce633746949ed98418cae9f68afe3", + "placeholder": "​", + "style": "IPY_MODEL_4b1f795c4c004cacbf3660d935e52995", + "value": "Downloading data: 100%" + } + }, + "04188e0cec0542818894ebc6a534fb51": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "061c45266c484ff6807dcaf4722fd73b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "068eb104d5d346b1897f8cbe9860d267": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0711e28e06a440c2a241acbc1f90d1e8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f4ca7b63d7d749ff83a848e250f03ec1", + "placeholder": "​", + "style": 
"IPY_MODEL_8c149bc655a34fe5b91853c66db458a9", + "value": "model.safetensors.index.json: 100%" + } + }, + "076357d4bb9943bdaa1d6846897786af": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "07e0aed682fd4cc88fa75c0592dc04a7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_47944ad8cadf4a57b170193c46d4389c", + "placeholder": "​", + "style": "IPY_MODEL_db05b25cb38140bdb21e6f3b7fde7e66", + "value": 
"model-00001-of-00004.safetensors: 100%" + } + }, + "082b6990ce5e4812adc0ad6a7b376dac": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0aad2d9d1cba40cbb64308ede3242ed7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0711e28e06a440c2a241acbc1f90d1e8", + "IPY_MODEL_77704d2e27e94cd3a0c5f6b5ceeffd1c", + "IPY_MODEL_3b82b8d41b134bec9bd77ed8d4f00eb4" + ], + "layout": "IPY_MODEL_25fef90e209f4b14a73f3e39d226d913" + } + }, + 
"0b145e421f4840f2872c29256b49f168": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0c4ac7c3db0b431397cc812f7c9e785c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "CheckboxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "Add token as git credential?", + "description_tooltip": null, + "disabled": false, + "indent": true, + "layout": "IPY_MODEL_90661b333d6f496ca606b3046622660e", + "style": "IPY_MODEL_5f551f9b217e44cf8b5433f314b3844b", + "value": true + } + }, + "0d2ae3466a3447c58e23ccd2b3733deb": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_921a1a037f7b47f8b57d1da8192a437a", + "placeholder": "​", + "style": "IPY_MODEL_892ff4e2f0e44c23bc5c2be7547cf0bd", + "value": "generation_config.json: 100%" + } + }, + "0e2beab611114239b6ee48a3cbb09c49": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_57b0096985ab44aea342e52795c4f999", + "placeholder": "​", + "style": "IPY_MODEL_a4c404e420cc4ce781ce569f9ab3f987", + "value": "Token is valid (permission: write)." 
+ } + }, + "0ec2643d9fd44785addb37d9ecd23989": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0f4c664612364dc89acf78eb1c740980": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0f54e8fda93144f6a95493e6ec535e9d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0f60f9aa76b941809e013ffcae83604a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "124a70bfad434c5c946f611c04a91c8f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": 
null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "14bf612f6ad7416c8ddd6085c72eee0e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "14c73d88df9e46e3bbb6690fdb48ad07": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1561cd47c42e46368677d34e7b7084cd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_04188e0cec0542818894ebc6a534fb51", + "placeholder": "​", + "style": "IPY_MODEL_4a13203d132b45beadf140c02dc8a566", + "value": " 518/518 [00:00<00:00, 976.36 examples/s]" + } + }, + "156f95b0012449e8a0c604e6e03bf35f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + 
"left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "17c797e08bd2493fa685918129415309": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "191caf3a38eb4191a35f623ce25238f9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1ae1d2702da5483a85504f59939ffa39": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8cd63d3908e4411c9fcb42bc32c8dd16", + "max": 73, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_64624b26145b42db82f7afc36c32e117", + "value": 73 + } + }, + "1b2abf90003e4165a3293acd6a5ea9ff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + 
"description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cffdf12fbe97462ab74e88ccca943aeb", + "placeholder": "​", + "style": "IPY_MODEL_bcaf4c81ba9d437bb6223dbb22d011ed", + "value": " 73.0/73.0 [00:00<00:00, 4.75kB/s]" + } + }, + "1bd0a270c7ee409c970763398e54fc36": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_076357d4bb9943bdaa1d6846897786af", + "placeholder": "​", + "style": "IPY_MODEL_7b3e136fc9e74a699497a947006f4f1d", + "value": " 1.11M/1.11M [00:00<00:00, 8.23MB/s]" + } + }, + "1d27ab2bc6ae463a806292b68b7891f8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1f0efc167b3744b38ff832b71d529318": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_64911f0e52e74067a1a986c5edfc7f59", + "placeholder": "​", + "style": "IPY_MODEL_b4a274fc9e324b80bf559c4dbd05e319", + "value": "Downloading readme: 100%" + } + }, + "1f59dd66813f419999336e59a3efc56a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + 
"padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1f75d85e6c7e4eb6a91b03f0c8adb644": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1f82a5685eef4b47a2dbf7618362907c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "21bf14b771c14d2dab9e98a326302e14": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "23012118a7314a3f838870a2aee9ec90": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e297072ab5d64815b90bc89d22503378", + "placeholder": "​", + "style": "IPY_MODEL_67fbabb9082c4241b8f937b24e0cdd03", + "value": " 395/395 [00:00<00:00, 16.6kB/s]" + } + }, + "2540d57e3bf545e3812da1ee72b85fc8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2558c2dd7d394ecf9fc67a69ce8fc97a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "25fc6aaf37fc49fa822df29236bf2f90": { + "model_module": "@jupyter-widgets/base", + 
"model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "25fef90e209f4b14a73f3e39d226d913": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": 
null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "27a587021d854b79a279a510a55f9d73": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d4092198673141d3b4a824d629d73f64", + "placeholder": "​", + "style": "IPY_MODEL_d3cbfd564fe8485ba7afdb1cc54abed3", + "value": "Map: 100%" + } + }, + "2a57bb48e1c6475abba242994a79d44a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "2ab86b3fbd49488bb02f8205a572e752": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + 
"description_width": "" + } + }, + "30b74bd2db8d40d08408013cebcd7661": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "317cda72329c4043ab0b224b46b259d3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "31c10fa464e24f97b379675a204a09b5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1d27ab2bc6ae463a806292b68b7891f8", + "max": 654, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2a57bb48e1c6475abba242994a79d44a", + "value": 654 + } + }, + "31c574113731403b88edc5bb0798bc6d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", 
+ "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "347540dc03d34e65b7ffbb0f5fc569aa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "34e381adbd9242759b57f2a305c5d2e3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "35186465f87341f683affb9399661540": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": "center", + "align_self": null, + "border": null, + "bottom": null, + "display": 
"flex", + "flex": null, + "flex_flow": "column", + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "50%" + } + }, + "353bf45a4bbc46d6a798175f152399cb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_740604526cc44cd58b811827d4787d96", + "placeholder": "​", + "style": "IPY_MODEL_2558c2dd7d394ecf9fc67a69ce8fc97a", + "value": "Generating test split: 100%" + } + }, + "35c2c635c2024bcda3265bf95d330f63": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "37523a6cac1047e9a261698212d47737": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_191caf3a38eb4191a35f623ce25238f9", + "max": 20877686, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_30b74bd2db8d40d08408013cebcd7661", + "value": 20877686 + } + }, + "3b614b9712874fac990d2c557b0791a6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3b82b8d41b134bec9bd77ed8d4f00eb4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_14bf612f6ad7416c8ddd6085c72eee0e", + "placeholder": "​", + "style": "IPY_MODEL_f7e59b47f9b74523843f37268212d566", + "value": " 23.9k/23.9k [00:00<00:00, 1.51MB/s]" + } + }, + "3b8bc5b9392e45758813a1db9db824a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3d9d8278667d496aaea1eaaa4d24ae93": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": 
null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3e45aea9f7444a4db885c4cca4c9c4ff": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3f9fa554747743f8a86b40a4f7530617": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + 
"bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4227474e986546d1a7d31dce35a2410c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7e51e8e0612e46b1a3403d448b39aa50", + "max": 518, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_061c45266c484ff6807dcaf4722fd73b", + "value": 518 + } + }, + "42eb041021214110a860924d28d73409": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_4c369386ba5f4862b11a50e50130663b", + "placeholder": "​", + "style": "IPY_MODEL_bbdf3bb657e64fc2b0a90e78e8886480", + "value": " 5.00G/5.00G [00:24<00:00, 249MB/s]" + } + }, + "434fe18d50a14920b30fd2d0650297ac": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_353bf45a4bbc46d6a798175f152399cb", + "IPY_MODEL_bfcbfe4184774fd3a8320f4f0e1baf54", + "IPY_MODEL_99c5c846cc5e43429905f071670b4310" + ], + "layout": "IPY_MODEL_8d988c86648244788f6dc5aa0fea38fd" + } + }, + "43d12a98d90a4bf7a96c033172c646e2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c35b16156253402f90a432f3f07c2e0a", + "IPY_MODEL_1ae1d2702da5483a85504f59939ffa39", + "IPY_MODEL_1b2abf90003e4165a3293acd6a5ea9ff" + ], + "layout": "IPY_MODEL_3e45aea9f7444a4db885c4cca4c9c4ff" + } + }, + "47944ad8cadf4a57b170193c46d4389c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, 
+ "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4a13203d132b45beadf140c02dc8a566": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4a7c8dfd88db4bc893da2bced0560d47": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_27a587021d854b79a279a510a55f9d73", + "IPY_MODEL_4227474e986546d1a7d31dce35a2410c", + "IPY_MODEL_1561cd47c42e46368677d34e7b7084cd" + ], + "layout": "IPY_MODEL_8b9e961c837a464fb7a8c44756dc41e7" + } + }, + "4b1f795c4c004cacbf3660d935e52995": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4bc1fd9d480a4799954c69031c071b30": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_068eb104d5d346b1897f8cbe9860d267", + "placeholder": "​", + "style": "IPY_MODEL_b621c6a8c0e9440fa840d75a1b1b02fc", + "value": " 9846/9846 [00:00<00:00, 38881.08 examples/s]" + } + }, + "4bdb196cd1494f809829651ec5b6cbf8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4c369386ba5f4862b11a50e50130663b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + 
"align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4c8e98294bd240a6869cb199caee66e1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + 
"top": null, + "visibility": null, + "width": null + } + }, + "4e1f5423311b4dc0930c21c9ad5a88f5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "51180cce01564821a170d1d4b8a9a918": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_03dd6c24f6d94fe7ab85b79d6f6cbeaf", + "IPY_MODEL_f2ab2fa803e94328a237e84cd4ea0027", + "IPY_MODEL_1bd0a270c7ee409c970763398e54fc36" + ], + "layout": "IPY_MODEL_e92b30d0b4234af6b5a33bff989b1b45" + } + }, + "51b3af446ace409dbcdf5de499552061": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "53f287d4927541d08e2ae7d4d0b3c396": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "546b76a22f1046cd856a8fa2f9ff2d9f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5690d92586494b9187147f32fa708405": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": 
null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "57b0096985ab44aea342e52795c4f999": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "57f251691b4c453896b2508c431dfc2f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": 
null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "57f87d4780634d36ae8159d987c22993": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_58ea619f81bf42ddb8b166db3deb0e86", + "IPY_MODEL_8bb83ae3229e4f38b1733f92f536fad0", + "IPY_MODEL_5c0104210ee34ca8a072ee5121f424a1" + ], + "layout": "IPY_MODEL_34e381adbd9242759b57f2a305c5d2e3" + } + }, + "58ea619f81bf42ddb8b166db3deb0e86": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e4e1a4338c5e46b3ba5a3bb960da7107", + "placeholder": "​", + "style": 
"IPY_MODEL_9a5072b8d16d4a1eb0652da61bda0ac8", + "value": "Loading checkpoint shards: 100%" + } + }, + "5924b266e95a42039634a334ff561a82": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_eadeec171e7b4c0f9e26964f031cfb71", + "IPY_MODEL_feae525923d5407bb69a922954c474f7", + "IPY_MODEL_00371a48e64c45cd97020a78b710e64c" + ], + "layout": "IPY_MODEL_156f95b0012449e8a0c604e6e03bf35f" + } + }, + "5962e77eea5a4d88ba6dbc5e9f51c709": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": 
null, + "width": null + } + }, + "5a8ac674153248999007a713299b2644": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5b2a671976fa446db408d58a215b8249": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_63899ac621ff4e9cb8e215d5ab63bef8", + "max": 4, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_6b2b59d2b62b4f7da8c60ff783138397", + "value": 4 + } + }, 
+ "5b56ac3009714a5a84dd8749db4a7bce": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_51b3af446ace409dbcdf5de499552061", + "placeholder": "​", + "style": "IPY_MODEL_9a12124915994b70a71ebd64b99e93e9", + "value": " 1.17G/1.17G [00:09<00:00, 45.8MB/s]" + } + }, + "5c0104210ee34ca8a072ee5121f424a1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_604582e8cbff4dc9876551a3307b5b77", + "placeholder": "​", + "style": "IPY_MODEL_a98165ee656643ad85ac9ea1447cc775", + "value": " 4/4 [01:13<00:00, 15.74s/it]" + } + }, + "5cf4a57d21a545029b6448258a5ebd84": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0d2ae3466a3447c58e23ccd2b3733deb", + "IPY_MODEL_ba7f32c41f9247ec9d4c40e6396b55a9", + "IPY_MODEL_ea1bdb5f2da64332960bccd967a84b4a" + ], + "layout": 
"IPY_MODEL_b08631e4cffa445c912da0c8eac2ef23" + } + }, + "5f551f9b217e44cf8b5433f314b3844b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5f60910d1e744432bdf87518f0f45874": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5f6ffa1d929443a5bd9c7c550f0690f0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", 
+ "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_93a50117ece543d4857ba02505dc4514", + "placeholder": "​", + "style": "IPY_MODEL_71a3a56edbdb45669d382fef4b097e1b", + "value": "Your token has been saved to /root/.cache/huggingface/token" + } + }, + "5fb4a4ef8afe4ea4af6655faea17f354": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "604582e8cbff4dc9876551a3307b5b77": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "610e1ddfb7a44d51a54ebea6dad3a5f0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6149752353fe4f9cbb7b26bcc25199a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6cb8065803724d80b82b06dc95ded91e", + "placeholder": "​", 
+ "style": "IPY_MODEL_8301c6302df54bbc9f15295f11cec208", + "value": " 654/654 [00:00<00:00, 46.9kB/s]" + } + }, + "63899ac621ff4e9cb8e215d5ab63bef8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "63ac7dafeb27446cb30aaddf4cd27c9f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "64624b26145b42db82f7afc36c32e117": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "64911f0e52e74067a1a986c5edfc7f59": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "661f76474252493caae8f7d6aa8f99b7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + 
"description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_83c355e1418140a5bbad11bf0646b332", + "placeholder": "​", + "style": "IPY_MODEL_84d6d2a6afcd423f9b609cbb2d10f00e", + "value": " 20.9M/20.9M [00:00<00:00, 44.7MB/s]" + } + }, + "668a7f88506148a9ba2b48920afc028f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_53f287d4927541d08e2ae7d4d0b3c396", + "placeholder": "​", + "style": "IPY_MODEL_afa442ab223b46cb82569438c0047823", + "value": "Login successful" + } + }, + "67b4473eb8a44a96ba34983762ab38fa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6f474268da0f4337a2ccecc1ca2098a1", + "max": 4976698672, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_c2ceccfdb59b4336a24003cd6bc2403d", + "value": 4976698672 + } + }, + "67fbabb9082c4241b8f937b24e0cdd03": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6b0ec8d5f7294d44a5fa15d8ef12471e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8ea89e52123643268857285e0e1db1c0", + "placeholder": "​", + "style": "IPY_MODEL_a8514e34378d47a28fbf0831a14ede8f", + "value": "tokenizer.json: 100%" + } + }, + "6b2b59d2b62b4f7da8c60ff783138397": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "6b6ed29053ec4aaa8fc5526a35f17c2b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": 
null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6bb9c7182d2a464ea21809e59043562a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6cb8065803724d80b82b06dc95ded91e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": 
null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6f474268da0f4337a2ccecc1ca2098a1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7037c32dfce84e70ac86537dbbc6a495": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_0f4c664612364dc89acf78eb1c740980", + "placeholder": "​", + "style": "IPY_MODEL_0f60f9aa76b941809e013ffcae83604a", + "value": "config.json: 100%" + } + }, + "71a3a56edbdb45669d382fef4b097e1b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7243d8e2e1cc4043a2ee310eabd0ac09": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "740604526cc44cd58b811827d4787d96": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + 
"left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7504986b8d8d4d0da58ad79e80a81948": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7725e9d443e249ada02e5ac7056d00db": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "77704d2e27e94cd3a0c5f6b5ceeffd1c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0f54e8fda93144f6a95493e6ec535e9d", + "max": 23950, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_880124db7dc04aaea09edd75e1ec7921", + "value": 23950 + } + }, + "791df472db174df69b8c9f0e200af254": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + 
"top": null, + "visibility": null, + "width": null + } + }, + "7a00aa4a97a34da39cc052c6926dbe13": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7ac8e88f29f04b859f592a003d39836b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + 
"grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7b3e136fc9e74a699497a947006f4f1d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7cd50bcc8fcc4b83abcda6d3604bd4cc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7a00aa4a97a34da39cc052c6926dbe13", + "placeholder": "​", + "style": "IPY_MODEL_14c73d88df9e46e3bbb6690fdb48ad07", + "value": "Connecting..." 
+ } + }, + "7d3a7be9ed6f48988a2c4a1a4a2271cf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f86b969ef69b48119619e1a424b50460", + "max": 9085698, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_7725e9d443e249ada02e5ac7056d00db", + "value": 9085698 + } + }, + "7e2e097c703a4a0d8556733a0739469c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e0fd6d00f0ba4e59bdaa5779556ec4ea", + "IPY_MODEL_cd318c6bfc8e421a9bfcdab16be5eaa7", + "IPY_MODEL_4bc1fd9d480a4799954c69031c071b30" + ], + "layout": "IPY_MODEL_25fc6aaf37fc49fa822df29236bf2f90" + } + }, + "7e3a386e672f4748882211227b7721a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "7e51e8e0612e46b1a3403d448b39aa50": { + "model_module": "@jupyter-widgets/base", + 
"model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "800e9453214848b69bc4c6ca2d5e8f79": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6b0ec8d5f7294d44a5fa15d8ef12471e", + "IPY_MODEL_7d3a7be9ed6f48988a2c4a1a4a2271cf", + "IPY_MODEL_c40f583823574e40b6b29d4914143c0e" + ], + "layout": "IPY_MODEL_a4368e6da8f046aaa32f3152b7d333d1" + } + }, + "8301c6302df54bbc9f15295f11cec208": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8365680c634a44aa880317e36fa5e46e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "PasswordModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "PasswordModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "PasswordView", + "continuous_update": true, + "description": "Token:", + "description_tooltip": null, + "disabled": false, + "layout": "IPY_MODEL_31c574113731403b88edc5bb0798bc6d", + "placeholder": "​", + "style": "IPY_MODEL_3b8bc5b9392e45758813a1db9db824a9", + "value": "" + } + }, + "83c355e1418140a5bbad11bf0646b332": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + 
"max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "849cdc1912aa4df4b0c721a8c63ca0f9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "84d6d2a6afcd423f9b609cbb2d10f00e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + 
}, + "854e35df771f470b82a59f878a2a6a46": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8800c351b6da450eace0c3890d36c8d7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7037c32dfce84e70ac86537dbbc6a495", + "IPY_MODEL_31c10fa464e24f97b379675a204a09b5", + "IPY_MODEL_6149752353fe4f9cbb7b26bcc25199a9" + ], + "layout": "IPY_MODEL_0b145e421f4840f2872c29256b49f168" + } + }, + "880124db7dc04aaea09edd75e1ec7921": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "88024cd312ee42c2925ebfbe52077780": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "892ff4e2f0e44c23bc5c2be7547cf0bd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8a7c82dcbd414b24b67ccfbc562b2e38": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8b9e961c837a464fb7a8c44756dc41e7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8bb83ae3229e4f38b1733f92f536fad0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1f75d85e6c7e4eb6a91b03f0c8adb644", + "max": 4, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9da33f07ea354b5798e85298e132b017", + "value": 4 + } + }, + "8c149bc655a34fe5b91853c66db458a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8c4d6f4eea3742289a2604e66b0c6182": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8cc86330c2af436c9af314e8c04c8c2b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0e2beab611114239b6ee48a3cbb09c49", + "IPY_MODEL_006b78b5191b4fb888d98bdf6c20ec1e", + "IPY_MODEL_5f6ffa1d929443a5bd9c7c550f0690f0", + "IPY_MODEL_668a7f88506148a9ba2b48920afc028f" + ], + "layout": "IPY_MODEL_35186465f87341f683affb9399661540" + } + }, + "8cd63d3908e4411c9fcb42bc32c8dd16": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, 
+ "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8d0f1d547c384094b10aa00a3ede3c06": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_082b6990ce5e4812adc0ad6a7b376dac", + "placeholder": "​", + "style": "IPY_MODEL_610e1ddfb7a44d51a54ebea6dad3a5f0", + "value": "model-00003-of-00004.safetensors: 100%" + } + }, + "8d988c86648244788f6dc5aa0fea38fd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8ea89e52123643268857285e0e1db1c0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8f5b8c513b164dab9e0892422163c483": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cd11fb7d54bb43ae821f2272d075a1b3", + "placeholder": "​", + "style": "IPY_MODEL_fbc6a2834c5442fbb6667f1b3612bb5b", + "value": "Downloading shards: 100%" + } + }, + "90661b333d6f496ca606b3046622660e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "921a1a037f7b47f8b57d1da8192a437a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "93a50117ece543d4857ba02505dc4514": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + 
"margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "974e3687f18a4e1a975969b880d086aa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "99529129d7f0435da0fdcfc9803a2f11": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5a8ac674153248999007a713299b2644", + "max": 1168138808, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1f82a5685eef4b47a2dbf7618362907c", + "value": 1168138808 + } + }, + "99c5c846cc5e43429905f071670b4310": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_a2c543008f444cf49972a4f35c32b8e3", + "placeholder": "​", + "style": "IPY_MODEL_bb7b8a9e42f6478f851236685a1392d6", + "value": " 518/518 [00:00<00:00, 13408.85 examples/s]" + } + }, + "9a0b012915c54abeb100f466fa99d303": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fffbf696c07744fc8e3d81ab51dc9c90", + "placeholder": "​", + "style": "IPY_MODEL_a153cc3ca0cc45c18a941bd57e363ec3", + "value": "model-00004-of-00004.safetensors: 100%" + } + }, + "9a12124915994b70a71ebd64b99e93e9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9a5072b8d16d4a1eb0652da61bda0ac8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9bdebf06b6874bbb88404f4ad14e1dbc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5f60910d1e744432bdf87518f0f45874", + "max": 4915916176, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2ab86b3fbd49488bb02f8205a572e752", + "value": 4915916176 + } + }, + "9cfaf17064bc49a5aded0fc53dd7cd7f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ede66e196fa9482498f58dcdffd494a2", + "IPY_MODEL_a26cc7fea1a64d7bac1769d33cc74e28", + "IPY_MODEL_c2f24a8930be4b70b4bbbcf5d908b01d" + ], + "layout": "IPY_MODEL_1f59dd66813f419999336e59a3efc56a" + } + }, + "9da33f07ea354b5798e85298e132b017": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9f13437a44b8434b9cc3afab998e8d3c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a06b2bd0236249999adffa44e53cf80e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5fb4a4ef8afe4ea4af6655faea17f354", + "max": 395, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_b1a03a5e9bae46129830daeeb23bf6ff", + "value": 395 + } + }, + "a153cc3ca0cc45c18a941bd57e363ec3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a2249f364b914662b54045a1f8d6dfd1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9f13437a44b8434b9cc3afab998e8d3c", + "placeholder": "​", + "style": "IPY_MODEL_8a7c82dcbd414b24b67ccfbc562b2e38", + "value": " 4.92G/4.92G [00:32<00:00, 171MB/s]" + } + }, + "a26cc7fea1a64d7bac1769d33cc74e28": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_854e35df771f470b82a59f878a2a6a46", + "max": 9846, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d1cbe0ab9379453588eb438d13fd272d", + "value": 9846 + } + }, + "a2a7b715b16a41a288209dee1de5d2d1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a2c543008f444cf49972a4f35c32b8e3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": 
null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a34b3fd5859a441f89cbe7f6e6df9da9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8f5b8c513b164dab9e0892422163c483", + "IPY_MODEL_5b2a671976fa446db408d58a215b8249", + "IPY_MODEL_c934919f617447cfb9226929e7a68d79" + ], + "layout": "IPY_MODEL_124a70bfad434c5c946f611c04a91c8f" + } + }, + "a4368e6da8f046aaa32f3152b7d333d1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + 
"right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a4c404e420cc4ce781ce569f9ab3f987": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a53b4776f95f4dd38197193e6c5f649e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_aea74071600f483b9e6de1a61743c03a", + "placeholder": "​", + "style": "IPY_MODEL_21bf14b771c14d2dab9e98a326302e14", + "value": "model-00002-of-00004.safetensors: 100%" + } + }, + "a8514e34378d47a28fbf0831a14ede8f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a8999d04e4114693bb6be358bdbe9b83": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a98165ee656643ad85ac9ea1447cc775": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "aea74071600f483b9e6de1a61743c03a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, 
+ "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "afa442ab223b46cb82569438c0047823": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b08631e4cffa445c912da0c8eac2ef23": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + 
"justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b0b7457a8b47496483da1506fb2505b3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b1a03a5e9bae46129830daeeb23bf6ff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "b1de7b283eeb41828e8093e60c83f2c4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_bb640a5c858349d29c13ce5629e72f22", + "IPY_MODEL_37523a6cac1047e9a261698212d47737", + "IPY_MODEL_661f76474252493caae8f7d6aa8f99b7" + ], + "layout": "IPY_MODEL_849cdc1912aa4df4b0c721a8c63ca0f9" + } + }, + "b2a19b6092c44b20886987b30f1bf48a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1f0efc167b3744b38ff832b71d529318", + "IPY_MODEL_a06b2bd0236249999adffa44e53cf80e", + "IPY_MODEL_23012118a7314a3f838870a2aee9ec90" + ], + "layout": "IPY_MODEL_dcff079d850c423a83eb70105b816ee4" + } + }, + "b3b3f4ddd4ed4d938c923887939a0440": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_57f251691b4c453896b2508c431dfc2f", + "placeholder": "​", + "style": "IPY_MODEL_4bdb196cd1494f809829651ec5b6cbf8", + "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. " + } + }, + "b4a274fc9e324b80bf559c4dbd05e319": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b4ba435f6d1c448f99b533bc6df32e76": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_35c2c635c2024bcda3265bf95d330f63", + "max": 4999802720, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ec014d847e394a309b6a82c30a6fdfc5", + "value": 4999802720 + } + }, + "b621c6a8c0e9440fa840d75a1b1b02fc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + 
"ba7f32c41f9247ec9d4c40e6396b55a9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4c8e98294bd240a6869cb199caee66e1", + "max": 177, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4e1f5423311b4dc0930c21c9ad5a88f5", + "value": 177 + } + }, + "bac377ed96ae4e8db9b298bb623888ec": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"bb640a5c858349d29c13ce5629e72f22": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5962e77eea5a4d88ba6dbc5e9f51c709", + "placeholder": "​", + "style": "IPY_MODEL_f07c8a6ec12f46ea9e32a2208e70bccd", + "value": "Downloading data: 100%" + } + }, + "bb7b8a9e42f6478f851236685a1392d6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bbdf3bb657e64fc2b0a90e78e8886480": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bcaf4c81ba9d437bb6223dbb22d011ed": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bf77e5aaab0547f7b2beb015687552ef": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "bfcbfe4184774fd3a8320f4f0e1baf54": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a2a7b715b16a41a288209dee1de5d2d1", + "max": 518, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_bf77e5aaab0547f7b2beb015687552ef", + "value": 518 + } + }, + "c27e8ce031884a90b41d8220b1870bc4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + 
"grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c2ceccfdb59b4336a24003cd6bc2403d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c2f24a8930be4b70b4bbbcf5d908b01d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b0b7457a8b47496483da1506fb2505b3", + "placeholder": "​", + "style": "IPY_MODEL_c7dc386d978a44ff885763ecec94dc38", + "value": " 9846/9846 [00:09<00:00, 1066.17 examples/s]" + } + }, + "c35b16156253402f90a432f3f07c2e0a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6b6ed29053ec4aaa8fc5526a35f17c2b", + "placeholder": "​", + "style": "IPY_MODEL_e69cd88ccbae4bb7b238fa112a60f0f9", + "value": "special_tokens_map.json: 100%" + } + }, + "c40f583823574e40b6b29d4914143c0e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f81756eb9e554899b0778311f2c407c4", + "placeholder": "​", + "style": "IPY_MODEL_3b614b9712874fac990d2c557b0791a6", + "value": " 9.09M/9.09M [00:00<00:00, 19.3MB/s]" + } + }, + "c56d8289513441688f9bc5f4b52d60a0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a53b4776f95f4dd38197193e6c5f649e", + "IPY_MODEL_b4ba435f6d1c448f99b533bc6df32e76", + "IPY_MODEL_42eb041021214110a860924d28d73409" + ], + "layout": "IPY_MODEL_17c797e08bd2493fa685918129415309" + } + }, + "c7dc386d978a44ff885763ecec94dc38": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c7e06fd82f7f4f9fb81c68e8758f2de1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a8999d04e4114693bb6be358bdbe9b83", + "placeholder": "​", + "style": "IPY_MODEL_2540d57e3bf545e3812da1ee72b85fc8", + "value": " 4.98G/4.98G [00:34<00:00, 232MB/s]" + } + }, + "c84f542c863043dea8a3675fa153e78d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + "description": "Login", + "disabled": false, + "icon": "", + "layout": "IPY_MODEL_d2d81cc8296c4b10bf80b86c0a3302d3", + "style": "IPY_MODEL_7e3a386e672f4748882211227b7721a9", + "tooltip": "" + } + }, + "c934919f617447cfb9226929e7a68d79": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + 
"_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c27e8ce031884a90b41d8220b1870bc4", + "placeholder": "​", + "style": "IPY_MODEL_88024cd312ee42c2925ebfbe52077780", + "value": " 4/4 [01:41<00:00, 22.30s/it]" + } + }, + "cc5ce633746949ed98418cae9f68afe3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cd11fb7d54bb43ae821f2272d075a1b3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": 
null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cd318c6bfc8e421a9bfcdab16be5eaa7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d10ba011d05045b18bbfeb9660e4d9d3", + "max": 9846, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e56c22f77c884caaacfafd48dfa51a55", + "value": 9846 + } + }, + "cf6d1be81b6c4ffc81ce8fdabfc5ad28": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + 
"IPY_MODEL_07e0aed682fd4cc88fa75c0592dc04a7", + "IPY_MODEL_67b4473eb8a44a96ba34983762ab38fa", + "IPY_MODEL_c7e06fd82f7f4f9fb81c68e8758f2de1" + ], + "layout": "IPY_MODEL_3d9d8278667d496aaea1eaaa4d24ae93" + } + }, + "cffdf12fbe97462ab74e88ccca943aeb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d10ba011d05045b18bbfeb9660e4d9d3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": 
null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d1508f5cde9a43d8abc26dd2d0c34dbd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9a0b012915c54abeb100f466fa99d303", + "IPY_MODEL_99529129d7f0435da0fdcfc9803a2f11", + "IPY_MODEL_5b56ac3009714a5a84dd8749db4a7bce" + ], + "layout": "IPY_MODEL_546b76a22f1046cd856a8fa2f9ff2d9f" + } + }, + "d1cbe0ab9379453588eb438d13fd272d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "d2d81cc8296c4b10bf80b86c0a3302d3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + 
"model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d3cbfd564fe8485ba7afdb1cc54abed3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d4092198673141d3b4a824d629d73f64": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d7ef74cf4a914ad38a69c84c34fff393": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_8d0f1d547c384094b10aa00a3ede3c06", + "IPY_MODEL_9bdebf06b6874bbb88404f4ad14e1dbc", + "IPY_MODEL_a2249f364b914662b54045a1f8d6dfd1" + ], + "layout": "IPY_MODEL_7504986b8d8d4d0da58ad79e80a81948" + } + }, + "db05b25cb38140bdb21e6f3b7fde7e66": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + 
"dcff079d850c423a83eb70105b816ee4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "de3757d6125a4c07b502dd60816bafec": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": 
null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e0fd6d00f0ba4e59bdaa5779556ec4ea": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7ac8e88f29f04b859f592a003d39836b", + "placeholder": "​", + "style": "IPY_MODEL_0ec2643d9fd44785addb37d9ecd23989", + "value": "Generating train split: 100%" + } + }, + "e25f9ca445b14e3f8397779df071dfb4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_791df472db174df69b8c9f0e200af254", + "placeholder": "​", + "style": "IPY_MODEL_6bb9c7182d2a464ea21809e59043562a", + "value": "


Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" + } + }, + "e297072ab5d64815b90bc89d22503378": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e4e1a4338c5e46b3ba5a3bb960da7107": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e56c22f77c884caaacfafd48dfa51a55": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e69cd88ccbae4bb7b238fa112a60f0f9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e92b30d0b4234af6b5a33bff989b1b45": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": 
null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ea1bdb5f2da64332960bccd967a84b4a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_347540dc03d34e65b7ffbb0f5fc569aa", + "placeholder": "​", + "style": "IPY_MODEL_7243d8e2e1cc4043a2ee310eabd0ac09", + "value": " 177/177 [00:00<00:00, 11.4kB/s]" + } + }, + "eadeec171e7b4c0f9e26964f031cfb71": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_de3757d6125a4c07b502dd60816bafec", + "placeholder": "​", + "style": 
"IPY_MODEL_8c4d6f4eea3742289a2604e66b0c6182", + "value": "tokenizer_config.json: 100%" + } + }, + "ec014d847e394a309b6a82c30a6fdfc5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ede66e196fa9482498f58dcdffd494a2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_026072374b7d47c194707a50f5c99099", + "placeholder": "​", + "style": "IPY_MODEL_63ac7dafeb27446cb30aaddf4cd27c9f", + "value": "Map: 100%" + } + }, + "ee4e4af964ec4dd597cb04a90f0697f9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f07c8a6ec12f46ea9e32a2208e70bccd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f2ab2fa803e94328a237e84cd4ea0027": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5690d92586494b9187147f32fa708405", + "max": 1105272, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_317cda72329c4043ab0b224b46b259d3", + "value": 1105272 + } + }, + "f4ca7b63d7d749ff83a848e250f03ec1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f7e59b47f9b74523843f37268212d566": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f81756eb9e554899b0778311f2c407c4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f86b969ef69b48119619e1a424b50460": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fbc6a2834c5442fbb6667f1b3612bb5b": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "feae525923d5407bb69a922954c474f7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bac377ed96ae4e8db9b298bb623888ec", + "max": 50566, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_02d6cc4c2717434c895798601bda7c86", + "value": 50566 + } + }, + "ffe561df8772443ebf40a3b8b656079f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fffbf696c07744fc8e3d81ab51dc9c90": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/peft/examples/randlora_finetuning/randlora_finetuning.py b/peft/examples/randlora_finetuning/randlora_finetuning.py new file mode 100644 index 0000000000000000000000000000000000000000..d6b4d8a24ad86e595b03bdda956265d98855cf47 --- /dev/null +++ b/peft/examples/randlora_finetuning/randlora_finetuning.py @@ -0,0 +1,230 @@ +# This script is based on examples/dora_finetuning/dora_finetuning.py +import os + +import torch +from datasets import load_dataset +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + DataCollatorForLanguageModeling, + Trainer, + TrainingArguments, +) + +from peft import LoraConfig, RandLoraConfig, get_peft_model, prepare_model_for_kbit_training + + +def train_model( + base_model: str, + data_path: str, + output_dir: str, + batch_size: int, + num_epochs: int, + learning_rate: float, + cutoff_len: int, + val_set_size: int, + use_lora: bool, + quantize: bool, + eval_step: int, + save_step: int, + device: str, + rank: int, + randlora_alpha: int, + randlora_dropout: float, + randlora_target_modules: str, + hub_model_id: str, + 
push_to_hub: bool, + sparse: bool, + very_sparse: bool, +): + os.environ["TOKENIZERS_PARALLELISM"] = "false" + hf_token = os.getenv("HF_TOKEN") + + # Setup device + device = torch.device(device) + print(f"Using device: {device}") + + # load tokenizer + tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token) + + # Compute type + device_type = device.type + device_module = getattr(torch, device_type, torch.cuda) + bf16_suppotrted = device_module.is_available() and device_module.is_bf16_supported() + torch_dtype = torch.bfloat16 if bf16_suppotrted else torch.float16 + + # QRandLora (quantized randlora): IF YOU WANNA QUANTIZE THE MODEL + if quantize: + model = AutoModelForCausalLM.from_pretrained( + base_model, + token=hf_token, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.bfloat16 if bf16_suppotrted else torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ), + torch_dtype=torch_dtype, + ) + # setup for quantized training + model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True) + else: + model = AutoModelForCausalLM.from_pretrained( + base_model, + torch_dtype=torch_dtype, + token=hf_token, + ) + # LoRa config for the PEFT model + if use_lora: + peft_config = LoraConfig( + r=rank, # Rank of matrix + lora_alpha=randlora_alpha, + target_modules=(randlora_target_modules.split(",") if randlora_target_modules else ["k_proj", "v_proj"]), + lora_dropout=randlora_dropout, + bias="none", + ) + else: + peft_config = RandLoraConfig( + r=rank, # Rank of random bases + randlora_alpha=randlora_alpha, + target_modules=(randlora_target_modules.split(",") if randlora_target_modules else ["k_proj", "v_proj"]), + randlora_dropout=randlora_dropout, + bias="none", + sparse=sparse, + very_sparse=very_sparse, + ) + + # get the peft model with RandLora config + model = get_peft_model(model, peft_config) + + model.to(device) # MODEL TO ACCELERATOR + tokenizer.pad_token = 
tokenizer.eos_token + + # Load the dataset + dataset = load_dataset(data_path) + + def tokenize_function(examples): + inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=cutoff_len) + inputs["labels"] = inputs["input_ids"].copy() # setting labels for a language modeling task + return inputs + + # Tokenize the dataset and prepare for training + tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names) + + # Data collator to dynamically pad the batched examples + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) + + # Compute the total amount of training step for warmup + max_steps = int((len(dataset) // batch_size) * num_epochs) + + # Define training arguments + training_args = TrainingArguments( + output_dir=output_dir, + num_train_epochs=num_epochs, + per_device_train_batch_size=batch_size, + per_device_eval_batch_size=batch_size, + warmup_steps=int(max_steps * 0.1), # 10% of total trainig steps + weight_decay=0.01, + logging_dir="./logs", + logging_steps=eval_step, + save_steps=save_step, + save_total_limit=2, + push_to_hub=push_to_hub, + hub_model_id=hub_model_id, + gradient_accumulation_steps=16 + // batch_size, # Maintaining a minimum batch size of 16 post accumulation is recommended to ensure good performance + learning_rate=learning_rate, + hub_token=hf_token, + label_names=["labels"], + ) + + # Clear accelerator cache to free memory + device_module.empty_cache() + + # Initialize the Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["test"], + data_collator=data_collator, + ) + + # Start model training + trainer.train() + + # Save and push the trained model and tokenizer + if push_to_hub: + # Push the main model to the hub + trainer.push_to_hub(commit_message="Fine-tuned model") + + # Save the model and tokenizer locally + model.save_pretrained(output_dir) + 
tokenizer.save_pretrained(output_dir) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Fine-tune LLaMA with DoRA and PEFT") + parser.add_argument("--base_model", type=str, default="huggyllama/llama-7b", help="Base model path or name") + parser.add_argument( + "--data_path", type=str, default="timdettmers/openassistant-guanaco", help="Dataset path or name" + ) + parser.add_argument( + "--output_dir", type=str, default="path/to/output", help="Output directory for the fine-tuned model" + ) + parser.add_argument("--batch_size", type=int, default=1, help="Batch size") + parser.add_argument("--num_epochs", type=int, default=1, help="Number of training epochs") + parser.add_argument("--learning_rate", type=float, default=3e-4, help="Learning rate") + parser.add_argument("--cutoff_len", type=int, default=512, help="Cutoff length for tokenization") + parser.add_argument("--val_set_size", type=int, default=500, help="Validation set size") + parser.add_argument("--use_lora", action="store_true", help="Apply Lora instead of RandLora") + parser.add_argument("--quantize", action="store_true", help="Use quantization") + parser.add_argument("--eval_step", type=int, default=10, help="Evaluation step interval") + parser.add_argument("--save_step", type=int, default=100, help="Save step interval") + parser.add_argument("--device", type=str, default="auto", help="Device to use for training") + parser.add_argument("--rank", type=int, default=32, help="RandLora basis rank") + parser.add_argument("--randlora_alpha", type=int, default=640, help="RandLora alpha") + parser.add_argument("--randlora_dropout", type=float, default=0.05, help="RandLora dropout rate") + parser.add_argument( + "--randlora_target_modules", type=str, default=None, help="Comma-separated list of target modules for RandLora" + ) + parser.add_argument("--sparse", action="store_true", help="Use sparse matrix multiplication") + parser.add_argument("--very_sparse", 
action="store_true", help="Use very sparse matrix multiplication") + parser.add_argument( + "--hub_model_id", + type=str, + default="path/to/repo", + help="Repository name to push the model on the Hugging Face Hub", + ) + parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to Hugging Face Hub") + args = parser.parse_args() + + if args.device == "auto": + args.device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + + train_model( + base_model=args.base_model, + data_path=args.data_path, + output_dir=args.output_dir, + batch_size=args.batch_size, + num_epochs=args.num_epochs, + learning_rate=args.learning_rate, + cutoff_len=args.cutoff_len, + val_set_size=args.val_set_size, + use_lora=args.use_lora, + quantize=args.quantize, + eval_step=args.eval_step, + save_step=args.save_step, + device=args.device, + rank=args.rank, + randlora_alpha=args.randlora_alpha, + randlora_dropout=args.randlora_dropout, + randlora_target_modules=args.randlora_target_modules, + hub_model_id=args.hub_model_id, + push_to_hub=args.push_to_hub, + sparse=args.sparse, + very_sparse=args.very_sparse, + ) diff --git a/peft/examples/road_finetuning/README.md b/peft/examples/road_finetuning/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b9ce14017cf8dbd06e3d8d142416352ff514c7c0 --- /dev/null +++ b/peft/examples/road_finetuning/README.md @@ -0,0 +1,88 @@ +# RoAd: 3-in-1: 2D Rotary Adaptation for Efficient Finetuning, Efficient Batching and Composability + + +## Introduction + +[RoAd](https://arxiv.org/pdf/2409.00119) is a novel method that adapts LLMs using simple 2D rotations. It is highly parameter-efficient, +achieving strong performance with less than 0.1% trainable parameters. +RoAd also supports efficient serving of mixed-adapter requests within a batch, incurring only element-wise computation overhead rather than costly batch matrix multiplications. 
+Additionally, it improves model interpretability through structured and composable transformations. + +## Quick start +```python +import torch +from peft import RoadConfig, get_peft_model +from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer +from datasets import load_dataset + +model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", device_map="cuda") +tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") +dataset = load_dataset("timdettmers/openassistant-guanaco", split="train") +road_config = RoadConfig( + variant="road_1", +) +peft_model = get_peft_model(model, road_config) +trainer = Trainer( + model=peft_model, + train_dataset=dataset, + dataset_text_field="text", + max_seq_length=2048, + tokenizer=tokenizer, +) +trainer.train() +peft_model.save_pretrained("road-llama-3-8b") +``` + +RoAd requires a higher learning rate than LoRA and similar approaches; set it to around 1e-3. + +Run the finetuning script with: + +```bash +python examples/road_finetuning/road_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --data_path timdettmers/openassistant-guanaco +``` + +RoAd also supports quantization. To use 4-bit quantization try: + +```bash +python examples/road_finetuning/road_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --quantize +``` + +### Full example of the script +```bash +python road_finetuning.py \ + --base_model "PATH_TO_MODEL" \ + --data_path "PATH_TO_DATASET" \ + --output_dir "PATH_TO_OUTPUT_DIR" \ + --batch_size 1 \ + --num_epochs 3 \ + --learning_rate 1e-3 \ + --cutoff_len 512 \ + --val_set_size 500 \ + --quantize \ + --eval_step 10 \ + --save_step 100 \ + --device "cuda:0" \ + --variant road_1 \ + --road_target_modules "q_proj,k_proj,v_proj,o_proj" \ + --hub_model_id "YOUR_HF_REPO" \ + --push_to_hub +``` +## Use the model on 🤗 +You can load and use the model like any other 🤗 model. 
+```python +from transformers import AutoModel +model = AutoModel.from_pretrained("ppetrushkov/llama-2-7b-sql-road-test") +``` + + +## Citation +``` +@inproceedings{ + liao2024in, + title={3-in-1: 2D Rotary Adaptation for Efficient Finetuning, Efficient Batching and Composability}, + author={Baohao Liao and Christof Monz}, + booktitle={The Thirty-eighth Annual Conference on Neural Information Processing Systems}, + year={2024}, + url={https://openreview.net/forum?id=rYjYwuM6yH} +} +``` diff --git a/peft/examples/road_finetuning/road_finetuning.py b/peft/examples/road_finetuning/road_finetuning.py new file mode 100644 index 0000000000000000000000000000000000000000..0469785db44a15f622962bc3da8447390c511290 --- /dev/null +++ b/peft/examples/road_finetuning/road_finetuning.py @@ -0,0 +1,203 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import torch +from datasets import load_dataset +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + DataCollatorForLanguageModeling, + Trainer, + TrainingArguments, +) + +from peft import RoadConfig, get_peft_model, prepare_model_for_kbit_training + + +def train_model( + base_model: str, + data_path: str, + output_dir: str, + batch_size: int, + num_epochs: int, + learning_rate: float, + cutoff_len: int, + val_set_size: int, + quantize: bool, + eval_step: int, + save_step: int, + device: str, + variant: str, + road_target_modules: str, + hub_model_id: str, + push_to_hub: bool, +): + os.environ["TOKENIZERS_PARALLELISM"] = "false" + hf_token = os.getenv("HF_TOKEN") + + # Setup device + device = torch.device(device) + print(f"Using device: {device}") + + # load tokenizer + tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token) + + # Optionally load the base model with 4-bit quantization + if quantize: + model = AutoModelForCausalLM.from_pretrained( + base_model, + token=hf_token, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=( + torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16 + ), + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ), + ) + # setup for quantized training + model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True) + else: + model = AutoModelForCausalLM.from_pretrained(base_model, token=hf_token, device_map="auto") + # RoAd config for the PEFT model + road_config = RoadConfig( + variant=variant, # RoAd variant: "road_1", "road_2" or "road_4" + target_modules=( + road_target_modules.split(",") + if road_target_modules + else ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + ), + ) + + # get the peft model with RoAd config + model = get_peft_model(model, road_config) + + model.to(device) # move the model to the requested device + 
tokenizer.pad_token = tokenizer.eos_token + + # Load the dataset + dataset 
= load_dataset(data_path) + + def tokenize_function(examples): + inputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=cutoff_len) + inputs["labels"] = inputs["input_ids"].copy() # setting labels for a language modeling task + return inputs + + # Tokenize the dataset and prepare for training + tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names) + + # Data collator to dynamically pad the batched examples + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) + + # Define training arguments + training_args = TrainingArguments( + output_dir=output_dir, + num_train_epochs=num_epochs, + per_device_train_batch_size=batch_size, + per_device_eval_batch_size=batch_size, + warmup_steps=100, + weight_decay=0.01, + logging_dir="./logs", + logging_steps=eval_step, + save_steps=save_step, + save_total_limit=2, + push_to_hub=push_to_hub, + hub_model_id=hub_model_id, + gradient_accumulation_steps=16, + fp16=True, + learning_rate=learning_rate, + hub_token=hf_token, + ) + + # Clear CUDA cache to free memory + torch.cuda.empty_cache() + + # Initialize the Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["test"], + data_collator=data_collator, + ) + + # Start model training + trainer.train() + + # Save and push the trained model and tokenizer + if push_to_hub: + # Push the main model to the hub + trainer.push_to_hub(commit_message="Fine-tuned model") + + # Save the model and tokenizer locally + model.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Fine-tune LLaMA with DoRA and PEFT") + parser.add_argument("--base_model", type=str, default="huggyllama/llama-7b", help="Base model path or name") + parser.add_argument( + "--data_path", type=str, 
default="timdettmers/openassistant-guanaco", help="Dataset path or name" + ) + parser.add_argument( + "--output_dir", type=str, default="path/to/output", help="Output directory for the fine-tuned model" + ) + parser.add_argument("--batch_size", type=int, default=1, help="Batch size") + parser.add_argument("--num_epochs", type=int, default=1, help="Number of training epochs") + parser.add_argument("--learning_rate", type=float, default=3e-3, help="Learning rate") + parser.add_argument("--cutoff_len", type=int, default=512, help="Cutoff length for tokenization") + parser.add_argument("--val_set_size", type=int, default=500, help="Validation set size") + parser.add_argument("--quantize", action="store_true", help="Use quantization") + parser.add_argument("--eval_step", type=int, default=10, help="Evaluation step interval") + parser.add_argument("--save_step", type=int, default=100, help="Save step interval") + parser.add_argument("--device", type=str, default="cuda:0", help="Device to use for training") + parser.add_argument( + "--variant", type=str, default="road_1", choices=["road_1", "road_2", "road_4"], help="RoAD variant" + ) + parser.add_argument( + "--road_target_modules", type=str, default=None, help="Comma-separated list of target modules for RoAd" + ) + parser.add_argument( + "--hub_model_id", + type=str, + default="path/to/repo", + help="Repository name to push the model on the Hugging Face Hub", + ) + parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to Hugging Face Hub") + args = parser.parse_args() + train_model( + base_model=args.base_model, + data_path=args.data_path, + output_dir=args.output_dir, + batch_size=args.batch_size, + num_epochs=args.num_epochs, + learning_rate=args.learning_rate, + cutoff_len=args.cutoff_len, + val_set_size=args.val_set_size, + quantize=args.quantize, + eval_step=args.eval_step, + save_step=args.save_step, + device=args.device, + variant=args.variant, + 
road_target_modules=args.road_target_modules, + hub_model_id=args.hub_model_id, + push_to_hub=args.push_to_hub, + ) diff --git a/peft/examples/semantic_segmentation/README.md b/peft/examples/semantic_segmentation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fa23cb02cd6cca1f6e2595e0091dbd912c655ece --- /dev/null +++ b/peft/examples/semantic_segmentation/README.md @@ -0,0 +1,7 @@ +# Fine-tuning for semantic segmentation using LoRA and 🤗 PEFT + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/peft/blob/main/examples/semantic_segmentation/semantic_segmentation_peft_lora.ipynb) + +We provide a notebook (`semantic_segmentation_peft_lora.ipynb`) where we learn how to use [LoRA](https://huggingface.co/papers/2106.09685) from 🤗 PEFT to fine-tune a semantic segmentation model by ONLY using **14%** of the original trainable parameters of the model. + +LoRA adds low-rank "update matrices" to certain blocks in the underlying model (in this case the attention blocks) and ONLY trains those matrices during fine-tuning. During inference, these update matrices are _merged_ with the original model parameters. For more details, check out the [original LoRA paper](https://huggingface.co/papers/2106.09685). 
diff --git a/peft/examples/semantic_segmentation/semantic_segmentation_peft_lora.ipynb b/peft/examples/semantic_segmentation/semantic_segmentation_peft_lora.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..e21503279d32fa25fc2fef95bef29c38d8431ac9 --- /dev/null +++ b/peft/examples/semantic_segmentation/semantic_segmentation_peft_lora.ipynb @@ -0,0 +1,1556 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "JAeWcsvLF2_6" + }, + "source": [ + "## Introduction\n", + "\n", + "In this notebook, we will learn how to use [LoRA](https://huggingface.co/papers/2106.09685) from 🤗 PEFT to fine-tune a SegFormer model variant for semantic segmentation by ONLY using **14%** of the original trainable parameters of the model. \n", + "\n", + "LoRA adds low-rank \"update matrices\" to certain blocks in the underlying model (in this case the attention blocks) and ONLY trains those matrices during fine-tuning. During inference, these update matrices are _merged_ with the original model parameters. For more details, check out the [original LoRA paper](https://huggingface.co/papers/2106.09685). \n", + "\n", + "Let's get started by installing the dependencies. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lveGHtBcGNyc" + }, + "source": [ + "## Install dependencies\n", + "\n", + "Here we're installing `peft` from source to ensure we have access to all the bleeding edge features of `peft`. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lbYTKXv4ZTwg", + "outputId": "5a033ebd-6bbd-4bf4-802c-a1bac6a48a07" + }, + "outputs": [], + "source": [ + "!pip install transformers accelerate evaluate datasets==3.6.0 git+https://github.com/huggingface/peft -q" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B0fmCvTsGPah" + }, + "source": [ + "## Authentication\n", + "\n", + "We will share our fine-tuned model at the end of training. So, to do that we just authenticate using our 🤗 token. This token is available from [here](https://huggingface.co/settings/tokens). If you don't have a 🤗 account already, we highly encourage you to do so; it's free!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 331, + "referenced_widgets": [ + "f2a722f371904cce80dc1c087b153ad6", + "6c88a55a635b4c9f946a1aa838d69f20", + "6c6d19cd893e4d82bae9972fa10c6d74", + "fc48ee28c2e44f1daa03149c8004c314", + "cb4053f102fc4207a1c9513f81ad6415", + "0dcc5a2866a349e0843673bef499dc66", + "b7431f99d93b4e9b8c8177ac4a7b4070", + "14ac809ba0bc4cd5bcd51f83105947b0", + "e7393e78f41b496495982490b72ef2a3", + "42a7b8268d8945b5bbe9d3f20bc8840c", + "554b55f29a5e4d5a81608d912d3635e8", + "3f69c7b15e5e48039777e4b6b1a51f53", + "5bcfe3da0ffb41ccbb9404ac35ae8945", + "21bf954f54db41e6ba78f76195721614", + "f83dd354396e4aa3acd214a6fd98efb2", + "24fde588dc1a49a397e379cda320ed71", + "9746875e74b845daab06619393b4d46b" + ] + }, + "id": "OYhwMOj5ZTwm", + "outputId": "ff2d4cc4-4363-4093-8bdc-763761cbe3ef" + }, + "outputs": [], + "source": [ + "from huggingface_hub import notebook_login\n", + "\n", + "notebook_login()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "B9Cu7j_QGVbH" + }, + "source": [ + "## Load a dataset\n", + "\n", + "We're only loading the first 150 instances from the training set of the 
[SceneParse150 dataset](https://huggingface.co/datasets/scene_parse_150) to keep this example runtime short. " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sGJWDwtHZTwn", + "outputId": "260c874e-1844-42ba-9dc2-0f727e2930cc" + }, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset(\"scene_parse_150\", split=\"train[:150]\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RpSPx8EHGeLM" + }, + "source": [ + "## Prepare train and test splits" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "id": "ydWKIqCUZTwo" + }, + "outputs": [], + "source": [ + "ds = ds.train_test_split(test_size=0.1)\n", + "train_ds = ds[\"train\"]\n", + "test_ds = ds[\"test\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yHtqAQ2WGhlR" + }, + "source": [ + "## Prepare label mappers\n", + "\n", + "We create two dictionaries:\n", + "\n", + "* `label2id`: maps the semantic classes of the dataset to integer ids.\n", + "* `id2label`: `label2id` reversed. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Hu8Y4dEIZTwq", + "outputId": "eba72235-c1a7-4c95-8c89-5ef5588d6581" + }, + "outputs": [], + "source": [ + "import json\n", + "from huggingface_hub import hf_hub_download\n", + "\n", + "repo_id = \"huggingface/label-files\"\n", + "filename = \"ade20k-id2label.json\"\n", + "id2label = json.load(open(hf_hub_download(repo_id=repo_id, filename=filename, repo_type=\"dataset\"), \"r\"))\n", + "id2label = {int(k): v for k, v in id2label.items()}\n", + "label2id = {v: k for k, v in id2label.items()}\n", + "num_labels = len(id2label)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5V8nhdt0HBsk" + }, + "source": [ + "## Prepare datasets for training and evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fNi_TKYpZTwq", + "outputId": "a28647b4-0deb-49cc-a1b8-c4a2ab99bb4d" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/image_processing_base.py:412: UserWarning: The following named arguments are not valid for `SegformerImageProcessor.__init__` and were ignored: 'reduce_labels'\n", + " image_processor = cls(**image_processor_dict)\n" + ] + } + ], + "source": [ + "from transformers import AutoImageProcessor\n", + "\n", + "checkpoint = \"nvidia/mit-b0\"\n", + "image_processor = AutoImageProcessor.from_pretrained(checkpoint, do_reduce_labels=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "id": "JAjiYzklZTwr" + }, + "outputs": [], + "source": [ + "from torchvision.transforms import ColorJitter\n", + "\n", + "jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "id": "_HaS12U0ZTwr" + }, + 
"outputs": [], + "source": [ + "from PIL import Image\n", + "import numpy as np\n", + "\n", + "\n", + "def handle_grayscale_image(image):\n", + " np_image = np.array(image)\n", + " if np_image.ndim == 2:\n", + " tiled_image = np.tile(np.expand_dims(np_image, -1), 3)\n", + " return Image.fromarray(tiled_image)\n", + " else:\n", + " return Image.fromarray(np_image)\n", + "\n", + "\n", + "def train_transforms(example_batch):\n", + " images = [jitter(handle_grayscale_image(x)) for x in example_batch[\"image\"]]\n", + " labels = [x for x in example_batch[\"annotation\"]]\n", + " inputs = image_processor(images, labels)\n", + " return inputs\n", + "\n", + "\n", + "def val_transforms(example_batch):\n", + " images = [handle_grayscale_image(x) for x in example_batch[\"image\"]]\n", + " labels = [x for x in example_batch[\"annotation\"]]\n", + " inputs = image_processor(images, labels)\n", + " return inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "id": "Qyjsvup2ZTws" + }, + "outputs": [], + "source": [ + "train_ds.set_transform(train_transforms)\n", + "test_ds.set_transform(val_transforms)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Lu8RjicxHJiO" + }, + "source": [ + "## Evaluation function\n", + "\n", + "Including a metric during training is often helpful for evaluating your model’s performance. You can quickly load a evaluation method with the [🤗 Evaluate](https://huggingface.co/docs/evaluate/index) library. 
For this task, load the [mean Intersection over Union (IoU)](https://huggingface.co/spaces/evaluate-metric/mean_iou) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "id": "TMSnlebfZTwt" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading builder script: 12.9kB [00:00, 34.2MB/s]\n" + ] + } + ], + "source": [ + "import torch\n", + "from torch import nn\n", + "import evaluate\n", + "\n", + "metric = evaluate.load(\"mean_iou\")\n", + "\n", + "\n", + "def compute_metrics(eval_pred):\n", + " with torch.no_grad():\n", + " logits, labels = eval_pred\n", + " logits_tensor = torch.from_numpy(logits)\n", + " # scale the logits to the size of the label\n", + " logits_tensor = nn.functional.interpolate(\n", + " logits_tensor,\n", + " size=labels.shape[-2:],\n", + " mode=\"bilinear\",\n", + " align_corners=False,\n", + " ).argmax(dim=1)\n", + "\n", + " pred_labels = logits_tensor.detach().cpu().numpy()\n", + " # currently using _compute instead of compute\n", + " # see this issue for more info: https://github.com/huggingface/evaluate/pull/328#issuecomment-1286866576\n", + " metrics = metric._compute(\n", + " predictions=pred_labels,\n", + " references=labels,\n", + " num_labels=len(id2label),\n", + " ignore_index=0,\n", + " reduce_labels=image_processor.do_reduce_labels,\n", + " )\n", + "\n", + " # add per category metrics as individual key-value pairs\n", + " per_category_accuracy = metrics.pop(\"per_category_accuracy\").tolist()\n", + " per_category_iou = metrics.pop(\"per_category_iou\").tolist()\n", + "\n", + " metrics.update({f\"accuracy_{id2label[i]}\": v for i, v in enumerate(per_category_accuracy)})\n", + " metrics.update({f\"iou_{id2label[i]}\": v for i, v in enumerate(per_category_iou)})\n", + "\n", + " return metrics" + ] + }, + { + "cell_type": 
"markdown", + "metadata": { + "id": "r304cnpxHxp5" + }, + "source": [ + "## Load a base model\n", + "\n", + "For this example, we use the [SegFormer B0 variant](https://huggingface.co/nvidia/mit-b0). " + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "id": "Krvppe44a_7y" + }, + "outputs": [], + "source": [ + "def print_trainable_parameters(model):\n", + " \"\"\"\n", + " Prints the number of trainable parameters in the model.\n", + " \"\"\"\n", + " trainable_params = 0\n", + " all_param = 0\n", + " for _, param in model.named_parameters():\n", + " all_param += param.numel()\n", + " if param.requires_grad:\n", + " trainable_params += param.numel()\n", + " print(\n", + " f\"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "q_Wwl_ewID9I" + }, + "source": [ + "We pass the `label2id` and `id2label` dictionaries to let the `AutoModelForSemanticSegmentation` class know that we're interested in a custom base model where the decoder head should be randomly initialized w.r.t our custom dataset. Note, however, that the rest of the model parameters are pre-trained and will be fine-tuned in a regular transfer learning setup.\n", + "\n", + "We also notice that the 100% parameters in the `model` are trainable. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kcdLdvIlZTwt", + "outputId": "a6b71dce-905e-4389-dcf6-46b43e769fcc" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/mit-b0 and are newly initialized: ['decode_head.batch_norm.bias', 'decode_head.batch_norm.num_batches_tracked', 'decode_head.batch_norm.running_mean', 'decode_head.batch_norm.running_var', 'decode_head.batch_norm.weight', 'decode_head.classifier.bias', 'decode_head.classifier.weight', 'decode_head.linear_c.0.proj.bias', 'decode_head.linear_c.0.proj.weight', 'decode_head.linear_c.1.proj.bias', 'decode_head.linear_c.1.proj.weight', 'decode_head.linear_c.2.proj.bias', 'decode_head.linear_c.2.proj.weight', 'decode_head.linear_c.3.proj.bias', 'decode_head.linear_c.3.proj.weight', 'decode_head.linear_fuse.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 3752694 || all params: 3752694 || trainable%: 100.00\n" + ] + } + ], + "source": [ + "from transformers import AutoModelForSemanticSegmentation, TrainingArguments, Trainer\n", + "\n", + "model = AutoModelForSemanticSegmentation.from_pretrained(\n", + " checkpoint, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True\n", + ")\n", + "print_trainable_parameters(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4yhyYVTCInF0" + }, + "source": [ + "## Wrap `model` as a `PeftModel` for LoRA training\n", + "\n", + "This involves two steps:\n", + "\n", + "* Defining a config with `LoraConfig`\n", + "* Wrapping the original `model` with `get_peft_model()` with the config defined in the step above. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YPg4W5eFB__n", + "outputId": "9995eb44-1c30-43e7-cc4e-691ecb1b1878" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 566422 || all params: 4317068 || trainable%: 13.12\n" + ] + } + ], + "source": [ + "from peft import LoraConfig, get_peft_model\n", + "\n", + "config = LoraConfig(\n", + " r=32,\n", + " lora_alpha=32,\n", + " target_modules=[\"query\", \"value\"],\n", + " lora_dropout=0.1,\n", + " bias=\"lora_only\",\n", + " modules_to_save=[\"decode_head\"],\n", + ")\n", + "lora_model = get_peft_model(model, config)\n", + "print_trainable_parameters(lora_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4M3wYekOI95X" + }, + "source": [ + " Let's unpack what's going on here. \n", + "\n", + "In order for LoRA to take effect, we need to specify the target modules to `LoraConfig` so that `PeftModel` knows which modules inside our model needs to be amended with LoRA matrices. In this case, we're only interested in targetting the query and value matrices of the attention blocks of the base model. Since the parameters corresponding to these matrices are \"named\" with `query` and `value` respectively, we specify them accordingly in the `target_modules` argument of `LoraConfig`. \n", + "\n", + "We also specify `modules_to_save`. After we wrap our base model `model` with `PeftModel` along with the `config`, we get a new model where only the LoRA parameters are trainable (so-called \"update matrices\") while the pre-trained parameters are kept frozen. These include the parameters of the randomly initialized classifier parameters too. This is NOT we want when fine-tuning the base model on our custom dataset. To ensure that the classifier parameters are also trained, we specify `modules_to_save`. 
This also ensures that these modules are serialized alongside the LoRA trainable parameters when using utilities like `save_pretrained()` and `push_to_hub()`. \n", + "\n", + "Regarding the other parameters:\n", + "\n", + "* `r`: The dimension used by the LoRA update matrices.\n", + "* `lora_alpha`: Scaling factor.\n", + "* `bias`: Specifying if the `bias` parameters should be trained. `lora_only` denotes only the LoRA `bias` parameters will be trained. \n", + "\n", + "`r` and `lora_alpha` together control the total number of final trainable parameters when using LoRA, giving us the flexibility to balance a trade-off between end performance and compute efficiency.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XTF68xfjJEci" + }, + "source": [ + "We can also see how many parameters we're actually training. Since we're interested in performing **parameter-efficient fine-tuning**, we should expect to notice a smaller number of trainable parameters from the `lora_model` in comparison to the original `model` which is indeed the case here. \n", + "\n", + "For sanity, let's also manually verify the modules that are actually trainable in `lora_model`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PUe1Gzvd1PEP", + "outputId": "7b8ba17f-01fd-4ab4-d703-9a0c01cff31b" + }, + "outputs": [], + "source": [ + "for name, param in lora_model.named_parameters():\n", + " if param.requires_grad:\n", + " print(name, param.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can confirm that only the LoRA parameters appended to the attention blocks and the `decode_head` parameters are trainable." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rX75AyI7JYVC" + }, + "source": [ + "## Train!\n", + "\n", + "This is a three-step process: \n", + "\n", + "1. 
Define your training hyperparameters in [TrainingArguments](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/trainer#transformers.TrainingArguments). It is important you don’t remove unused columns because this’ll drop the image column. Without the image column, you can’t create `pixel_values`. Set `remove_unused_columns=False` to prevent this behavior! The only other required parameter is output_dir which specifies where to save your model. At the end of each epoch, the `Trainer` will evaluate the IoU metric and save the training checkpoint.\n", + "2. Pass the training arguments to [Trainer](https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/trainer#transformers.Trainer) along with the model, dataset, tokenizer, data collator, and `compute_metrics` function.\n", + "3. Call `train()` to finetune your model.\n", + "\n", + "\n", + "**Note** that this example is meant to walk you through the workflow when using PEFT for semantic segmentation. We didn't perform extensive hyperparameter tuning to achieve optimal results. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "K6HVcNkDZTwu", + "outputId": "1b28a072-0e16-4b1a-ec32-d78e93630ef3" + }, + "outputs": [], + "source": [ + "model_name = checkpoint.split(\"/\")[-1]\n", + "\n", + "training_args = TrainingArguments(\n", + " output_dir=f\"{model_name}-scene-parse-150-lora\",\n", + " learning_rate=5e-4,\n", + " num_train_epochs=50,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=2,\n", + " save_total_limit=3,\n", + " eval_strategy=\"epoch\",\n", + " save_strategy=\"epoch\",\n", + " logging_steps=5,\n", + " remove_unused_columns=False,\n", + " push_to_hub=True,\n", + " label_names=[\"labels\"],\n", + ")\n", + "\n", + "trainer = Trainer(\n", + " model=lora_model,\n", + " args=training_args,\n", + " train_dataset=train_ds,\n", + " eval_dataset=test_ds,\n", + " compute_metrics=compute_metrics,\n", + ")\n", + "\n", + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dacaBLE6KLdu" + }, + "source": [ + "## Saving the model and inference \n", + "\n", + "Here we use the `save_pretrained()` method of the `lora_model` to save the *LoRA-only parameters* locally. However, you can also use the `push_to_hub()` method to upload these parameters directly to the Hugging Face Hub (as shown [here](https://colab.research.google.com/github/huggingface/peft/blob/main/examples/image_classification/image_classification_peft_lora.ipynb)). " + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "id": "pvkLkrQo-6l6" + }, + "outputs": [], + "source": [ + "model_id = \"segformer-scene-parse-150-lora\"\n", + "lora_model.save_pretrained(model_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ur8n41kBK4uj" + }, + "source": [ + "We can see that the LoRA-only parameters are just **2.2 MB in size**! 
This greatly improves the portability when using very large models. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "grzLeOT-__ht", + "outputId": "1ce26a27-2f38-43f3-9454-8ba11f2cfc59" + }, + "outputs": [], + "source": [ + "!ls -lh {model_id}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KFYC6Z3FLB5F" + }, + "source": [ + "Let's now prepare our `inference_model` and run an inference. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "T7zeMQTaACur", + "outputId": "762b7fbc-07d4-4572-f107-c836e3e7928a" + }, + "outputs": [], + "source": [ + "from peft import PeftConfig, PeftModel\n", + "\n", + "config = PeftConfig.from_pretrained(model_id)\n", + "model = AutoModelForSemanticSegmentation.from_pretrained(\n", + " checkpoint, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True\n", + ")\n", + "# Load the Lora model\n", + "inference_model = PeftModel.from_pretrained(model, model_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2L1R0LDWLImd" + }, + "source": [ + "Fetch an image." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 444 + }, + "id": "lwjRvZOmA7Hh", + "outputId": "44ab267d-e2b9-4bda-a52b-91968eabce29" + }, + "outputs": [ + { + "data": { + "image/jpeg": "", + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import requests\n", + "\n", + "url = \"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/semantic-seg-image.png\"\n", + "image = Image.open(requests.get(url, stream=True).raw)\n", + "image" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kdK_bGhsLKKE" + }, + "source": [ + "Preprocess the image." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "G0z-3R-PBKc9", + "outputId": "56c91198-0116-4c2c-fc63-147dc7431b89" + }, + "outputs": [], + "source": [ + "# prepare image for the model\n", + "encoding = image_processor(image.convert(\"RGB\"), return_tensors=\"pt\")\n", + "print(encoding.pixel_values.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hJRijta4LLu9" + }, + "source": [ + "Run an inference. " + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "id": "z1p-QDoiBP56" + }, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " outputs = inference_model(pixel_values=encoding.pixel_values)\n", + " logits = outputs.logits\n", + "\n", + "upsampled_logits = nn.functional.interpolate(\n", + " logits,\n", + " size=image.size[::-1],\n", + " mode=\"bilinear\",\n", + " align_corners=False,\n", + ")\n", + "\n", + "pred_seg = upsampled_logits.argmax(dim=1)[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gmYIcfL4LNtj" + }, + "source": [ + "Visualize the results.\n", + "\n", + "We need a color palette to visualize the results. 
Here, we use [one provided by the TensorFlow Model Garden repository](https://github.com/tensorflow/models/blob/3f1ca33afe3c1631b733ea7e40c294273b9e406d/research/deeplab/utils/get_dataset_colormap.py#L51)." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "id": "jy5c6vmzBqzC" + }, + "outputs": [], + "source": [ + "def ade_palette():\n", + " \"\"\"Creates a label colormap used in ADE20K segmentation benchmark.\n", + " Returns:\n", + " A colormap for visualizing segmentation results.\n", + " \"\"\"\n", + " return np.asarray(\n", + " [\n", + " [0, 0, 0],\n", + " [120, 120, 120],\n", + " [180, 120, 120],\n", + " [6, 230, 230],\n", + " [80, 50, 50],\n", + " [4, 200, 3],\n", + " [120, 120, 80],\n", + " [140, 140, 140],\n", + " [204, 5, 255],\n", + " [230, 230, 230],\n", + " [4, 250, 7],\n", + " [224, 5, 255],\n", + " [235, 255, 7],\n", + " [150, 5, 61],\n", + " [120, 120, 70],\n", + " [8, 255, 51],\n", + " [255, 6, 82],\n", + " [143, 255, 140],\n", + " [204, 255, 4],\n", + " [255, 51, 7],\n", + " [204, 70, 3],\n", + " [0, 102, 200],\n", + " [61, 230, 250],\n", + " [255, 6, 51],\n", + " [11, 102, 255],\n", + " [255, 7, 71],\n", + " [255, 9, 224],\n", + " [9, 7, 230],\n", + " [220, 220, 220],\n", + " [255, 9, 92],\n", + " [112, 9, 255],\n", + " [8, 255, 214],\n", + " [7, 255, 224],\n", + " [255, 184, 6],\n", + " [10, 255, 71],\n", + " [255, 41, 10],\n", + " [7, 255, 255],\n", + " [224, 255, 8],\n", + " [102, 8, 255],\n", + " [255, 61, 6],\n", + " [255, 194, 7],\n", + " [255, 122, 8],\n", + " [0, 255, 20],\n", + " [255, 8, 41],\n", + " [255, 5, 153],\n", + " [6, 51, 255],\n", + " [235, 12, 255],\n", + " [160, 150, 20],\n", + " [0, 163, 255],\n", + " [140, 140, 140],\n", + " [250, 10, 15],\n", + " [20, 255, 0],\n", + " [31, 255, 0],\n", + " [255, 31, 0],\n", + " [255, 224, 0],\n", + " [153, 255, 0],\n", + " [0, 0, 255],\n", + " [255, 71, 0],\n", + " [0, 235, 255],\n", + " [0, 173, 255],\n", + " [31, 0, 255],\n", + " [11, 200, 200],\n", + " [255, 
82, 0],\n", + " [0, 255, 245],\n", + " [0, 61, 255],\n", + " [0, 255, 112],\n", + " [0, 255, 133],\n", + " [255, 0, 0],\n", + " [255, 163, 0],\n", + " [255, 102, 0],\n", + " [194, 255, 0],\n", + " [0, 143, 255],\n", + " [51, 255, 0],\n", + " [0, 82, 255],\n", + " [0, 255, 41],\n", + " [0, 255, 173],\n", + " [10, 0, 255],\n", + " [173, 255, 0],\n", + " [0, 255, 153],\n", + " [255, 92, 0],\n", + " [255, 0, 255],\n", + " [255, 0, 245],\n", + " [255, 0, 102],\n", + " [255, 173, 0],\n", + " [255, 0, 20],\n", + " [255, 184, 184],\n", + " [0, 31, 255],\n", + " [0, 255, 61],\n", + " [0, 71, 255],\n", + " [255, 0, 204],\n", + " [0, 255, 194],\n", + " [0, 255, 82],\n", + " [0, 10, 255],\n", + " [0, 112, 255],\n", + " [51, 0, 255],\n", + " [0, 194, 255],\n", + " [0, 122, 255],\n", + " [0, 255, 163],\n", + " [255, 153, 0],\n", + " [0, 255, 10],\n", + " [255, 112, 0],\n", + " [143, 255, 0],\n", + " [82, 0, 255],\n", + " [163, 255, 0],\n", + " [255, 235, 0],\n", + " [8, 184, 170],\n", + " [133, 0, 255],\n", + " [0, 255, 92],\n", + " [184, 0, 255],\n", + " [255, 0, 31],\n", + " [0, 184, 255],\n", + " [0, 214, 255],\n", + " [255, 0, 112],\n", + " [92, 255, 0],\n", + " [0, 224, 255],\n", + " [112, 224, 255],\n", + " [70, 184, 160],\n", + " [163, 0, 255],\n", + " [153, 0, 255],\n", + " [71, 255, 0],\n", + " [255, 0, 163],\n", + " [255, 204, 0],\n", + " [255, 0, 143],\n", + " [0, 255, 235],\n", + " [133, 255, 0],\n", + " [255, 0, 235],\n", + " [245, 0, 255],\n", + " [255, 0, 122],\n", + " [255, 245, 0],\n", + " [10, 190, 212],\n", + " [214, 255, 0],\n", + " [0, 204, 255],\n", + " [20, 0, 255],\n", + " [255, 255, 0],\n", + " [0, 153, 255],\n", + " [0, 41, 255],\n", + " [0, 255, 204],\n", + " [41, 0, 255],\n", + " [41, 255, 0],\n", + " [173, 0, 255],\n", + " [0, 245, 255],\n", + " [71, 0, 255],\n", + " [122, 0, 255],\n", + " [0, 255, 184],\n", + " [0, 92, 255],\n", + " [184, 255, 0],\n", + " [0, 133, 255],\n", + " [255, 214, 0],\n", + " [25, 194, 194],\n", + " [102, 255, 0],\n", + " 
[92, 0, 255],\n", + " ]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 595 + }, + "id": "3KJFvgENBih0", + "outputId": "63d42e4f-3867-4d33-8ac0-83bebf8819ca" + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "color_seg = np.zeros((pred_seg.shape[0], pred_seg.shape[1], 3), dtype=np.uint8)\n", + "palette = np.array(ade_palette())\n", + "\n", + "for label, color in enumerate(palette):\n", + " color_seg[pred_seg == label, :] = color\n", + "color_seg = color_seg[..., ::-1] # convert to BGR\n", + "\n", + "img = np.array(image) * 0.5 + color_seg * 0.5 # plot the image with the segmentation map\n", + "img = img.astype(np.uint8)\n", + "\n", + "plt.figure(figsize=(15, 10))\n", + "plt.imshow(img)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "q1aGuHYFLP7i" + }, + "source": [ + "The results are definitely not as expected and as mentioned above, this example is not meant to provide a state-of-the-art model. It exists to familiarize you with the end-to-end workflow. \n", + "\n", + "On the other hand, if you perform full fine-tuning on the same setup (same model variant, same dataset, same training schedule, etc.), the results would not have been any different. This is a crucial aspect of parameter-efficient fine-tuning -- to be able to match up to the results of the full fine-tuning but with a fraction of total trainable parameters. \n", + "\n", + "Here are some things that you can try to get better results:\n", + "\n", + "* Increase the number of training samples. \n", + "* Try a larger SegFormer model variant (know about the available model variants [here](https://huggingface.co/models?search=segformer)). \n", + "* Try different values for the arguments available in `LoraConfig`. \n", + "* Tune the learning rate and batch size. 
" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "machine_shape": "hm", + "provenance": [] + }, + "gpuClass": "premium", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "0dcc5a2866a349e0843673bef499dc66": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_24fde588dc1a49a397e379cda320ed71", + "placeholder": "​", + "style": "IPY_MODEL_9746875e74b845daab06619393b4d46b", + "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. 
" + } + }, + "14ac809ba0bc4cd5bcd51f83105947b0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "21bf954f54db41e6ba78f76195721614": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "24fde588dc1a49a397e379cda320ed71": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3f69c7b15e5e48039777e4b6b1a51f53": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "42a7b8268d8945b5bbe9d3f20bc8840c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "554b55f29a5e4d5a81608d912d3635e8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5bcfe3da0ffb41ccbb9404ac35ae8945": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6c6d19cd893e4d82bae9972fa10c6d74": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "PasswordModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "PasswordModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "PasswordView", + "continuous_update": true, + "description": "Token:", + "description_tooltip": null, + "disabled": false, + "layout": "IPY_MODEL_42a7b8268d8945b5bbe9d3f20bc8840c", + "placeholder": "​", + "style": "IPY_MODEL_554b55f29a5e4d5a81608d912d3635e8", + "value": "" + } + }, + "6c88a55a635b4c9f946a1aa838d69f20": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_14ac809ba0bc4cd5bcd51f83105947b0", + "placeholder": "​", + "style": "IPY_MODEL_e7393e78f41b496495982490b72ef2a3", + "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" + } + }, + "9746875e74b845daab06619393b4d46b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b7431f99d93b4e9b8c8177ac4a7b4070": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": "center", + "align_self": null, + "border": null, + "bottom": null, + "display": "flex", + "flex": null, + "flex_flow": "column", + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "50%" + } + }, + "cb4053f102fc4207a1c9513f81ad6415": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": 
"1.5.0", + "_model_name": "ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + "description": "Login", + "disabled": false, + "icon": "", + "layout": "IPY_MODEL_21bf954f54db41e6ba78f76195721614", + "style": "IPY_MODEL_f83dd354396e4aa3acd214a6fd98efb2", + "tooltip": "" + } + }, + "e7393e78f41b496495982490b72ef2a3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f2a722f371904cce80dc1c087b153ad6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6c88a55a635b4c9f946a1aa838d69f20", + "IPY_MODEL_6c6d19cd893e4d82bae9972fa10c6d74", + "IPY_MODEL_fc48ee28c2e44f1daa03149c8004c314", + "IPY_MODEL_cb4053f102fc4207a1c9513f81ad6415", + "IPY_MODEL_0dcc5a2866a349e0843673bef499dc66" + ], + "layout": "IPY_MODEL_b7431f99d93b4e9b8c8177ac4a7b4070" + } + }, + "f83dd354396e4aa3acd214a6fd98efb2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "fc48ee28c2e44f1daa03149c8004c314": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "CheckboxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "Add token as git credential?", + "description_tooltip": null, + "disabled": false, + "indent": true, + "layout": "IPY_MODEL_3f69c7b15e5e48039777e4b6b1a51f53", + "style": "IPY_MODEL_5bcfe3da0ffb41ccbb9404ac35ae8945", + "value": true + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/peft/examples/sequence_classification/C3A.ipynb b/peft/examples/sequence_classification/C3A.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..dbd8282a76cb3e4655d63b2e4411c4c0bd3dd3dc --- /dev/null +++ b/peft/examples/sequence_classification/C3A.ipynb @@ -0,0 +1,512 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d36e1e93-ae93-4a4e-93c6-68fd868d2882", + "metadata": {}, + "source": [ + "# Using C3A for sequence classification" + ] + }, + { + "cell_type": "markdown", + "id": "ddfc0610-55f6-4343-a950-125ccf0f45ac", + "metadata": {}, + "source": [ + "In this example, we fine-tune Roberta (base) on a sequence classification task using C3A." 
+ ] + }, + { + "cell_type": "markdown", + "id": "45addd81-d4f3-4dfd-960d-3920d347f0a6", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9935ae2", + "metadata": {}, + "outputs": [], + "source": [ + "# To run this notebook, please run `pip install evaluate` to install additional dependencies not covered by PEFT.\n", + "import torch\n", + "from torch.optim import AdamW\n", + "from torch.utils.data import DataLoader\n", + "from peft import (\n", + " get_peft_model,\n", + " C3AConfig,\n", + " PeftType,\n", + ")\n", + "from peft.utils import infer_device\n", + "\n", + "import evaluate\n", + "from datasets import load_dataset\n", + "from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed, AutoConfig\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "markdown", + "id": "62c959bf-7cc2-49e0-b97e-4c10ec3b9bf3", + "metadata": {}, + "source": [ + "## Parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3b13308", + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 32\n", + "model_name_or_path = \"roberta-base\"\n", + "task = \"mrpc\"\n", + "peft_type = PeftType.C3A\n", + "device = infer_device()\n", + "num_epochs = 5 # for better results, increase this number\n", + "block_size = 768 # for better results, increase this number\n", + "max_length = 512\n", + "torch.manual_seed(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0526f571", + "metadata": {}, + "outputs": [], + "source": [ + "peft_config = C3AConfig(\n", + " task_type=\"SEQ_CLS\", \n", + " block_size=block_size,\n", + " target_modules=[\"query\", \"value\"],\n", + ")\n", + "head_lr = 4e-6 # the learning rate for the classification head for NLU tasks\n", + "ft_lr = 3e-1 # the learning rate for C3A parameters, a much larger LR than that is usually used, at least 1e-1" + ] + }, + { + "cell_type": "markdown", + 
"id": "c075c5d2-a457-4f37-a7f1-94fd0d277972", + "metadata": {}, + "source": [ + "## Loading data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7bb52cb4-d1c3-4b04-8bf0-f39ca88af139", + "metadata": {}, + "outputs": [], + "source": [ + "if any(k in model_name_or_path for k in (\"gpt\", \"opt\", \"bloom\")):\n", + " padding_side = \"left\"\n", + "else:\n", + " padding_side = \"right\"\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)\n", + "if getattr(tokenizer, \"pad_token_id\") is None:\n", + " tokenizer.pad_token_id = tokenizer.eos_token_id" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e69c5e1f-d27b-4264-a41e-fc9b99d025e6", + "metadata": {}, + "outputs": [], + "source": [ + "datasets = load_dataset(\"glue\", task)\n", + "metric = evaluate.load(\"glue\", task)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0209f778-c93b-40eb-a4e0-24c25db03980", + "metadata": {}, + "outputs": [], + "source": [ + "def tokenize_function(examples):\n", + " # max_length=None => use the model max length (it's actually the default)\n", + " outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=max_length)\n", + " return outputs\n", + "\n", + "\n", + "tokenized_datasets = datasets.map(\n", + " tokenize_function,\n", + " batched=True,\n", + " remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n", + ")\n", + "\n", + "# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n", + "# transformers library\n", + "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7453954e-982c-46f0-b09c-589776e6d6cb", + "metadata": {}, + "outputs": [], + "source": [ + "def collate_fn(examples):\n", + " return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")\n", + "\n", + "\n", + "# 
Instantiate dataloaders.\n", + "train_dataloader = DataLoader(tokenized_datasets[\"train\"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)\n", + "eval_dataloader = DataLoader(\n", + " tokenized_datasets[\"validation\"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f3b9b2e8-f415-4d0f-9fb4-436f1a3585ea", + "metadata": {}, + "source": [ + "## Preparing the C3A model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2ed5ac74", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 610,562 || all params: 125,257,732 || trainable%: 0.4874\n" + ] + } + ], + "source": [ + "model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True, max_length=None)\n", + "model = get_peft_model(model, peft_config)\n", + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0d2d0381", + "metadata": {}, + "outputs": [], + "source": [ + "head_param = list(map(id, model.classifier.parameters()))\n", + "\n", + "others_param = filter(lambda p: id(p) not in head_param, model.parameters()) \n", + "\n", + "optimizer = AdamW([\n", + " {\"params\": model.classifier.parameters(), \"lr\": head_lr},\n", + " {\"params\": others_param, \"lr\": ft_lr}\n", + "],weight_decay=0.)\n", + "\n", + "\n", + "# Instantiate scheduler\n", + "lr_scheduler = get_linear_schedule_with_warmup(\n", + " optimizer=optimizer,\n", + " 
num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),\n", + " num_training_steps=(len(train_dataloader) * num_epochs),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c0dd5aa8-977b-4ac0-8b96-884b17bcdd00", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "fa0e73be", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/115 [00:00" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "batch_size = 32\n", + "model_name_or_path = \"roberta-base\"\n", + "task = \"mrpc\"\n", + "peft_type = PeftType.FOURIERFT\n", + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "num_epochs = 5 # for better results, increase this number\n", + "n_frequency = 1000 # for better results, increase this number\n", + "scaling = 150.0\n", + "max_length = 512\n", + "torch.manual_seed(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0526f571", + "metadata": {}, + "outputs": [], + "source": [ + "peft_config = FourierFTConfig(\n", + " task_type=\"SEQ_CLS\", \n", + " n_frequency=n_frequency,\n", + " target_modules=[\"query\", \"value\"],\n", + " scaling = scaling,\n", + ")\n", + "head_lr = 6e-3 # the learning rate for the classification head for NLU tasks\n", + "fft_lr = 6e-2 # the learning rate for the parameters other than the classification head (q,v in this case)" + ] + }, + { + "cell_type": "markdown", + "id": "c075c5d2-a457-4f37-a7f1-94fd0d277972", + "metadata": {}, + "source": [ + "## Loading data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7bb52cb4-d1c3-4b04-8bf0-f39ca88af139", + "metadata": {}, + "outputs": [], + "source": [ + "if any(k in model_name_or_path for k in (\"gpt\", \"opt\", \"bloom\")):\n", + " padding_side = \"left\"\n", + "else:\n", + " padding_side = \"right\"\n", + "\n", 
+ "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)\n", + "if getattr(tokenizer, \"pad_token_id\") is None:\n", + " tokenizer.pad_token_id = tokenizer.eos_token_id" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e69c5e1f-d27b-4264-a41e-fc9b99d025e6", + "metadata": {}, + "outputs": [], + "source": [ + "datasets = load_dataset(\"glue\", task)\n", + "metric = evaluate.load(\"glue\", task)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0209f778-c93b-40eb-a4e0-24c25db03980", + "metadata": {}, + "outputs": [], + "source": [ + "def tokenize_function(examples):\n", + " # max_length=None => use the model max length (it's actually the default)\n", + " outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=max_length)\n", + " return outputs\n", + "\n", + "\n", + "tokenized_datasets = datasets.map(\n", + " tokenize_function,\n", + " batched=True,\n", + " remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n", + ")\n", + "\n", + "# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n", + "# transformers library\n", + "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7453954e-982c-46f0-b09c-589776e6d6cb", + "metadata": {}, + "outputs": [], + "source": [ + "def collate_fn(examples):\n", + " return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")\n", + "\n", + "\n", + "# Instantiate dataloaders.\n", + "train_dataloader = DataLoader(tokenized_datasets[\"train\"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)\n", + "eval_dataloader = DataLoader(\n", + " tokenized_datasets[\"validation\"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f3b9b2e8-f415-4d0f-9fb4-436f1a3585ea", + "metadata": {}, + 
"source": [ + "## Preparing the FourierFT model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2ed5ac74", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 616,130 || all params: 125,263,300 || trainable%: 0.4919\n" + ] + } + ], + "source": [ + "model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True, max_length=None)\n", + "model = get_peft_model(model, peft_config)\n", + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0d2d0381", + "metadata": {}, + "outputs": [], + "source": [ + "head_param = list(map(id, model.classifier.parameters()))\n", + "\n", + "others_param = filter(lambda p: id(p) not in head_param, model.parameters()) \n", + "\n", + "optimizer = AdamW([\n", + " {\"params\": model.classifier.parameters(), \"lr\": head_lr},\n", + " {\"params\": others_param, \"lr\": fft_lr}\n", + "],weight_decay=0.)\n", + "\n", + "\n", + "# Instantiate scheduler\n", + "lr_scheduler = get_linear_schedule_with_warmup(\n", + " optimizer=optimizer,\n", + " num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),\n", + " num_training_steps=(len(train_dataloader) * num_epochs),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c0dd5aa8-977b-4ac0-8b96-884b17bcdd00", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "fa0e73be", + "metadata": {}, + "outputs": [ + { + "name": 
"stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/115 [00:00 use the model max length (it's actually the default)\n", + " outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=None)\n", + " return outputs\n", + "\n", + "\n", + "tokenized_datasets = datasets.map(\n", + " tokenize_function,\n", + " batched=True,\n", + " remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n", + ")\n", + "\n", + "# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n", + "# transformers library\n", + "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")\n", + "\n", + "\n", + "def collate_fn(examples):\n", + " return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")\n", + "\n", + "\n", + "# Instantiate dataloaders.\n", + "train_dataloader = DataLoader(tokenized_datasets[\"train\"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)\n", + "eval_dataloader = DataLoader(\n", + " tokenized_datasets[\"validation\"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size\n", + ")\n", + "test_dataloader = DataLoader(tokenized_datasets[\"test\"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2ed5ac74", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "0cecb897c86c4892b94a1990ab08a926", + "b8af0294819e4280ad41fa1c11006adf", + "c7530d63b2f745e799713284abacbd2c", + "12a1e302a69543c5bc0e0a66be008ca0", + "9c372e9e9b20433faed8530ca0f4424c", + "b07f26a21325493cac19113f1aa1ee96", + "fbdf6c544fb54294903524a69384e773", + "6c6f2223243b4a7485aef7fbbfe07668", + "a93509d61ac94628a74bc0f98c0eec06", + "3a8de0eb7db44647a734590b6b351b44", + "64d8affd2e854a1c9043fec7ca8a2796" + ] + }, + "id": "2ed5ac74", + "outputId": "18ea15ac-ed8d-4d80-b166-706681ee49ab" + }, + "outputs": [ + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "0cecb897c86c4892b94a1990ab08a926", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading model.safetensors: 0%| | 0.00/1.42G [00:00 use the model max length (it's actually the default)\n", + " outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=None)\n", + " return outputs" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "cf5ef289-f42f-4582-bd5e-9852ad8beff2", + "metadata": {}, + "outputs": [], + "source": [ + "tokenized_datasets = datasets.map(\n", + " tokenize_function,\n", + " batched=True,\n", + " remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n", + ")\n", + "\n", + "# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n", + "# transformers library\n", + "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "739b3655-9db0-48bc-8542-308c6d5e0b8b", + "metadata": {}, + "outputs": [], + "source": [ + "def collate_fn(examples):\n", + " return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0288f311-8475-4a0e-99af-e4b909d10e01", + "metadata": {}, + "outputs": [], + "source": [ + "# Instantiate dataloaders.\n", + "train_dataloader = DataLoader(\n", + " tokenized_datasets[\"train\"],\n", + " shuffle=True,\n", + " collate_fn=collate_fn,\n", + " batch_size=batch_size,\n", + ")\n", + "eval_dataloader = DataLoader(\n", + " tokenized_datasets[\"validation\"],\n", + " shuffle=False,\n", + " collate_fn=collate_fn,\n", + " batch_size=batch_size,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "fcaf6f9e-c9d1-445a-9f08-18ef462f67ce", + "metadata": {}, + "source": [ + "## Model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": 
"e5dfff56-ea80-4561-aeaf-43216bbb9af7", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2ac42f98e60d412496fe77ed7eb5c6df", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/3 [00:00, weight=AffineQuantizedTensor(shape=torch.Size([2048, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=2304, out_features=16, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=16, out_features=2048, bias=False)\n", + " )\n", + " (lora_embedding_A): ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (k_proj): Linear(in_features=2304, out_features=1024, weight=LinearActivationQuantizedTensor(activation=, weight=AffineQuantizedTensor(shape=torch.Size([1024, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n", + " (v_proj): lora.TorchaoLoraLinear(\n", + " (base_layer): Linear(in_features=2304, out_features=1024, weight=LinearActivationQuantizedTensor(activation=, weight=AffineQuantizedTensor(shape=torch.Size([1024, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n", + " (lora_dropout): ModuleDict(\n", + " (default): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (lora_A): ModuleDict(\n", + " (default): Linear(in_features=2304, out_features=16, bias=False)\n", + " )\n", + " (lora_B): ModuleDict(\n", + " (default): Linear(in_features=16, out_features=1024, bias=False)\n", + " )\n", + " (lora_embedding_A): 
ParameterDict()\n", + " (lora_embedding_B): ParameterDict()\n", + " (lora_magnitude_vector): ModuleDict()\n", + " )\n", + " (o_proj): Linear(in_features=2048, out_features=2304, weight=LinearActivationQuantizedTensor(activation=, weight=AffineQuantizedTensor(shape=torch.Size([2304, 2048]), block_size=(1, 2048), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n", + " (rotary_emb): Gemma2RotaryEmbedding()\n", + " )\n", + " (mlp): Gemma2MLP(\n", + " (gate_proj): Linear(in_features=2304, out_features=9216, weight=LinearActivationQuantizedTensor(activation=, weight=AffineQuantizedTensor(shape=torch.Size([9216, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n", + " (up_proj): Linear(in_features=2304, out_features=9216, weight=LinearActivationQuantizedTensor(activation=, weight=AffineQuantizedTensor(shape=torch.Size([9216, 2304]), block_size=(1, 2304), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n", + " (down_proj): Linear(in_features=9216, out_features=2304, weight=LinearActivationQuantizedTensor(activation=, weight=AffineQuantizedTensor(shape=torch.Size([2304, 9216]), block_size=(1, 9216), device=cuda:0, layout_type=PlainLayoutType(), layout_tensor_dtype=torch.int8, quant_min=None, quant_max=None)))\n", + " (act_fn): PytorchGELUTanh()\n", + " )\n", + " (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n", + " (post_attention_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n", + " (pre_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n", + " (post_feedforward_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)\n", + " )\n", + " )\n", + " (norm): Gemma2RMSNorm((2304,), eps=1e-06)\n", + " )\n", + " (score): ModulesToSaveWrapper(\n", + " (original_module): Linear(in_features=2304, out_features=2, bias=False)\n", + " (modules_to_save): 
ModuleDict(\n", + " (default): Linear(in_features=2304, out_features=2, bias=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + ")" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.config.use_cache = False\n", + "model.to(device)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "fa0e73be", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/230 [00:00 use the model max length (it's actually the default)\n", + " outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=None)\n", + " return outputs" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "cf5ef289-f42f-4582-bd5e-9852ad8beff2", + "metadata": {}, + "outputs": [], + "source": [ + "tokenized_datasets = datasets.map(\n", + " tokenize_function,\n", + " batched=True,\n", + " remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n", + ")\n", + "\n", + "# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n", + "# transformers library\n", + "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "739b3655-9db0-48bc-8542-308c6d5e0b8b", + "metadata": {}, + "outputs": [], + "source": [ + "def collate_fn(examples):\n", + " return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0288f311-8475-4a0e-99af-e4b909d10e01", + "metadata": {}, + "outputs": [], + "source": [ + "# Instantiate dataloaders.\n", + "train_dataloader = DataLoader(\n", + " tokenized_datasets[\"train\"],\n", + " shuffle=True,\n", + " collate_fn=collate_fn,\n", + " batch_size=batch_size,\n", + ")\n", + "eval_dataloader = DataLoader(\n", + " tokenized_datasets[\"validation\"],\n", + " shuffle=False,\n", + " 
collate_fn=collate_fn,\n", + " batch_size=batch_size,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "fcaf6f9e-c9d1-445a-9f08-18ef462f67ce", + "metadata": {}, + "source": [ + "## Model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e5dfff56-ea80-4561-aeaf-43216bbb9af7", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "512d9dc10a4d4ecc88b9440575b0973a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/3 [00:00 use the model max length (it's actually the default)\n", + " outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=None)\n", + " return outputs\n", + "\n", + "\n", + "tokenized_datasets = datasets.map(\n", + " tokenize_function,\n", + " batched=True,\n", + " remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n", + ")\n", + "\n", + "# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n", + "# transformers library\n", + "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")\n", + "\n", + "\n", + "def collate_fn(examples):\n", + " return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")\n", + "\n", + "\n", + "# Instantiate dataloaders.\n", + "train_dataloader = DataLoader(tokenized_datasets[\"train\"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)\n", + "eval_dataloader = DataLoader(\n", + " tokenized_datasets[\"validation\"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ed5ac74", + "metadata": {}, + "outputs": [], + "source": [ + "model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True)\n", + "model = get_peft_model(model, peft_config)\n", + "model.print_trainable_parameters()\n", + "model" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 6, + "id": "0d2d0381", + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = AdamW(params=model.parameters(), lr=lr)\n", + "\n", + "# Instantiate scheduler\n", + "lr_scheduler = get_linear_schedule_with_warmup(\n", + " optimizer=optimizer,\n", + " num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),\n", + " num_training_steps=(len(train_dataloader) * num_epochs),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fa0e73be", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/115 [00:00 use the model max length (it's actually the default)\n", + " outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=None)\n", + " return outputs\n", + "\n", + "\n", + "tokenized_datasets = datasets.map(\n", + " tokenize_function,\n", + " batched=True,\n", + " remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n", + ")\n", + "\n", + "# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n", + "# transformers library\n", + "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")\n", + "\n", + "\n", + "def collate_fn(examples):\n", + " return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")\n", + "\n", + "\n", + "# Instantiate dataloaders.\n", + "train_dataloader = DataLoader(tokenized_datasets[\"train\"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)\n", + "eval_dataloader = DataLoader(\n", + " tokenized_datasets[\"validation\"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6bc8144", + "metadata": {}, + "outputs": [], + "source": [ + "model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True)\n", + "model = get_peft_model(model, peft_config)\n", + 
"model.print_trainable_parameters()\n", + "model" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "af41c571", + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = AdamW(params=model.parameters(), lr=lr)\n", + "\n", + "# Instantiate scheduler\n", + "lr_scheduler = get_linear_schedule_with_warmup(\n", + " optimizer=optimizer,\n", + " num_warmup_steps=0, # 0.06*(len(train_dataloader) * num_epochs),\n", + " num_training_steps=(len(train_dataloader) * num_epochs),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "90993c93", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/115 [00:00 use the model max length (it's actually the default)\n", + " outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=None)\n", + " return outputs\n", + "\n", + "\n", + "tokenized_datasets = datasets.map(\n", + " tokenize_function,\n", + " batched=True,\n", + " remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n", + ")\n", + "\n", + "# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n", + "# transformers library\n", + "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")\n", + "\n", + "\n", + "def collate_fn(examples):\n", + " return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")\n", + "\n", + "\n", + "# Instantiate dataloaders.\n", + "train_dataloader = DataLoader(tokenized_datasets[\"train\"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)\n", + "eval_dataloader = DataLoader(\n", + " tokenized_datasets[\"validation\"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3c15af0", + "metadata": {}, + "outputs": [], + "source": [ + "model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, 
return_dict=True)\n", + "model = get_peft_model(model, peft_config)\n", + "model.print_trainable_parameters()\n", + "model" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6d3c5edb", + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = AdamW(params=model.parameters(), lr=lr)\n", + "\n", + "# Instantiate scheduler\n", + "lr_scheduler = get_linear_schedule_with_warmup(\n", + " optimizer=optimizer,\n", + " num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),\n", + " num_training_steps=(len(train_dataloader) * num_epochs),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4d279225", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/115 [00:00" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "batch_size = 32\n", + "model_name_or_path = \"roberta-large\"\n", + "task = \"mrpc\"\n", + "peft_type = PeftType.VBLORA\n", + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "num_epochs = 20\n", + "rank = 4\n", + "max_length = 128\n", + "num_vectors = 90\n", + "vector_length = 256\n", + "torch.manual_seed(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0526f571", + "metadata": {}, + "outputs": [], + "source": [ + "peft_config = VBLoRAConfig(\n", + " task_type=\"SEQ_CLS\", \n", + " r=rank,\n", + " topk=2,\n", + " target_modules=['key', 'value', 'query', 'output.dense', 'intermediate.dense'],\n", + " num_vectors=num_vectors,\n", + " vector_length=vector_length,\n", + " save_only_topk_weights=True, # Set to True to reduce storage space. 
Note that the saved parameters cannot be used to resume training from checkpoints.\n", + " vblora_dropout=0.,\n", + ")\n", + "head_lr = 4e-3\n", + "vector_bank_lr = 1e-3\n", + "logits_lr = 1e-2" + ] + }, + { + "cell_type": "markdown", + "id": "c075c5d2-a457-4f37-a7f1-94fd0d277972", + "metadata": {}, + "source": [ + "## Loading data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7bb52cb4-d1c3-4b04-8bf0-f39ca88af139", + "metadata": {}, + "outputs": [], + "source": [ + "if any(k in model_name_or_path for k in (\"gpt\", \"opt\", \"bloom\")):\n", + " padding_side = \"left\"\n", + "else:\n", + " padding_side = \"right\"\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)\n", + "if getattr(tokenizer, \"pad_token_id\") is None:\n", + " tokenizer.pad_token_id = tokenizer.eos_token_id" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e69c5e1f-d27b-4264-a41e-fc9b99d025e6", + "metadata": {}, + "outputs": [], + "source": [ + "datasets = load_dataset(\"glue\", task)\n", + "metric = evaluate.load(\"glue\", task)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0209f778-c93b-40eb-a4e0-24c25db03980", + "metadata": {}, + "outputs": [], + "source": [ + "def tokenize_function(examples):\n", + " # max_length=None => use the model max length (it's actually the default)\n", + " outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=max_length)\n", + " return outputs\n", + "\n", + "\n", + "tokenized_datasets = datasets.map(\n", + " tokenize_function,\n", + " batched=True,\n", + " remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n", + ")\n", + "\n", + "# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n", + "# transformers library\n", + "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")" + ] + }, + { + "cell_type": "code", + "execution_count": 
7, + "id": "7453954e-982c-46f0-b09c-589776e6d6cb", + "metadata": {}, + "outputs": [], + "source": [ + "def collate_fn(examples):\n", + " return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")\n", + "\n", + "\n", + "# Instantiate dataloaders.\n", + "train_dataloader = DataLoader(tokenized_datasets[\"train\"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)\n", + "eval_dataloader = DataLoader(\n", + " tokenized_datasets[\"validation\"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f3b9b2e8-f415-4d0f-9fb4-436f1a3585ea", + "metadata": {}, + "source": [ + "## Preparing the VB-LoRA model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2ed5ac74", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 1,696,770 || all params: 357,058,564 || trainable%: 0.4752\n", + "VB-LoRA params to-be-saved (float32-equivalent): 33,408 || total params to-be-saved: 1,085,058\n" + ] + } + ], + "source": [ + "model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True, max_length=None)\n", + "model = get_peft_model(model, peft_config)\n", + "model.print_trainable_parameters()\n", + "model.print_savable_parameters()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0d2d0381", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS\n", + "from 
transformers.trainer_pt_utils import get_parameter_names\n", + "\n", + "decay_parameters = get_parameter_names(model, ALL_LAYERNORM_LAYERS)\n", + "decay_parameters = [name for name in decay_parameters if \"bias\" not in name]\n", + "vector_bank_parameters = [name for name, _ in model.named_parameters() if \"vector_bank\" in name]\n", + "logits_parameters = [name for name, _ in model.named_parameters() if \"logits\" in name ]\n", + "\n", + "optimizer_grouped_parameters = [\n", + " {\n", + " \"params\": [p for n, p in model.named_parameters() if n in decay_parameters and \\\n", + " n not in logits_parameters and n not in vector_bank_parameters],\n", + " \"weight_decay\": 0.1,\n", + " \"lr\": head_lr,\n", + " },\n", + " {\n", + " \"params\": [p for n, p in model.named_parameters() if n not in decay_parameters and \\\n", + " n not in logits_parameters and n not in vector_bank_parameters],\n", + " \"weight_decay\": 0.0,\n", + " \"lr\": head_lr,\n", + " },\n", + " {\n", + " \"params\": [p for n, p in model.named_parameters() if n in vector_bank_parameters],\n", + " \"lr\": vector_bank_lr,\n", + " \"weight_decay\": 0.0,\n", + " },\n", + " {\n", + " \"params\": [p for n, p in model.named_parameters() if n in logits_parameters],\n", + " \"lr\": logits_lr,\n", + " \"weight_decay\": 0.0,\n", + " },\n", + "]\n", + "\n", + "optimizer = AdamW(optimizer_grouped_parameters)\n", + "lr_scheduler = get_linear_schedule_with_warmup(\n", + " optimizer=optimizer,\n", + " num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),\n", + " num_training_steps=(len(train_dataloader) * num_epochs),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c0dd5aa8-977b-4ac0-8b96-884b17bcdd00", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "fa0e73be", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/115 [00:00" + ] + }, + "execution_count": 2, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "batch_size = 128\n", + "model_name_or_path = \"roberta-base\"\n", + "task = \"mrpc\"\n", + "peft_type = PeftType.VERA\n", + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "num_epochs = 5 # for best results, increase this number\n", + "rank = 8 # for best results, increase this number\n", + "max_length = 128\n", + "torch.manual_seed(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0526f571", + "metadata": {}, + "outputs": [], + "source": [ + "peft_config = VeraConfig(\n", + " task_type=\"SEQ_CLS\", \n", + " r=rank,\n", + " d_initial=0.1,\n", + " target_modules=[\"query\", \"value\", \"intermediate.dense\"],\n", + " save_projection=True,\n", + ")\n", + "head_lr = 1e-2\n", + "vera_lr = 2e-2" + ] + }, + { + "cell_type": "markdown", + "id": "c075c5d2-a457-4f37-a7f1-94fd0d277972", + "metadata": {}, + "source": [ + "## Loading data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7bb52cb4-d1c3-4b04-8bf0-f39ca88af139", + "metadata": {}, + "outputs": [], + "source": [ + "if any(k in model_name_or_path for k in (\"gpt\", \"opt\", \"bloom\")):\n", + " padding_side = \"left\"\n", + "else:\n", + " padding_side = \"right\"\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)\n", + "if getattr(tokenizer, \"pad_token_id\") is None:\n", + " tokenizer.pad_token_id = tokenizer.eos_token_id" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e69c5e1f-d27b-4264-a41e-fc9b99d025e6", + "metadata": {}, + "outputs": [], + "source": [ + "datasets = load_dataset(\"glue\", task)\n", + "metric = evaluate.load(\"glue\", task)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0209f778-c93b-40eb-a4e0-24c25db03980", + "metadata": {}, + "outputs": [], + "source": [ + "def tokenize_function(examples):\n", + " # max_length=None => use the model max 
length (it's actually the default)\n", + " outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=max_length)\n", + " return outputs\n", + "\n", + "\n", + "tokenized_datasets = datasets.map(\n", + " tokenize_function,\n", + " batched=True,\n", + " remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n", + ")\n", + "\n", + "# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n", + "# transformers library\n", + "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7453954e-982c-46f0-b09c-589776e6d6cb", + "metadata": {}, + "outputs": [], + "source": [ + "def collate_fn(examples):\n", + " return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")\n", + "\n", + "\n", + "# Instantiate dataloaders.\n", + "train_dataloader = DataLoader(tokenized_datasets[\"train\"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)\n", + "eval_dataloader = DataLoader(\n", + " tokenized_datasets[\"validation\"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f3b9b2e8-f415-4d0f-9fb4-436f1a3585ea", + "metadata": {}, + "source": [ + "## Preparing the VeRA model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2ed5ac74", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 647,714 || 
all params: 125,294,884 || trainable%: 0.5170\n" + ] + } + ], + "source": [ + "model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True, max_length=None)\n", + "model = get_peft_model(model, peft_config)\n", + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0d2d0381", + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = AdamW(\n", + " [\n", + " {\"params\": [p for n, p in model.named_parameters() if \"vera_lambda_\" in n], \"lr\": vera_lr},\n", + " {\"params\": [p for n, p in model.named_parameters() if \"classifier\" in n], \"lr\": head_lr},\n", + " ]\n", + ")\n", + "\n", + "# Instantiate scheduler\n", + "lr_scheduler = get_linear_schedule_with_warmup(\n", + " optimizer=optimizer,\n", + " num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),\n", + " num_training_steps=(len(train_dataloader) * num_epochs),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c0dd5aa8-977b-4ac0-8b96-884b17bcdd00", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "fa0e73be", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/29 [00:00 use the model max length (it's actually the default) + outputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=None) + return outputs + + def collate_fn(examples): + return tokenizer.pad(examples, padding="longest", return_tensors="pt") + + with accelerator.main_process_first(): + tokenized_datasets = datasets.map( + tokenize_function, + batched=True, + remove_columns=["idx", "sentence1", "sentence2"], + ) + + # We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the + # transformers library + tokenized_datasets = tokenized_datasets.rename_column("label", "labels") + + # Instantiate dataloaders. 
+ train_dataloader = DataLoader( + tokenized_datasets["train"], shuffle=True, collate_fn=collate_fn, batch_size=args.per_device_train_batch_size + ) + eval_dataloader = DataLoader( + tokenized_datasets["validation"], + shuffle=False, + collate_fn=collate_fn, + batch_size=args.per_device_eval_batch_size, + ) + + model = AutoModelForSequenceClassification.from_pretrained(args.model_name_or_path) + model = get_peft_model(model, peft_config) + model.print_trainable_parameters() + + if getattr(accelerator.state, "fsdp_plugin", None) is not None: + accelerator.state.fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(model) + model = accelerator.prepare(model) + + optimizer = AdamW(params=model.parameters(), lr=args.learning_rate) + + # Instantiate scheduler + lr_scheduler = get_linear_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=args.num_warmup_steps, + num_training_steps=(len(train_dataloader) * args.num_train_epochs), + ) + + if getattr(accelerator.state, "fsdp_plugin", None) is not None: + train_dataloader, eval_dataloader, optimizer, lr_scheduler = accelerator.prepare( + train_dataloader, eval_dataloader, optimizer, lr_scheduler + ) + else: + model, train_dataloader, eval_dataloader, optimizer, lr_scheduler = accelerator.prepare( + model, train_dataloader, eval_dataloader, optimizer, lr_scheduler + ) + + for epoch in range(args.num_train_epochs): + model.train() + for step, batch in enumerate(tqdm(train_dataloader)): + outputs = model(**batch) + loss = outputs.loss + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + model.eval() + samples_seen = 0 + for step, batch in enumerate(tqdm(eval_dataloader)): + with torch.no_grad(): + outputs = model(**batch) + predictions = outputs.logits.argmax(dim=-1) + predictions, references = accelerator.gather((predictions, batch["labels"])) + # If we are in a multiprocess environment, the last batch has duplicates + if accelerator.num_processes > 1: + if step == 
len(eval_dataloader) - 1: + predictions = predictions[: len(eval_dataloader.dataset) - samples_seen] + references = references[: len(eval_dataloader.dataset) - samples_seen] + else: + samples_seen += references.shape[0] + metric.add_batch( + predictions=predictions, + references=references, + ) + eval_metric = metric.compute() + accelerator.print(f"epoch {epoch}:", eval_metric) + + accelerator.wait_for_everyone() + unwrapped_model = accelerator.unwrap_model(model) + unwrapped_model.save_pretrained(args.output_dir, state_dict=accelerator.get_state_dict(model)) + if accelerator.is_main_process: + tokenizer.save_pretrained(args.output_dir) + + +if __name__ == "__main__": + main() diff --git a/peft/examples/sequence_classification/prefix_tuning.ipynb b/peft/examples/sequence_classification/prefix_tuning.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..bd2fc1fa1b29a795e93662124cfd3412b1c51d66 --- /dev/null +++ b/peft/examples/sequence_classification/prefix_tuning.ipynb @@ -0,0 +1,710 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a825ba6b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "===================================BUG REPORT===================================\n", + "Welcome to bitsandbytes. 
For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n", + "For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link\n", + "================================================================================\n", + "CUDA SETUP: CUDA runtime path found: /home/sourab/miniconda3/envs/ml/lib/libcudart.so\n", + "CUDA SETUP: Highest compute capability among GPUs detected: 7.5\n", + "CUDA SETUP: Detected CUDA version 117\n", + "CUDA SETUP: Loading binary /home/sourab/miniconda3/envs/ml/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...\n" + ] + } + ], + "source": [ + "import argparse\n", + "import os\n", + "\n", + "import torch\n", + "from torch.optim import AdamW\n", + "from torch.utils.data import DataLoader\n", + "from peft import (\n", + " get_peft_config,\n", + " get_peft_model,\n", + " get_peft_model_state_dict,\n", + " set_peft_model_state_dict,\n", + " PeftType,\n", + " PrefixTuningConfig,\n", + " PromptEncoderConfig,\n", + ")\n", + "\n", + "import evaluate\n", + "from datasets import load_dataset\n", + "from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bd7cbb2", + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 32\n", + "model_name_or_path = \"roberta-large\"\n", + "task = \"mrpc\"\n", + "peft_type = PeftType.PREFIX_TUNING\n", + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "num_epochs = 20" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "33d9b62e", + "metadata": {}, + "outputs": [], + "source": [ + "peft_config = PrefixTuningConfig(task_type=\"SEQ_CLS\", num_virtual_tokens=20)\n", + "lr = 1e-2" + ] + }, + { + 
"cell_type": "code", + "execution_count": 4, + "id": "152b6177", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset glue (/home/sourab/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "be1eddbb9a7d4e6dae32fd026e167f96", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3 [00:00 use the model max length (it's actually the default)\n", + " outputs = tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], truncation=True, max_length=None)\n", + " return outputs\n", + "\n", + "\n", + "tokenized_datasets = datasets.map(\n", + " tokenize_function,\n", + " batched=True,\n", + " remove_columns=[\"idx\", \"sentence1\", \"sentence2\"],\n", + ")\n", + "\n", + "# We also rename the 'label' column to 'labels' which is the expected name for labels by the models of the\n", + "# transformers library\n", + "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")\n", + "\n", + "\n", + "def collate_fn(examples):\n", + " return tokenizer.pad(examples, padding=\"longest\", return_tensors=\"pt\")\n", + "\n", + "\n", + "# Instantiate dataloaders.\n", + "train_dataloader = DataLoader(tokenized_datasets[\"train\"], shuffle=True, collate_fn=collate_fn, batch_size=batch_size)\n", + "eval_dataloader = DataLoader(\n", + " tokenized_datasets[\"validation\"], shuffle=False, collate_fn=collate_fn, batch_size=batch_size\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6bc8144", + "metadata": {}, + "outputs": [], + "source": [ + "model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True)\n", + "model = get_peft_model(model, peft_config)\n", + "model.print_trainable_parameters()\n", + "model" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": 
"af41c571", + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = AdamW(params=model.parameters(), lr=lr)\n", + "\n", + "# Instantiate scheduler\n", + "lr_scheduler = get_linear_schedule_with_warmup(\n", + " optimizer=optimizer,\n", + " num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),\n", + " num_training_steps=(len(train_dataloader) * num_epochs),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "90993c93", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/115 [00:00 + + +Optimization in Unsloth to speed up QLoRA finetuning while reducing GPU memory usage + +## Multi-GPU SFT with QLoRA +To speed up QLoRA finetuning when you have access to multiple GPUs, look at the launch command at [run_peft_multigpu.sh](https://github.com/huggingface/peft/blob/main/examples/sft/run_peft_multigpu.sh). This example to performs DDP on 8 GPUs. + +Note: +1. At present, `use_reentrant` needs to be `False` when using gradient checkpointing with Multi-GPU QLoRA else it will lead to errors. However, this leads to huge GPU memory consumption. + +## Multi-GPU SFT with LoRA and DeepSpeed +When you have access to multiple GPUs, it would be better to use normal LoRA with DeepSpeed/FSDP. To use LoRA with DeepSpeed, refer to the docs at [PEFT with DeepSpeed](https://huggingface.co/docs/peft/accelerate/deepspeed). + + +## Multi-GPU SFT with LoRA and FSDP +When you have access to multiple GPUs, it would be better to use normal LoRA with DeepSpeed/FSDP. To use LoRA with FSDP, refer to the docs at [PEFT with FSDP](https://huggingface.co/docs/peft/accelerate/fsdp). + + +## Multi-GPU SFT with LoRA and FSDP for GPTQModel: +As in [Multi-GPU SFT with LoRA and FSDP](https://github.com/huggingface/peft/blob/main/examples/sft/README.md#multi-gpu-sft-with-lora-and-fsdp), we also support other quantization methods like GPTQModel. 
You may need to install [GPTQModel](https://github.com/ModelCloud/GPTQModel) > v3.0.0 or from source. Here is the launch command for reference: [run_peft_fsdp_gptq.sh](https://github.com/huggingface/peft/blob/main/examples/sft/run_peft_fsdp_gptq.sh).
offload_param_device: none + zero3_init_flag: true + zero3_save_16bit_model: true + zero_stage: 3 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 2 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false \ No newline at end of file diff --git a/peft/examples/sft/configs/fsdp_config.yaml b/peft/examples/sft/configs/fsdp_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7cccb74ca0dfa1a916b5057ceee561271c301ec5 --- /dev/null +++ b/peft/examples/sft/configs/fsdp_config.yaml @@ -0,0 +1,25 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +fsdp_config: + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch: BACKWARD_PRE + fsdp_cpu_ram_efficient_loading: true + fsdp_forward_prefetch: false + fsdp_offload_params: false + fsdp_sharding_strategy: FULL_SHARD + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true + fsdp_use_orig_params: false +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false \ No newline at end of file diff --git a/peft/examples/sft/configs/fsdp_config_qlora.yaml b/peft/examples/sft/configs/fsdp_config_qlora.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f28a0f1046a735579045655dcdb9d3bf7c6ffdcc --- /dev/null +++ b/peft/examples/sft/configs/fsdp_config_qlora.yaml @@ -0,0 +1,25 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +fsdp_config: + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch: BACKWARD_PRE + fsdp_cpu_ram_efficient_loading: true + fsdp_forward_prefetch: false + fsdp_offload_params: true + 
fsdp_sharding_strategy: FULL_SHARD + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true + fsdp_use_orig_params: false +machine_rank: 0 +main_training_function: main +mixed_precision: 'no' +num_machines: 1 +num_processes: 2 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false \ No newline at end of file diff --git a/peft/examples/sft/requirements.txt b/peft/examples/sft/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..752b6976e62f4acff6193804c5422807bc5e8806 --- /dev/null +++ b/peft/examples/sft/requirements.txt @@ -0,0 +1,25 @@ +git+https://github.com/huggingface/transformers +git+https://github.com/huggingface/accelerate +git+https://github.com/huggingface/peft +git+https://github.com/huggingface/trl +git+https://github.com/huggingface/datatrove.git +unsloth[conda]@git+https://github.com/unslothai/unsloth.git +deepspeed +PyGithub +flash-attn +huggingface-hub +evaluate +datasets +bitsandbytes +einops +wandb +tensorboard +tiktoken +pandas +numpy +scipy +matplotlib +sentencepiece +nltk +xformers +hf_transfer \ No newline at end of file diff --git a/peft/examples/sft/requirements_colab.txt b/peft/examples/sft/requirements_colab.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8c375dc44bf85305fa53964537f4877210b10bd --- /dev/null +++ b/peft/examples/sft/requirements_colab.txt @@ -0,0 +1,25 @@ +git+https://github.com/huggingface/transformers +git+https://github.com/huggingface/accelerate +git+https://github.com/huggingface/peft +git+https://github.com/huggingface/trl +unsloth[colab_ampere] @ git+https://github.com/unslothai/unsloth.git +datasets +deepspeed +PyGithub +flash-attn +huggingface-hub +evaluate +bitsandbytes +einops +wandb +tensorboard +tiktoken +pandas +numpy +scipy +matplotlib +sentencepiece +nltk +xformers +git+https://github.com/huggingface/datatrove.git +hf_transfer \ No newline at end of file diff --git 
a/peft/examples/sft/requirements_xpu.txt b/peft/examples/sft/requirements_xpu.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e5918b95180acbdc5789115ba0af71c8adf3aa5 --- /dev/null +++ b/peft/examples/sft/requirements_xpu.txt @@ -0,0 +1,22 @@ +git+https://github.com/huggingface/transformers +git+https://github.com/huggingface/accelerate +git+https://github.com/huggingface/peft +git+https://github.com/huggingface/trl +git+https://github.com/huggingface/datatrove.git +deepspeed +PyGithub +huggingface-hub +evaluate +datasets +bitsandbytes +einops +wandb +tensorboard +tiktoken +pandas +numpy +scipy +matplotlib +sentencepiece +nltk +hf_transfer \ No newline at end of file diff --git a/peft/examples/sft/run_peft.sh b/peft/examples/sft/run_peft.sh new file mode 100644 index 0000000000000000000000000000000000000000..8aa48648d34636eb07cd25c0abd91b943c00b49f --- /dev/null +++ b/peft/examples/sft/run_peft.sh @@ -0,0 +1,41 @@ +python train.py \ +--seed 100 \ +--model_name_or_path "mistralai/Mistral-7B-v0.1" \ +--dataset_name "smangrul/ultrachat-10k-chatml" \ +--chat_template_format "chatml" \ +--add_special_tokens False \ +--append_concat_token False \ +--splits "train,test" \ +--max_seq_len 2048 \ +--num_train_epochs 1 \ +--logging_steps 5 \ +--log_level "info" \ +--logging_strategy "steps" \ +--eval_strategy "epoch" \ +--save_strategy "epoch" \ +--push_to_hub \ +--hub_private_repo True \ +--hub_strategy "every_save" \ +--bf16 True \ +--packing True \ +--learning_rate 1e-4 \ +--lr_scheduler_type "cosine" \ +--weight_decay 1e-4 \ +--warmup_ratio 0.0 \ +--max_grad_norm 1.0 \ +--output_dir "mistral-sft-lora" \ +--per_device_train_batch_size 8 \ +--per_device_eval_batch_size 8 \ +--gradient_accumulation_steps 8 \ +--gradient_checkpointing True \ +--use_reentrant True \ +--dataset_text_field "content" \ +--use_peft_lora True \ +--lora_r 8 \ +--lora_alpha 16 \ +--lora_dropout 0.1 \ +--lora_target_modules "all-linear" \ +--use_4bit_quantization True \ 
+--use_nested_quant True \ +--bnb_4bit_compute_dtype "bfloat16" \ +--use_flash_attn True diff --git a/peft/examples/sft/run_peft_deepspeed.sh b/peft/examples/sft/run_peft_deepspeed.sh new file mode 100644 index 0000000000000000000000000000000000000000..95dbf08892db530b6b5fac89c9bba5995cb6ea0b --- /dev/null +++ b/peft/examples/sft/run_peft_deepspeed.sh @@ -0,0 +1,39 @@ +accelerate launch --config_file "configs/deepspeed_config.yaml" train.py \ +--seed 100 \ +--model_name_or_path "meta-llama/Llama-2-70b-hf" \ +--dataset_name "smangrul/ultrachat-10k-chatml" \ +--chat_template_format "chatml" \ +--add_special_tokens False \ +--append_concat_token False \ +--splits "train,test" \ +--max_seq_len 2048 \ +--num_train_epochs 1 \ +--logging_steps 5 \ +--log_level "info" \ +--logging_strategy "steps" \ +--eval_strategy "epoch" \ +--save_strategy "epoch" \ +--push_to_hub \ +--hub_private_repo True \ +--hub_strategy "every_save" \ +--bf16 True \ +--packing True \ +--learning_rate 1e-4 \ +--lr_scheduler_type "cosine" \ +--weight_decay 1e-4 \ +--warmup_ratio 0.0 \ +--max_grad_norm 1.0 \ +--output_dir "mistral-sft-lora-deepspeed" \ +--per_device_train_batch_size 8 \ +--per_device_eval_batch_size 8 \ +--gradient_accumulation_steps 4 \ +--gradient_checkpointing True \ +--use_reentrant False \ +--dataset_text_field "content" \ +--use_flash_attn True \ +--use_peft_lora True \ +--lora_r 8 \ +--lora_alpha 16 \ +--lora_dropout 0.1 \ +--lora_target_modules "all-linear" \ +--use_4bit_quantization False \ No newline at end of file diff --git a/peft/examples/sft/run_peft_fsdp.sh b/peft/examples/sft/run_peft_fsdp.sh new file mode 100644 index 0000000000000000000000000000000000000000..63dd475f44ad062f6241ebee9c4fa9047bcace2b --- /dev/null +++ b/peft/examples/sft/run_peft_fsdp.sh @@ -0,0 +1,39 @@ +accelerate launch --config_file "configs/fsdp_config.yaml" train.py \ +--seed 100 \ +--model_name_or_path "meta-llama/Llama-2-70b-hf" \ +--dataset_name "smangrul/ultrachat-10k-chatml" \ 
+--chat_template_format "chatml" \ +--add_special_tokens False \ +--append_concat_token False \ +--splits "train,test" \ +--max_seq_len 2048 \ +--num_train_epochs 1 \ +--logging_steps 5 \ +--log_level "info" \ +--logging_strategy "steps" \ +--eval_strategy "epoch" \ +--save_strategy "epoch" \ +--push_to_hub \ +--hub_private_repo True \ +--hub_strategy "every_save" \ +--bf16 True \ +--packing True \ +--learning_rate 1e-4 \ +--lr_scheduler_type "cosine" \ +--weight_decay 1e-4 \ +--warmup_ratio 0.0 \ +--max_grad_norm 1.0 \ +--output_dir "mistral-sft-lora-fsdp" \ +--per_device_train_batch_size 8 \ +--per_device_eval_batch_size 8 \ +--gradient_accumulation_steps 4 \ +--gradient_checkpointing True \ +--use_reentrant False \ +--dataset_text_field "content" \ +--use_flash_attn True \ +--use_peft_lora True \ +--lora_r 8 \ +--lora_alpha 16 \ +--lora_dropout 0.1 \ +--lora_target_modules "all-linear" \ +--use_4bit_quantization False \ No newline at end of file diff --git a/peft/examples/sft/run_peft_fsdp_gptq.sh b/peft/examples/sft/run_peft_fsdp_gptq.sh new file mode 100644 index 0000000000000000000000000000000000000000..479a7eac834aea664f02c6b6ae6de359290ed3dd --- /dev/null +++ b/peft/examples/sft/run_peft_fsdp_gptq.sh @@ -0,0 +1,36 @@ +accelerate launch --config_file "configs/fsdp_config.yaml" train.py \ +--seed 100 \ +--model_name_or_path "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4" \ +--dataset_name "smangrul/ultrachat-10k-chatml" \ +--chat_template_format "chatml" \ +--add_special_tokens False \ +--append_concat_token False \ +--splits "train,test" \ +--max_seq_len 2048 \ +--num_train_epochs 1 \ +--logging_steps 5 \ +--log_level "info" \ +--logging_strategy "steps" \ +--eval_strategy "epoch" \ +--save_strategy "epoch" \ +--bf16 True \ +--packing True \ +--learning_rate 1e-4 \ +--lr_scheduler_type "cosine" \ +--weight_decay 1e-4 \ +--warmup_ratio 0.0 \ +--max_grad_norm 1.0 \ +--output_dir "llama3-8B-gptq-sft-lora-fsdp" \ +--per_device_train_batch_size 8 \ 
+--per_device_eval_batch_size 8 \ +--gradient_accumulation_steps 4 \ +--gradient_checkpointing True \ +--use_reentrant False \ +--dataset_text_field "content" \ +--use_flash_attn True \ +--use_peft_lora True \ +--lora_r 8 \ +--lora_alpha 16 \ +--lora_dropout 0.1 \ +--lora_target_modules "q_proj,k_proj,v_proj,o_proj,up_proj,gate_proj" \ +--use_4bit_quantization False \ No newline at end of file diff --git a/peft/examples/sft/run_peft_multigpu.sh b/peft/examples/sft/run_peft_multigpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..dbd108d0e055c4fad1076068d15ee1560faa96ad --- /dev/null +++ b/peft/examples/sft/run_peft_multigpu.sh @@ -0,0 +1,41 @@ +torchrun --nproc_per_node 8 --nnodes 1 train.py \ +--seed 100 \ +--model_name_or_path "mistralai/Mistral-7B-v0.1" \ +--dataset_name "smangrul/ultrachat-10k-chatml" \ +--chat_template_format "chatml" \ +--add_special_tokens False \ +--append_concat_token False \ +--splits "train,test" \ +--max_seq_len 2048 \ +--num_train_epochs 1 \ +--logging_steps 5 \ +--log_level "info" \ +--logging_strategy "steps" \ +--eval_strategy "epoch" \ +--save_strategy "epoch" \ +--push_to_hub \ +--hub_private_repo True \ +--hub_strategy "every_save" \ +--bf16 True \ +--packing True \ +--learning_rate 1e-4 \ +--lr_scheduler_type "cosine" \ +--weight_decay 1e-4 \ +--warmup_ratio 0.0 \ +--max_grad_norm 1.0 \ +--output_dir "mistral-sft-lora-multigpu" \ +--per_device_train_batch_size 8 \ +--per_device_eval_batch_size 8 \ +--gradient_accumulation_steps 8 \ +--gradient_checkpointing True \ +--use_reentrant False \ +--dataset_text_field "content" \ +--use_peft_lora True \ +--lora_r 8 \ +--lora_alpha 16 \ +--lora_dropout 0.1 \ +--lora_target_modules "all-linear" \ +--use_4bit_quantization True \ +--use_nested_quant True \ +--bnb_4bit_compute_dtype "bfloat16" \ +--use_flash_attn True diff --git a/peft/examples/sft/run_peft_qlora_deepspeed_stage3.sh b/peft/examples/sft/run_peft_qlora_deepspeed_stage3.sh new file mode 100644 index 
0000000000000000000000000000000000000000..4bbc1bbcc4bd06351add38d7d5142da02787d5d4 --- /dev/null +++ b/peft/examples/sft/run_peft_qlora_deepspeed_stage3.sh @@ -0,0 +1,42 @@ +accelerate launch --config_file "configs/deepspeed_config_z3_qlora.yaml" train.py \ +--seed 100 \ +--model_name_or_path "meta-llama/Llama-2-70b-hf" \ +--dataset_name "smangrul/ultrachat-10k-chatml" \ +--chat_template_format "chatml" \ +--add_special_tokens False \ +--append_concat_token False \ +--splits "train,test" \ +--max_seq_len 2048 \ +--num_train_epochs 1 \ +--logging_steps 5 \ +--log_level "info" \ +--logging_strategy "steps" \ +--eval_strategy "epoch" \ +--save_strategy "epoch" \ +--push_to_hub \ +--hub_private_repo True \ +--hub_strategy "every_save" \ +--bf16 True \ +--packing True \ +--learning_rate 1e-4 \ +--lr_scheduler_type "cosine" \ +--weight_decay 1e-4 \ +--warmup_ratio 0.0 \ +--max_grad_norm 1.0 \ +--output_dir "llama-sft-qlora-dsz3" \ +--per_device_train_batch_size 2 \ +--per_device_eval_batch_size 2 \ +--gradient_accumulation_steps 2 \ +--gradient_checkpointing True \ +--use_reentrant True \ +--dataset_text_field "content" \ +--use_flash_attn True \ +--use_peft_lora True \ +--lora_r 8 \ +--lora_alpha 16 \ +--lora_dropout 0.1 \ +--lora_target_modules "all-linear" \ +--use_4bit_quantization True \ +--use_nested_quant True \ +--bnb_4bit_compute_dtype "bfloat16" \ +--bnb_4bit_quant_storage_dtype "bfloat16" \ No newline at end of file diff --git a/peft/examples/sft/run_peft_qlora_fsdp.sh b/peft/examples/sft/run_peft_qlora_fsdp.sh new file mode 100644 index 0000000000000000000000000000000000000000..4ed3218c8277fd0ab8f66c2b3ea3f2fe41e7fe04 --- /dev/null +++ b/peft/examples/sft/run_peft_qlora_fsdp.sh @@ -0,0 +1,42 @@ +accelerate launch --config_file "configs/fsdp_config_qlora.yaml" train.py \ +--seed 100 \ +--model_name_or_path "meta-llama/Llama-2-70b-hf" \ +--dataset_name "smangrul/ultrachat-10k-chatml" \ +--chat_template_format "chatml" \ +--add_special_tokens False \ 
+--append_concat_token False \ +--splits "train,test" \ +--max_seq_len 2048 \ +--num_train_epochs 1 \ +--logging_steps 5 \ +--log_level "info" \ +--logging_strategy "steps" \ +--eval_strategy "epoch" \ +--save_strategy "epoch" \ +--push_to_hub \ +--hub_private_repo True \ +--hub_strategy "every_save" \ +--bf16 True \ +--packing True \ +--learning_rate 1e-4 \ +--lr_scheduler_type "cosine" \ +--weight_decay 1e-4 \ +--warmup_ratio 0.0 \ +--max_grad_norm 1.0 \ +--output_dir "llama-sft-qlora-fsdp" \ +--per_device_train_batch_size 2 \ +--per_device_eval_batch_size 2 \ +--gradient_accumulation_steps 2 \ +--gradient_checkpointing True \ +--use_reentrant True \ +--dataset_text_field "content" \ +--use_flash_attn True \ +--use_peft_lora True \ +--lora_r 8 \ +--lora_alpha 16 \ +--lora_dropout 0.1 \ +--lora_target_modules "all-linear" \ +--use_4bit_quantization True \ +--use_nested_quant True \ +--bnb_4bit_compute_dtype "bfloat16" \ +--bnb_4bit_quant_storage_dtype "bfloat16" \ No newline at end of file diff --git a/peft/examples/sft/run_unsloth_peft.sh b/peft/examples/sft/run_unsloth_peft.sh new file mode 100644 index 0000000000000000000000000000000000000000..97a4a6b520e1112a85cd25387f3cb46deb465f9e --- /dev/null +++ b/peft/examples/sft/run_unsloth_peft.sh @@ -0,0 +1,42 @@ +python train.py \ +--seed 100 \ +--model_name_or_path "mistralai/Mistral-7B-v0.1" \ +--dataset_name "smangrul/ultrachat-10k-chatml" \ +--chat_template_format "chatml" \ +--add_special_tokens False \ +--append_concat_token False \ +--splits "train,test" \ +--max_seq_len 2048 \ +--num_train_epochs 1 \ +--logging_steps 5 \ +--log_level "info" \ +--logging_strategy "steps" \ +--eval_strategy "epoch" \ +--save_strategy "epoch" \ +--push_to_hub \ +--hub_private_repo True \ +--hub_strategy "every_save" \ +--bf16 True \ +--packing True \ +--learning_rate 1e-4 \ +--lr_scheduler_type "cosine" \ +--weight_decay 1e-4 \ +--warmup_ratio 0.0 \ +--max_grad_norm 1.0 \ +--output_dir "mistral-sft-lora-unsloth" \ 
+--per_device_train_batch_size 8 \ +--per_device_eval_batch_size 8 \ +--gradient_accumulation_steps 8 \ +--gradient_checkpointing True \ +--use_reentrant True \ +--dataset_text_field "content" \ +--use_peft_lora True \ +--use_unsloth True \ +--lora_r 8 \ +--lora_alpha 16 \ +--lora_dropout 0.1 \ +--lora_target_modules "q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj" \ +--use_4bit_quantization True \ +--use_nested_quant True \ +--bnb_4bit_compute_dtype "bfloat16" \ +--use_flash_attn True diff --git a/peft/examples/sft/train.py b/peft/examples/sft/train.py new file mode 100644 index 0000000000000000000000000000000000000000..5a34f69357b0505e84b16436e894a2ab924ce1a7 --- /dev/null +++ b/peft/examples/sft/train.py @@ -0,0 +1,159 @@ +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +from transformers import HfArgumentParser, set_seed +from trl import SFTConfig, SFTTrainer +from utils import create_and_prepare_model, create_datasets + + +# Define and parse arguments. +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + max_seq_length: Optional[int] = field( + default=512, + metadata={"help": "The maximum total input sequence length after tokenization."}, + ) + chat_template_format: Optional[str] = field( + default="none", + metadata={ + "help": "chatml|zephyr|none. Pass `none` if the dataset is already formatted with the chat template." 
+ }, + ) + lora_alpha: Optional[int] = field(default=16) + lora_dropout: Optional[float] = field(default=0.1) + lora_r: Optional[int] = field(default=64) + lora_target_modules: Optional[str] = field( + default="q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj", + metadata={"help": "comma separated list of target modules to apply LoRA layers to"}, + ) + use_nested_quant: Optional[bool] = field( + default=False, + metadata={"help": "Activate nested quantization for 4bit base models"}, + ) + bnb_4bit_compute_dtype: Optional[str] = field( + default="float16", + metadata={"help": "Compute dtype for 4bit base models"}, + ) + bnb_4bit_quant_storage_dtype: Optional[str] = field( + default="uint8", + metadata={"help": "Quantization storage dtype for 4bit base models"}, + ) + bnb_4bit_quant_type: Optional[str] = field( + default="nf4", + metadata={"help": "Quantization type fp4 or nf4"}, + ) + use_flash_attn: Optional[bool] = field( + default=False, + metadata={"help": "Enables Flash attention for training."}, + ) + use_peft_lora: Optional[bool] = field( + default=False, + metadata={"help": "Enables PEFT LoRA for training."}, + ) + use_8bit_quantization: Optional[bool] = field( + default=False, + metadata={"help": "Enables loading model in 8bit."}, + ) + use_4bit_quantization: Optional[bool] = field( + default=False, + metadata={"help": "Enables loading model in 4bit."}, + ) + use_reentrant: Optional[bool] = field( + default=False, + metadata={"help": "Gradient Checkpointing param. 
def main(model_args, data_args, training_args):
    """Run supervised fine-tuning end to end: build model, data and trainer, train, save.

    Args:
        model_args: Parsed model options; this function reads ``use_unsloth``,
            ``use_reentrant`` and ``chat_template_format`` and forwards the object
            to ``create_and_prepare_model``.
        data_args: Parsed data options; ``append_concat_token`` and
            ``add_special_tokens`` are copied into ``training_args.dataset_kwargs``.
        training_args: Trainer configuration. It is mutated in place
            (``gradient_checkpointing``, ``gradient_checkpointing_kwargs``,
            ``dataset_kwargs``) before being handed to ``SFTTrainer``.
    """
    # Seed first so that everything below is reproducible.
    set_seed(training_args.seed)

    model, peft_config, tokenizer = create_and_prepare_model(model_args, data_args, training_args)

    # The KV cache is disabled whenever checkpointing was requested. Trainer-level
    # checkpointing itself is cleared for Unsloth runs (presumably because the
    # Unsloth model is already set up for it -- confirm in create_and_prepare_model).
    checkpointing_requested = training_args.gradient_checkpointing
    model.config.use_cache = not checkpointing_requested
    training_args.gradient_checkpointing = checkpointing_requested and not model_args.use_unsloth
    if training_args.gradient_checkpointing:
        training_args.gradient_checkpointing_kwargs = {"use_reentrant": model_args.use_reentrant}

    training_args.dataset_kwargs = {
        option: getattr(data_args, option) for option in ("append_concat_token", "add_special_tokens")
    }

    # Samples are only rendered through the chat template when a format was selected.
    needs_chat_template = model_args.chat_template_format != "none"
    train_dataset, eval_dataset = create_datasets(
        tokenizer,
        data_args,
        training_args,
        apply_chat_template=needs_chat_template,
    )

    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=peft_config,
    )
    trainer.accelerator.print(f"{trainer.model}")
    if hasattr(trainer.model, "print_trainable_parameters"):
        trainer.model.print_trainable_parameters()

    # ``resume_from_checkpoint`` is passed straight through; ``None`` means a fresh run.
    trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)

    # With FSDP the state dict type is switched to FULL_STATE_DICT before saving.
    if trainer.is_fsdp_enabled:
        trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
    trainer.save_model()
class ZephyrSpecialTokens(str, Enum):
    """Special tokens registered on the tokenizer for the Zephyr chat format.

    Every member needs a distinct, non-empty value: ``Enum`` turns members that
    share a value into aliases of the first one, so empty-string placeholders for
    ``eos_token``/``bos_token``/``pad_token`` would collapse into a single member
    and hand the tokenizer an empty string as its pad/bos/eos token.
    """

    user = "<|user|>"
    assistant = "<|assistant|>"
    system = "<|system|>"
    eos_token = "</s>"
    bos_token = "<s>"
    pad_token = "<pad>"

    @classmethod
    def list(cls):
        """Return the string value of every member, in definition order."""
        return [c.value for c in cls]


class ChatmlSpecialTokens(str, Enum):
    """Special tokens registered on the tokenizer for the ChatML chat format.

    As for the Zephyr tokens, all values are distinct and non-empty so that no
    member silently becomes an alias of another one.
    """

    user = "<|im_start|>user"
    assistant = "<|im_start|>assistant"
    system = "<|im_start|>system"
    eos_token = "<|im_end|>"
    bos_token = "<s>"
    pad_token = "<pad>"

    @classmethod
    def list(cls):
        """Return the string value of every member, in definition order."""
        return [c.value for c in cls]
def _xpu_is_available():
    """Return True when this PyTorch build exposes ``torch.xpu`` and an XPU is usable."""
    xpu = getattr(torch, "xpu", None)
    return xpu is not None and xpu.is_available()


def _split_target_modules(spec):
    """Return ``"all-linear"`` unchanged, otherwise the comma separated names as a list."""
    return spec if spec == "all-linear" else spec.split(",")


def create_and_prepare_model(args, data_args, training_args):
    """Load the base model and tokenizer and build the optional LoRA configuration.

    Args:
        args: Model options. Read here: ``model_name_or_path``, ``use_unsloth``,
            ``use_4bit_quantization``, ``use_8bit_quantization``, ``use_nested_quant``,
            the ``bnb_4bit_*`` settings, ``use_flash_attn``, ``use_peft_lora``, the
            ``lora_*`` settings and ``chat_template_format``.
        data_args: Not used by this function; kept so the call signature is stable.
        training_args: Training configuration. Read here: ``gradient_checkpointing``,
            ``seed`` and, when present, ``max_seq_length``.

    Returns:
        A ``(model, peft_config, tokenizer)`` tuple. ``peft_config`` is ``None`` unless
        ``use_peft_lora`` is set and Unsloth is not used; with Unsloth the LoRA weights
        are attached to the returned model directly.

    Raises:
        NotImplementedError: If Unsloth is requested in a multi-process run or on XPU.
    """
    if args.use_unsloth:
        from unsloth import FastLanguageModel

    if (
        args.use_unsloth
        and torch.distributed.is_available()
        and torch.distributed.is_initialized()
        and torch.distributed.get_world_size() > 1
    ):
        raise NotImplementedError("Unsloth is not supported in distributed training")

    bnb_config = None
    quant_storage_dtype = None
    if args.use_4bit_quantization:
        compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
        quant_storage_dtype = getattr(torch, args.bnb_4bit_quant_storage_dtype)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=args.use_4bit_quantization,
            bnb_4bit_quant_type=args.bnb_4bit_quant_type,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=args.use_nested_quant,
            bnb_4bit_quant_storage=quant_storage_dtype,
        )

        # Purely informational hint. Querying the device capability is only valid
        # when CUDA is present, so hosts without CUDA skip the hint instead of failing.
        if compute_dtype == torch.float16 and torch.cuda.is_available():
            major, _ = torch.cuda.get_device_capability()
            if major >= 8:
                print("=" * 80)
                print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
                print("=" * 80)
    elif args.use_8bit_quantization:
        bnb_config = BitsAndBytesConfig(load_in_8bit=args.use_8bit_quantization)

    # Keep the original lookup on the training config, but fall back to the value
    # on the model arguments (the accompanying train.py declares ``max_seq_length``
    # there) when the training config does not provide one.
    max_seq_length = getattr(training_args, "max_seq_length", None)
    if max_seq_length is None:
        max_seq_length = getattr(args, "max_seq_length", None)

    if args.use_unsloth:
        if _xpu_is_available():
            raise NotImplementedError("XPU hasn't supported unsloth yet")
        # Load model
        model, _ = FastLanguageModel.from_pretrained(
            model_name=args.model_name_or_path,
            max_seq_length=max_seq_length,
            dtype=None,
            load_in_4bit=args.use_4bit_quantization,
        )
    else:
        # Use the quantization storage dtype for the remaining weights only when it
        # is a floating point type; otherwise load in float32.
        torch_dtype = (
            quant_storage_dtype if quant_storage_dtype and quant_storage_dtype.is_floating_point else torch.float32
        )

        # Prepare model loading arguments
        model_kwargs = {
            "trust_remote_code": True,
            "torch_dtype": torch_dtype,
        }
        if args.use_flash_attn:
            if _xpu_is_available():
                print("XPU hasn't supported flash_attn yet, use eager implementation instead.")
                model_kwargs["attn_implementation"] = "eager"
            else:
                model_kwargs["attn_implementation"] = "flash_attention_2"

        # Only add quantization_config if bnb_config is not None
        if bnb_config is not None:
            model_kwargs["quantization_config"] = bnb_config

        model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, **model_kwargs)

    peft_config = None
    if args.use_peft_lora and not args.use_unsloth:
        peft_config = LoraConfig(
            lora_alpha=args.lora_alpha,
            lora_dropout=args.lora_dropout,
            r=args.lora_r,
            bias="none",
            task_type="CAUSAL_LM",
            target_modules=_split_target_modules(args.lora_target_modules),
        )

    special_tokens = None
    chat_template = None
    if args.chat_template_format == "chatml":
        special_tokens = ChatmlSpecialTokens
        chat_template = DEFAULT_CHATML_CHAT_TEMPLATE
    elif args.chat_template_format == "zephyr":
        special_tokens = ZephyrSpecialTokens
        chat_template = DEFAULT_ZEPHYR_CHAT_TEMPLATE

    if special_tokens is not None:
        tokenizer = AutoTokenizer.from_pretrained(
            args.model_name_or_path,
            pad_token=special_tokens.pad_token.value,
            bos_token=special_tokens.bos_token.value,
            eos_token=special_tokens.eos_token.value,
            additional_special_tokens=special_tokens.list(),
            trust_remote_code=True,
        )
        tokenizer.chat_template = chat_template

        # make embedding resizing configurable?
        # Transformers 4.46.0+ uses mean_resizing by default, which fails with QLoRA + FSDP because the
        # embedding could be on meta device, therefore, we set mean_resizing=False in that case (i.e. the
        # status quo ante). See https://github.com/huggingface/accelerate/issues/1620.
        uses_transformers_4_46 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.46.0")
        uses_fsdp = os.environ.get("ACCELERATE_USE_FSDP", "false").lower() == "true"
        # Check if the model is quantized
        is_quantized = (bnb_config is not None) or (getattr(model, "hf_quantizer", None) is not None)
        resize_kwargs = {"pad_to_multiple_of": 8}
        if is_quantized and uses_fsdp and uses_transformers_4_46:
            resize_kwargs["mean_resizing"] = False
        model.resize_token_embeddings(len(tokenizer), **resize_kwargs)
    else:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True)
        tokenizer.pad_token = tokenizer.eos_token

    if args.use_unsloth:
        # Do model patching and add fast LoRA weights
        model = FastLanguageModel.get_peft_model(
            model,
            lora_alpha=args.lora_alpha,
            lora_dropout=args.lora_dropout,
            r=args.lora_r,
            target_modules=_split_target_modules(args.lora_target_modules),
            use_gradient_checkpointing=training_args.gradient_checkpointing,
            random_state=training_args.seed,
            max_seq_length=max_seq_length,
        )

    return model, peft_config, tokenizer
SHiRA directly finetunes a small number of the base model's parameters to adapt the model to a new task. + +## Quick start +```python +import torch +from peft import ShiraConfig, get_peft_model +from transformers import AutoTokenizer, AutoModelForCausalLM +from trl import SFTConfig, SFTTrainer +from datasets import load_dataset + +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.bfloat16, device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +dataset = load_dataset("imdb", split="train[:1%]") +shira_config = ShiraConfig( + r=32, +) +peft_model = get_peft_model(model, shira_config) +training_args = SFTConfig(dataset_text_field="text", max_seq_length=128) +trainer = SFTTrainer( + model=peft_model, + train_dataset=dataset, + processing_class=tokenizer, +) +trainer.train() +peft_model.save_pretrained("shira-opt-350m") +``` + +For more options and a more detailed example code, you can refer to the SHiRA finetuning script. +Run the script simply by running: +```bash +python3 examples/shira_finetuning/shira_finetuning.py --base_model facebook/opt-350m +``` + +If you want to run DDP by [accelerate](https://huggingface.co/docs/accelerate/en/index), please run `accelerate config` to set your ddp config, and run: +```bash +accelerate launch examples/shira_finetuning/shira_finetuning.py --base_model facebook/opt-350m +``` +Please add `--device_map cpu` if you want to run finetuning on CPU. + +If you want to train SHiRA with a custom sparse mask function which requires custom keyword arguments, please see the definition of `custom_random_mask_function_with_custom_kwargs` function provided in the `shira_finetuning.py` script. You can run this code using the `--use_custom_random_mask_function_with_custom_kwargs` argument. Without this argument, SHiRA defaults to a random sparse mask. Please run the code as follows. 
: +```bash +python3 examples/shira_finetuning/shira_finetuning.py --base_model facebook/opt-350m --use_custom_random_mask_function_with_custom_kwargs + +``` + + +## Use the model +You can load and use the model as any other 🤗 PEFT model +```python +from peft import PeftModel +from transformers import AutoTokenizer, AutoModelForCausalLM +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m") +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +shira_model = PeftModel.from_pretrained(model, "shira-opt-350m") +``` + +## Citation +``` +@inproceedings{NEURIPS2024_18c0102c, + author = {Bhardwaj, Kartikeya and Pandey, Nilesh Prasad and Priyadarshi, Sweta and Ganapathy, Viswanath and Kadambi, Shreya and Esteves, Rafael and Borse, Shubhankar and Whatmough, Paul and Garrepalli, Risheek and Van Baalen, Mart and Teague, Harris and Nagel, Markus}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {A. Globerson and L. Mackey and D. Belgrave and A. Fan and U. Paquet and J. Tomczak and C. Zhang}, + pages = {13685--13715}, + publisher = {Curran Associates, Inc.}, + title = {Sparse High Rank Adapters}, + url = {https://proceedings.neurips.cc/paper_files/paper/2024/file/18c0102cb7f1a02c14f0929089b2e576-Paper-Conference.pdf}, + volume = {37}, + year = {2024} +} +``` diff --git a/peft/examples/shira_finetuning/shira_finetuning.py b/peft/examples/shira_finetuning/shira_finetuning.py new file mode 100644 index 0000000000000000000000000000000000000000..b1f32d7cb62412fb79dbeebdfd6a7cc6998eb386 --- /dev/null +++ b/peft/examples/shira_finetuning/shira_finetuning.py @@ -0,0 +1,217 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def train(
    base_model: str = "path/to/model",
    data_path: str = "yahma/alpaca-cleaned",
    output_dir: str = "shira",
    batch_size: int = 16,
    num_epochs: int = 1,
    learning_rate: float = 3e-4,
    cutoff_len: int = 256,
    val_set_size: int = 16,
    eval_step: int = 100,
    save_step: int = 100,
    device_map: str = "auto",
    shira_r: int = 32,
    shira_target_modules: list[str] = None,
    torch_dtype: str = "float16",
    seed: Optional[int] = None,
    use_custom_random_mask_function_with_custom_kwargs: Optional[bool] = False,
):
    """Finetune a causal LM with a SHiRA adapter, save the adapter and load it back once.

    The base model is wrapped with ``ShiraConfig``/``get_peft_model``, trained with
    ``transformers.Trainer`` on prompts built by ``generate_prompt``, and the adapter
    is written to ``output_dir``. Afterwards the base model is loaded again and the
    saved adapter is attached to it.

    Args:
        base_model: Model id or path given to ``from_pretrained``.
        data_path: Dataset id or path given to ``load_dataset``; its ``train`` split is used.
        output_dir: Directory for checkpoints and the final adapter.
        batch_size: Per-device training batch size.
        num_epochs: Number of training epochs.
        learning_rate: Learning rate for the ``adamw_torch`` optimizer.
        cutoff_len: Maximum number of tokens per sample (longer prompts are truncated).
        val_set_size: ``test_size`` used when splitting off the validation set.
        eval_step: Evaluate every this many steps.
        save_step: Save a checkpoint every this many steps.
        device_map: ``device_map`` for model loading; replaced by the process index
            when more than one process is running and the value is not ``"cpu"``.
        shira_r: The ``r`` value of the SHiRA config.
        shira_target_modules: Forwarded unchanged as ``target_modules``.
        torch_dtype: Name of the ``torch`` dtype used to load the model.
        seed: When given, passed to ``set_seed``.
        use_custom_random_mask_function_with_custom_kwargs: Use the example custom
            mask function instead of the built-in random mask.
    """
    # Set device_map to the right place when enabling DDP.
    world_size = int(os.environ.get("WORLD_SIZE", 0)) or int(os.environ.get("PMI_SIZE", 0))
    multi_process = world_size > 1
    if multi_process and device_map != "cpu":
        from accelerate import Accelerator

        device_map = {"": Accelerator().process_index}

    if seed is not None:
        set_seed(seed)

    model_kwargs = {"torch_dtype": getattr(torch, torch_dtype), "device_map": device_map}
    model = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
    # Some tokenizers (e.g. llama) ship without a pad token; reuse EOS for padding.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def encode(text, add_eos_token=True):
        """Tokenize ``text`` and add ``labels`` equal to the input ids."""
        encoded = tokenizer(
            text,
            truncation=True,
            max_length=cutoff_len,
            padding=False,
            return_tensors=None,
        )
        token_ids = encoded["input_ids"]
        # Append EOS only when it is missing and there is still room below the cutoff.
        if token_ids[-1] != tokenizer.eos_token_id and len(token_ids) < cutoff_len and add_eos_token:
            token_ids.append(tokenizer.eos_token_id)
            encoded["attention_mask"].append(1)

        encoded["labels"] = encoded["input_ids"].copy()
        return encoded

    def encode_example(example):
        """Build the prompt for one dataset row and tokenize it."""
        return encode(generate_prompt(example))

    def build_seeded_mask_fn(custom_arg):
        """Return a mask function whose random seed is ``custom_arg``."""

        def mask_fn(base_layer, r):
            """
            Example of a custom sparse mask that takes its seed from a custom argument; it is otherwise similar to the
            random mask in src/peft/tuners/shira/mask_functions.py. For a pretrained weight of shape (m, n), a mask
            function must return exactly one binary (0/1) mask of shape (m, n) with r * (m + n) ones for linear
            layers, on the same device and with the same dtype as the base layer's weight.
            """
            weight = base_layer.weight
            weight_shape = weight.shape
            num_selected = r * (weight_shape[0] + weight_shape[1])

            generator = torch.Generator()
            generator.manual_seed(custom_arg)

            # Pick ``num_selected`` distinct flat positions and set them to one.
            positions = torch.randperm(weight.numel(), generator=generator)[:num_selected].to(weight.device)
            ones = torch.ones_like(positions.type(weight.dtype))
            flat_mask = torch.zeros_like(weight.view(1, -1))
            return flat_mask.scatter_(1, positions.unsqueeze(0), ones.unsqueeze(0)).view(weight_shape)

        return mask_fn

    config = ShiraConfig(
        r=shira_r,
        mask_type="custom" if use_custom_random_mask_function_with_custom_kwargs else "random",
        target_modules=shira_target_modules,
        task_type="CAUSAL_LM",
    )
    if use_custom_random_mask_function_with_custom_kwargs:
        config.mask_fn = build_seeded_mask_fn(120)

    model = get_peft_model(model, config)

    dataset = load_dataset(data_path)
    split = dataset["train"].train_test_split(test_size=val_set_size, shuffle=True, seed=42)
    train_data = split["train"].shuffle().map(encode_example)
    val_data = split["test"].shuffle().map(encode_example)

    training_arguments = transformers.TrainingArguments(
        per_device_train_batch_size=batch_size,
        warmup_steps=100,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        logging_steps=100,
        optim="adamw_torch",
        eval_strategy="steps",
        save_strategy="steps",
        eval_steps=eval_step,
        save_steps=save_step,
        output_dir=output_dir,
        save_total_limit=3,
        load_best_model_at_end=True,
        ddp_find_unused_parameters=False if multi_process else None,
    )
    collator = transformers.DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    )
    trainer = transformers.Trainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=val_data,
        args=training_arguments,
        data_collator=collator,
    )
    trainer.train()
    model.save_pretrained(output_dir)

    # Delete the model and load it again from the checkpoint.
    del model
    model = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)
    model = PeftModel.from_pretrained(model, output_dir)
device_map=args.device_map, + shira_r=args.shira_r, + shira_target_modules=args.shira_target_modules, + torch_dtype=args.torch_dtype, + seed=args.seed, + use_custom_random_mask_function_with_custom_kwargs=args.use_custom_random_mask_function_with_custom_kwargs, + ) diff --git a/peft/examples/stable_diffusion/convert_sd_adapter_to_peft.py b/peft/examples/stable_diffusion/convert_sd_adapter_to_peft.py new file mode 100644 index 0000000000000000000000000000000000000000..61712b1eea860347da08d6f30c7c51954f6e4194 --- /dev/null +++ b/peft/examples/stable_diffusion/convert_sd_adapter_to_peft.py @@ -0,0 +1,514 @@ +import argparse +import json +import logging +import os +from collections import Counter +from dataclasses import dataclass +from operator import attrgetter +from typing import Optional, Union + +import safetensors +import torch +import torch.nn as nn +from diffusers import UNet2DConditionModel +from transformers import CLIPTextModel + +from peft import LoHaConfig, LoKrConfig, LoraConfig, PeftType, get_peft_model, set_peft_model_state_dict +from peft.tuners.lokr.layer import factorization + + +# Default kohya_ss LoRA replacement modules +# https://github.com/kohya-ss/sd-scripts/blob/c924c47f374ac1b6e33e71f82948eb1853e2243f/networks/lora.py#L661 +UNET_TARGET_REPLACE_MODULE = ["Transformer2DModel", "Attention"] +UNET_TARGET_REPLACE_MODULE_CONV2D_3X3 = ["ResnetBlock2D", "Downsample2D", "Upsample2D"] +TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPMLP"] +PREFIX_UNET = "lora_unet" +PREFIX_TEXT_ENCODER = "lora_te" + + +@dataclass +class LoRAInfo: + kohya_key: str + peft_key: str + alpha: Optional[float] = None + rank: Optional[int] = None + lora_A: Optional[torch.Tensor] = None + lora_B: Optional[torch.Tensor] = None + + def peft_state_dict(self) -> dict[str, torch.Tensor]: + if self.lora_A is None or self.lora_B is None: + raise ValueError("At least one of lora_A or lora_B is None, they must both be provided") + return { + 
@dataclass
class LoHaInfo:
    """Data gathered for a single LoHa module while reading an adapter checkpoint."""

    kohya_key: str
    peft_key: str  # module path used to build the PEFT state-dict keys
    alpha: Optional[float] = None
    rank: Optional[int] = None
    # The four factor matrices are mandatory when exporting.
    hada_w1_a: Optional[torch.Tensor] = None
    hada_w1_b: Optional[torch.Tensor] = None
    hada_w2_a: Optional[torch.Tensor] = None
    hada_w2_b: Optional[torch.Tensor] = None
    # Optional pair: either both tensors are set or neither is.
    hada_t1: Optional[torch.Tensor] = None
    hada_t2: Optional[torch.Tensor] = None

    def peft_state_dict(self) -> dict[str, torch.Tensor]:
        """Return this module's tensors keyed by their PEFT state-dict names.

        Raises:
            ValueError: If any of the four ``hada_w*`` tensors is missing, or if only
                one of ``hada_t1``/``hada_t2`` is set.
        """
        prefix = f"base_model.model.{self.peft_key}"
        required = ("hada_w1_a", "hada_w1_b", "hada_w2_a", "hada_w2_b")

        if any(getattr(self, name) is None for name in required):
            raise ValueError(
                "At least one of hada_w1_a, hada_w1_b, hada_w2_a, hada_w2_b is missing, they all must be provided"
            )
        state_dict = {f"{prefix}.{name}": getattr(self, name) for name in required}

        has_t1 = self.hada_t1 is not None
        has_t2 = self.hada_t2 is not None
        if has_t1 != has_t2:
            raise ValueError("hada_t1 and hada_t2 must be either both present or not present at the same time")
        if has_t1:
            state_dict[f"{prefix}.hada_t1"] = self.hada_t1
            state_dict[f"{prefix}.hada_t2"] = self.hada_t2
        return state_dict
dict[str, torch.Tensor]: + if (self.lokr_w1 is None) and ((self.lokr_w1_a is None) or (self.lokr_w1_b is None)): + raise ValueError("Either lokr_w1 or both lokr_w1_a and lokr_w1_b should be provided") + + if (self.lokr_w2 is None) and ((self.lokr_w2_a is None) or (self.lokr_w2_b is None)): + raise ValueError("Either lokr_w2 or both lokr_w2_a and lokr_w2_b should be provided") + + state_dict = {} + + if self.lokr_w1 is not None: + state_dict[f"base_model.model.{self.peft_key}.lokr_w1"] = self.lokr_w1 + elif self.lokr_w1_a is not None: + state_dict[f"base_model.model.{self.peft_key}.lokr_w1_a"] = self.lokr_w1_a + state_dict[f"base_model.model.{self.peft_key}.lokr_w1_b"] = self.lokr_w1_b + + if self.lokr_w2 is not None: + state_dict[f"base_model.model.{self.peft_key}.lokr_w2"] = self.lokr_w2 + elif self.lokr_w2_a is not None: + state_dict[f"base_model.model.{self.peft_key}.lokr_w2_a"] = self.lokr_w2_a + state_dict[f"base_model.model.{self.peft_key}.lokr_w2_b"] = self.lokr_w2_b + + if self.lokr_t2 is not None: + state_dict[f"base_model.model.{self.peft_key}.lokr_t2"] = self.lokr_t2 + + return state_dict + + +def construct_peft_loraconfig(info: dict[str, LoRAInfo], **kwargs) -> LoraConfig: + """Constructs LoraConfig from data extracted from adapter checkpoint + + Args: + info (Dict[str, LoRAInfo]): Information extracted from adapter checkpoint + + Returns: + LoraConfig: config for constructing LoRA + """ + + # Unpack all ranks and alphas + ranks = {key: val.rank for key, val in info.items()} + alphas = {x[0]: x[1].alpha or x[1].rank for x in info.items()} + + # Determine which modules needs to be transformed + target_modules = sorted(info.keys()) + + # Determine most common rank and alpha + r = int(Counter(ranks.values()).most_common(1)[0][0]) + lora_alpha = Counter(alphas.values()).most_common(1)[0][0] + + # Determine which modules have different rank and alpha + rank_pattern = dict(sorted(filter(lambda x: x[1] != r, ranks.items()), key=lambda x: x[0])) + alpha_pattern 
def construct_peft_lohaconfig(info: dict[str, LoHaInfo], **kwargs) -> LoHaConfig:
    """Build a ``LoHaConfig`` that matches the modules found in an adapter checkpoint.

    Args:
        info (Dict[str, LoHaInfo]): Information extracted from the adapter checkpoint,
            keyed by module name.
        **kwargs: Accepted and ignored.

    Returns:
        LoHaConfig: config for constructing LoHa
    """
    # Per-module rank and alpha; a falsy alpha falls back to the module's rank.
    rank_by_module = {name: entry.rank for name, entry in info.items()}
    alpha_by_module = {name: entry.alpha or entry.rank for name, entry in info.items()}

    # The most frequent values become the config-wide defaults ...
    r = int(Counter(rank_by_module.values()).most_common(1)[0][0])
    alpha = Counter(alpha_by_module.values()).most_common(1)[0][0]

    # ... and every module that deviates is listed in a pattern, ordered by name.
    rank_pattern = {name: rank_by_module[name] for name in sorted(rank_by_module) if rank_by_module[name] != r}
    alpha_pattern = {
        name: alpha_by_module[name] for name in sorted(alpha_by_module) if alpha_by_module[name] != alpha
    }

    # One module carrying a hada_t1/hada_t2 tensor is enough to enable the effective conv2d decomposition.
    use_effective_conv2d = False
    for entry in info.values():
        if entry.hada_t1 is not None or entry.hada_t2 is not None:
            use_effective_conv2d = True
            break

    return LoHaConfig(
        r=r,
        alpha=alpha,
        target_modules=sorted(info),
        rank_dropout=0.0,
        module_dropout=0.0,
        init_weights=False,
        rank_pattern=rank_pattern,
        alpha_pattern=alpha_pattern,
        use_effective_conv2d=use_effective_conv2d,
    )
checkpoint + + Returns: + LoKrConfig: config for constructing LoKr + """ + + # Unpack all ranks and alphas + ranks = {x[0]: x[1].rank for x in info.items()} + alphas = {x[0]: x[1].alpha or x[1].rank for x in info.items()} + + # Determine which modules needs to be transformed + target_modules = sorted(info.keys()) + + # Determine most common rank and alpha + r = int(Counter(ranks.values()).most_common(1)[0][0]) + alpha = Counter(alphas.values()).most_common(1)[0][0] + + # Determine which modules have different rank and alpha + rank_pattern = dict(sorted(filter(lambda x: x[1] != r, ranks.items()), key=lambda x: x[0])) + alpha_pattern = dict(sorted(filter(lambda x: x[1] != alpha, alphas.items()), key=lambda x: x[0])) + + # Determine whether any of modules have effective conv2d decomposition + use_effective_conv2d = any((val.lokr_t2 is not None) for val in info.values()) + + # decompose_both should be enabled if any w1 matrix in any layer is decomposed into 2 + decompose_both = any((val.lokr_w1_a is not None and val.lokr_w1_b is not None) for val in info.values()) + + # Determining decompose factor is a bit tricky (but it is most often -1) + # Check that decompose_factor is equal to provided + for val in info.values(): + # Determine shape of first matrix + if val.lokr_w1 is not None: + w1_shape = tuple(val.lokr_w1.shape) + else: + w1_shape = (val.lokr_w1_a.shape[0], val.lokr_w1_b.shape[1]) + + # Determine shape of second matrix + if val.lokr_w2 is not None: + w2_shape = tuple(val.lokr_w2.shape[:2]) + elif val.lokr_t2 is not None: + w2_shape = (val.lokr_w2_a.shape[1], val.lokr_w2_b.shape[1]) + else: + # We may iterate over Conv2d layer, for which second item in shape is multiplied by ksize^2 + w2_shape = (val.lokr_w2_a.shape[0], val.lokr_w2_b.shape[1]) + + # We need to check, whether decompose_factor is really -1 or not + shape = (w1_shape[0], w2_shape[0]) + if factorization(shape[0] * shape[1], factor=-1) != shape: + raise ValueError("Cannot infer decompose_factor, 
probably it is not equal to -1") + + config = LoKrConfig( + r=r, + alpha=alpha, + target_modules=target_modules, + rank_dropout=0.0, + module_dropout=0.0, + init_weights=False, + rank_pattern=rank_pattern, + alpha_pattern=alpha_pattern, + use_effective_conv2d=use_effective_conv2d, + decompose_both=decompose_both, + decompose_factor=decompose_factor, + ) + + return config + + +def combine_peft_state_dict(info: dict[str, Union[LoRAInfo, LoHaInfo]]) -> dict[str, torch.Tensor]: + result = {} + for key_info in info.values(): + result.update(key_info.peft_state_dict()) + return result + + +def detect_adapter_type(keys: list[str]) -> PeftType: + # Detect type of adapter by keys + # Inspired by this: + # https://github.com/bmaltais/kohya_ss/blob/ed4e3b0239a40506de9a17e550e6cf2d0b867a4f/tools/lycoris_utils.py#L312 + for key in keys: + if "alpha" in key: + continue + elif any(x in key for x in ["lora_down", "lora_up"]): + # LoRA + return PeftType.LORA + elif any(x in key for x in ["hada_w1", "hada_w2", "hada_t1", "hada_t2"]): + # LoHa may have the following keys: + # hada_w1_a, hada_w1_b, hada_w2_a, hada_w2_b, hada_t1, hada_t2 + return PeftType.LOHA + elif any(x in key for x in ["lokr_w1", "lokr_w2", "lokr_t1", "lokr_t2"]): + # LoKr may have the following keys: + # lokr_w1, lokr_w2, lokr_w1_a, lokr_w1_b, lokr_w2_a, lokr_w2_b, lokr_t1, lokr_t2 + return PeftType.LOKR + elif "diff" in key: + raise ValueError("Currently full diff adapters are not implemented") + else: + raise ValueError("Unknown adapter type, probably not implemented") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--sd_checkpoint", default=None, type=str, required=True, help="SD checkpoint to use") + + parser.add_argument( + "--adapter_path", + default=None, + type=str, + required=True, + help="Path to downloaded adapter to convert", + ) + + parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output peft adapter.") + + 
parser.add_argument("--half", action="store_true", help="Save weights in half precision.") + parser.add_argument( + "--loha_conv2d_weights_fix", + action="store_true", + help="""LoHa checkpoints trained with lycoris-lora<=1.9.0 contain a bug described in this PR https://github.com/KohakuBlueleaf/LyCORIS/pull/115. + This option fixes this bug during weight conversion (replaces hada_t2 with hada_t1 for Conv2d 3x3 layers). + The output results may differ from webui, but in general, they should be better in terms of quality. + This option should be set to True in case the provided checkpoint has been trained with lycoris-lora version for which the mentioned PR wasn't merged. + This option should be set to False in case the provided checkpoint has been trained with lycoris-lora version for which the mentioned PR is merged or full compatibility with webui outputs is required.""", + ) + args = parser.parse_args() + + # Load all models that we need to add adapter to + text_encoder = CLIPTextModel.from_pretrained(args.sd_checkpoint, subfolder="text_encoder") + unet = UNet2DConditionModel.from_pretrained(args.sd_checkpoint, subfolder="unet") + + # Construct possible mapping from kohya keys to peft keys + models_keys = {} + for model, model_key, model_name in [ + (text_encoder, PREFIX_TEXT_ENCODER, "text_encoder"), + (unet, PREFIX_UNET, "unet"), + ]: + models_keys.update( + { + f"{model_key}.{peft_key}".replace(".", "_"): peft_key + for peft_key in (x[0] for x in model.named_modules()) + } + ) + + # Store conversion info (model_type -> peft_key -> LoRAInfo | LoHaInfo | LoKrInfo) + adapter_info: dict[str, dict[str, Union[LoRAInfo, LoHaInfo, LoKrInfo]]] = { + "text_encoder": {}, + "unet": {}, + } + + # Store decompose_factor for LoKr + decompose_factor = -1 + + # Open adapter checkpoint + with safetensors.safe_open(args.adapter_path, framework="pt", device="cpu") as f: + # Extract information about adapter structure + metadata = f.metadata() + + # It may be difficult to 
determine rank for LoKr adapters + # If checkpoint was trained with large rank it may not be utilized during weights creation at all + # So we need to get it from checkpoint metadata (along with decompose_factor) + rank, conv_rank = None, None + if metadata is not None: + rank = metadata.get("ss_network_dim", None) + rank = int(rank) if rank else None + if "ss_network_args" in metadata: + network_args = json.loads(metadata["ss_network_args"]) + conv_rank = network_args.get("conv_dim", None) + conv_rank = int(conv_rank) if conv_rank else rank + decompose_factor = network_args.get("factor", -1) + decompose_factor = int(decompose_factor) + + # Detect adapter type based on keys + adapter_type = detect_adapter_type(f.keys()) + adapter_info_cls = { + PeftType.LORA: LoRAInfo, + PeftType.LOHA: LoHaInfo, + PeftType.LOKR: LoKrInfo, + }[adapter_type] + + # Iterate through available info and unpack all the values + for key in f.keys(): + kohya_key, kohya_type = key.split(".")[:2] + + # Find which model this key belongs to + if kohya_key.startswith(PREFIX_TEXT_ENCODER): + model_type, model = "text_encoder", text_encoder + elif kohya_key.startswith(PREFIX_UNET): + model_type, model = "unet", unet + else: + raise ValueError(f"Cannot determine model for key: {key}") + + # Find corresponding peft key + if kohya_key not in models_keys: + raise ValueError(f"Cannot find corresponding key for diffusers/transformers model: {kohya_key}") + peft_key = models_keys[kohya_key] + + # Retrieve corresponding layer of model + layer = attrgetter(peft_key)(model) + + # Create a corresponding adapter info + if peft_key not in adapter_info[model_type]: + adapter_info[model_type][peft_key] = adapter_info_cls(kohya_key=kohya_key, peft_key=peft_key) + + tensor = f.get_tensor(key) + if kohya_type == "alpha": + adapter_info[model_type][peft_key].alpha = tensor.item() + elif kohya_type == "lora_down": + adapter_info[model_type][peft_key].lora_A = tensor + adapter_info[model_type][peft_key].rank = 
tensor.shape[0] + elif kohya_type == "lora_up": + adapter_info[model_type][peft_key].lora_B = tensor + adapter_info[model_type][peft_key].rank = tensor.shape[1] + elif kohya_type == "hada_w1_a": + adapter_info[model_type][peft_key].hada_w1_a = tensor + elif kohya_type == "hada_w1_b": + adapter_info[model_type][peft_key].hada_w1_b = tensor + adapter_info[model_type][peft_key].rank = tensor.shape[0] + elif kohya_type == "hada_w2_a": + adapter_info[model_type][peft_key].hada_w2_a = tensor + elif kohya_type == "hada_w2_b": + adapter_info[model_type][peft_key].hada_w2_b = tensor + adapter_info[model_type][peft_key].rank = tensor.shape[0] + elif kohya_type in {"hada_t1", "hada_t2"}: + if args.loha_conv2d_weights_fix: + if kohya_type == "hada_t1": + # This code block fixes a bug that exists for some LoHa checkpoints + # that resulted in accidentally using hada_t1 weight instead of hada_t2, see + # https://github.com/KohakuBlueleaf/LyCORIS/pull/115 + adapter_info[model_type][peft_key].hada_t1 = tensor + adapter_info[model_type][peft_key].hada_t2 = tensor + adapter_info[model_type][peft_key].rank = tensor.shape[0] + else: + if kohya_type == "hada_t1": + adapter_info[model_type][peft_key].hada_t1 = tensor + adapter_info[model_type][peft_key].rank = tensor.shape[0] + elif kohya_type == "hada_t2": + adapter_info[model_type][peft_key].hada_t2 = tensor + adapter_info[model_type][peft_key].rank = tensor.shape[0] + elif kohya_type == "lokr_t2": + adapter_info[model_type][peft_key].lokr_t2 = tensor + adapter_info[model_type][peft_key].rank = tensor.shape[0] + elif kohya_type == "lokr_w1": + adapter_info[model_type][peft_key].lokr_w1 = tensor + if isinstance(layer, nn.Linear) or ( + isinstance(layer, nn.Conv2d) and tuple(layer.weight.shape[2:]) == (1, 1) + ): + adapter_info[model_type][peft_key].rank = rank + elif isinstance(layer, nn.Conv2d): + adapter_info[model_type][peft_key].rank = conv_rank + elif kohya_type == "lokr_w2": + adapter_info[model_type][peft_key].lokr_w2 = tensor + 
if isinstance(layer, nn.Linear) or ( + isinstance(layer, nn.Conv2d) and tuple(layer.weight.shape[2:]) == (1, 1) + ): + adapter_info[model_type][peft_key].rank = rank + elif isinstance(layer, nn.Conv2d): + adapter_info[model_type][peft_key].rank = conv_rank + elif kohya_type == "lokr_w1_a": + adapter_info[model_type][peft_key].lokr_w1_a = tensor + adapter_info[model_type][peft_key].rank = tensor.shape[1] + elif kohya_type == "lokr_w1_b": + adapter_info[model_type][peft_key].lokr_w1_b = tensor + adapter_info[model_type][peft_key].rank = tensor.shape[0] + elif kohya_type == "lokr_w2_a": + adapter_info[model_type][peft_key].lokr_w2_a = tensor + elif kohya_type == "lokr_w2_b": + adapter_info[model_type][peft_key].lokr_w2_b = tensor + else: + raise ValueError(f"Unknown weight name in key: {key} - {kohya_type}") + + # Get function which will create adapter config based on extracted info + construct_config_fn = { + PeftType.LORA: construct_peft_loraconfig, + PeftType.LOHA: construct_peft_lohaconfig, + PeftType.LOKR: construct_peft_lokrconfig, + }[adapter_type] + + # Process each model sequentially + for model, model_name in [(text_encoder, "text_encoder"), (unet, "unet")]: + # Skip model if no data was provided + if len(adapter_info[model_name]) == 0: + continue + + config = construct_config_fn(adapter_info[model_name], decompose_factor=decompose_factor) + + # Output warning for LoHa with use_effective_conv2d + if ( + isinstance(config, LoHaConfig) + and getattr(config, "use_effective_conv2d", False) + and args.loha_conv2d_weights_fix is False + ): + logging.warning( + 'lycoris-lora<=1.9.0 LoHa implementation contains a bug, which can be fixed with "--loha_conv2d_weights_fix".\n' + "For more info, please refer to https://github.com/huggingface/peft/pull/1021 and https://github.com/KohakuBlueleaf/LyCORIS/pull/115" + ) + + model = get_peft_model(model, config) + missing_keys, unexpected_keys = set_peft_model_state_dict( + model, 
combine_peft_state_dict(adapter_info[model_name]) + ) + if len(unexpected_keys) > 0: + raise ValueError(f"Unexpected keys {unexpected_keys} found during conversion") + + if args.half: + model.to(torch.float16) + + # Save model to disk + model.save_pretrained(os.path.join(args.dump_path, model_name)) diff --git a/peft/examples/stable_diffusion/inc_flux_lora_hpu.py b/peft/examples/stable_diffusion/inc_flux_lora_hpu.py new file mode 100644 index 0000000000000000000000000000000000000000..5c0b24928100c3756a3c7148a740dfeb48cbe9f2 --- /dev/null +++ b/peft/examples/stable_diffusion/inc_flux_lora_hpu.py @@ -0,0 +1,67 @@ +""" +This exampe demonstrates loading of LoRA adapter (via PEFT) into an FP8 INC-quantized FLUX model. + +More info on Intel Neural Compressor (INC) FP8 quantization is available at: +https://github.com/intel/neural-compressor/tree/master/examples/helloworld/fp8_example + +Requirements: +pip install optimum-habana sentencepiece neural-compressor[pt] peft +""" + +import importlib + +import torch +from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare + + +# Checks if HPU device is available +# Adapted from https://github.com/huggingface/accelerate/blob/b451956fd69a135efc283aadaa478f0d33fcbe6a/src/accelerate/utils/imports.py#L435 +def is_hpu_available(): + if ( + importlib.util.find_spec("habana_frameworks") is None + or importlib.util.find_spec("habana_frameworks.torch") is None + ): + return False + + import habana_frameworks.torch # noqa: F401 + + return hasattr(torch, "hpu") and torch.hpu.is_available() + + +# Ensure HPU device is available before proceeding +if is_hpu_available(): + from optimum.habana.diffusers import GaudiFluxPipeline +else: + raise RuntimeError("HPU device not found. 
This code requires Intel Gaudi device to run.") + +# Example: FLUX model inference on HPU via optimum-habana pipeline +hpu_configs = { + "use_habana": True, + "use_hpu_graphs": True, + "sdp_on_bf16": True, + "gaudi_config": "Habana/stable-diffusion", +} +pipe = GaudiFluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, **hpu_configs) +prompt = "A picture of sks dog in a bucket" + +# Quantize FLUX transformer to FP8 using INC (Intel Neural Compressor) +quant_configs = { + "mode": "AUTO", + "observer": "maxabs", + "scale_method": "maxabs_hw", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, + "dump_stats_path": "/tmp/hqt_output/measure", +} +config = FP8Config(**quant_configs) +pipe.transformer = prepare(pipe.transformer, config) +pipe(prompt) +finalize_calibration(pipe.transformer) +pipe.transformer = convert(pipe.transformer) + +# Load LoRA weights with PEFT +pipe.load_lora_weights("dsocek/lora-flux-dog", adapter_name="user_lora") + +# Run inference +image = pipe(prompt).images[0] +image.save("dog.png") diff --git a/peft/examples/stable_diffusion/train_dreambooth.py b/peft/examples/stable_diffusion/train_dreambooth.py new file mode 100644 index 0000000000000000000000000000000000000000..9af5fee335d0ce210bd85ae45c2185cd6cffa471 --- /dev/null +++ b/peft/examples/stable_diffusion/train_dreambooth.py @@ -0,0 +1,1275 @@ +import argparse +import gc +import hashlib +import itertools +import logging +import math +import os +import threading +import warnings +from pathlib import Path +from typing import Union + +import datasets +import diffusers +import numpy as np +import psutil +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import set_seed +from diffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + 
UNet2DConditionModel, +) +from diffusers.optimization import get_scheduler +from diffusers.utils import check_min_version +from diffusers.utils.import_utils import is_xformers_available +from huggingface_hub import HfApi +from PIL import Image +from torch.utils.data import Dataset +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PretrainedConfig + +from peft import LoHaConfig, LoKrConfig, LoraConfig, get_peft_model + + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. +check_min_version("0.10.0.dev0") + +logger = get_logger(__name__) + +UNET_TARGET_MODULES = [ + "to_q", + "to_k", + "to_v", + "proj", + "proj_in", + "proj_out", + "conv", + "conv1", + "conv2", + "conv_shortcut", + "to_out.0", + "time_emb_proj", + "ff.net.2", +] + +TEXT_ENCODER_TARGET_MODULES = ["fc1", "fc2", "q_proj", "k_proj", "v_proj", "out_proj"] + + +def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): + text_encoder_config = PretrainedConfig.from_pretrained( + pretrained_model_name_or_path, + subfolder="text_encoder", + revision=revision, + ) + model_class = text_encoder_config.architectures[0] + + if model_class == "CLIPTextModel": + from transformers import CLIPTextModel + + return CLIPTextModel + elif model_class == "RobertaSeriesModelWithTransformation": + from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation + + return RobertaSeriesModelWithTransformation + else: + raise ValueError(f"{model_class} is not supported.") + + +def create_unet_adapter_config(args: argparse.Namespace) -> Union[LoraConfig, LoHaConfig, LoKrConfig]: + if args.adapter == "full": + raise ValueError("Cannot create unet adapter config for full parameter") + + if args.adapter == "lora": + config = LoraConfig( + r=args.unet_r, + lora_alpha=args.unet_alpha, + target_modules=UNET_TARGET_MODULES, + lora_dropout=args.unet_dropout, + 
bias=args.unet_bias, + init_lora_weights=True, + ) + elif args.adapter == "loha": + config = LoHaConfig( + r=args.unet_r, + alpha=args.unet_alpha, + target_modules=UNET_TARGET_MODULES, + rank_dropout=args.unet_rank_dropout, + module_dropout=args.unet_module_dropout, + use_effective_conv2d=args.unet_use_effective_conv2d, + init_weights=True, + ) + elif args.adapter == "lokr": + config = LoKrConfig( + r=args.unet_r, + alpha=args.unet_alpha, + target_modules=UNET_TARGET_MODULES, + rank_dropout=args.unet_rank_dropout, + module_dropout=args.unet_module_dropout, + use_effective_conv2d=args.unet_use_effective_conv2d, + decompose_both=args.unet_decompose_both, + decompose_factor=args.unet_decompose_factor, + init_weights=True, + ) + else: + raise ValueError(f"Unknown adapter type {args.adapter}") + + return config + + +def create_text_encoder_adapter_config(args: argparse.Namespace) -> Union[LoraConfig, LoHaConfig, LoKrConfig]: + if args.adapter == "full": + raise ValueError("Cannot create text_encoder adapter config for full parameter") + + if args.adapter == "lora": + config = LoraConfig( + r=args.te_r, + lora_alpha=args.te_alpha, + target_modules=TEXT_ENCODER_TARGET_MODULES, + lora_dropout=args.te_dropout, + bias=args.te_bias, + init_lora_weights=True, + ) + elif args.adapter == "loha": + config = LoHaConfig( + r=args.te_r, + alpha=args.te_alpha, + target_modules=TEXT_ENCODER_TARGET_MODULES, + rank_dropout=args.te_rank_dropout, + module_dropout=args.te_module_dropout, + init_weights=True, + ) + elif args.adapter == "lokr": + config = LoKrConfig( + r=args.te_r, + alpha=args.te_alpha, + target_modules=TEXT_ENCODER_TARGET_MODULES, + rank_dropout=args.te_rank_dropout, + module_dropout=args.te_module_dropout, + decompose_both=args.te_decompose_both, + decompose_factor=args.te_decompose_factor, + init_weights=True, + ) + else: + raise ValueError(f"Unknown adapter type {args.adapter}") + + return config + + +def parse_args(input_args=None): + parser = 
argparse.ArgumentParser(description="Simple example of a training script.") + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--instance_data_dir", + type=str, + default=None, + required=True, + help="A folder containing the training data of instance images.", + ) + parser.add_argument( + "--class_data_dir", + type=str, + default=None, + required=False, + help="A folder containing the training data of class images.", + ) + parser.add_argument( + "--instance_prompt", + type=str, + default=None, + required=True, + help="The prompt with identifier specifying the instance", + ) + parser.add_argument( + "--class_prompt", + type=str, + default=None, + help="The prompt to specify images in the same class as provided instance images.", + ) + parser.add_argument( + "--with_prior_preservation", + default=False, + action="store_true", + help="Flag to add prior preservation loss.", + ) + parser.add_argument("--prior_loss_weight", type=float, default=1.0, help="The weight of prior preservation loss.") + parser.add_argument( + "--num_class_images", + type=int, + default=100, + help=( + "Minimal class images for prior preservation loss. If there are not enough images already present in" + " class_data_dir, additional images will be sampled with class_prompt." 
+ ), + ) + parser.add_argument( + "--validation_prompt", + type=str, + default=None, + help="A prompt that is used during validation to verify that the model is learning.", + ) + parser.add_argument( + "--num_validation_images", + type=int, + default=4, + help="Number of images that should be generated during validation with `validation_prompt`.", + ) + parser.add_argument( + "--validation_steps", + type=int, + default=100, + help=( + "Run dreambooth validation every X steps. Dreambooth validation consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`." + ), + ) + parser.add_argument( + "--output_dir", + type=str, + default="text-inversion-model", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", action="store_true", help="Whether to center crop images before resizing to resolution" + ) + parser.add_argument("--train_text_encoder", action="store_true", help="Whether to train the text encoder") + + parser.add_argument( + "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument( + "--sample_batch_size", type=int, default=4, help="Batch size (per device) for sampling images." + ) + parser.add_argument("--num_train_epochs", type=int, default=1) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. 
These checkpoints can be used both as final" + " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" + " training using `--resume_from_checkpoint`." + ), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' + ), + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-6, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of accelerators, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--lr_num_cycles", + type=int, + default=1, + help="Number of hard resets of the lr in cosine_with_restarts scheduler.", + ) + parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." 
+ ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' 
+ ), + ) + parser.add_argument( + "--wandb_key", + type=str, + default=None, + help=("If report to option is set to wandb, api-key for wandb used for login to wandb "), + ) + parser.add_argument( + "--wandb_project_name", + type=str, + default=None, + help=("If report to option is set to wandb, project name in wandb for log tracking "), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10. and an Nvidia Ampere GPU or Intel XPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--prior_generation_precision", + type=str, + default=None, + choices=["no", "fp32", "fp16", "bf16"], + help=( + "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10. and an Nvidia Ampere GPU or Intel XPU. Default to fp16 if an accelerator is available else fp32." + ), + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
+ ) + + # Adapter arguments + subparsers = parser.add_subparsers(dest="adapter") + + # Dummy subparser to train whole model + subparsers.add_parser("full", help="Train full model without adapters") + + # LoRA adapter + lora = subparsers.add_parser("lora", help="Use LoRA adapter") + lora.add_argument("--unet_r", type=int, default=8, help="LoRA rank for unet") + lora.add_argument("--unet_alpha", type=int, default=8, help="LoRA alpha for unet") + lora.add_argument("--unet_dropout", type=float, default=0.0, help="LoRA dropout probability for unet") + lora.add_argument( + "--unet_bias", + type=str, + default="none", + help="Bias type for LoRA. Can be 'none', 'all' or 'lora_only'", + ) + lora.add_argument( + "--te_r", type=int, default=8, help="LoRA rank for text_encoder, only used if `train_text_encoder` is True" + ) + lora.add_argument( + "--te_alpha", + type=int, + default=8, + help="LoRA alpha for text_encoder, only used if `train_text_encoder` is True", + ) + lora.add_argument( + "--te_dropout", + type=float, + default=0.0, + help="LoRA dropout probability for text_encoder, only used if `train_text_encoder` is True", + ) + lora.add_argument( + "--te_bias", + type=str, + default="none", + help="Bias type for LoRA. 
Can be 'none', 'all' or 'lora_only', only used if `train_text_encoder` is True", + ) + + # LoHa adapter + loha = subparsers.add_parser("loha", help="Use LoHa adapter") + loha.add_argument("--unet_r", type=int, default=8, help="LoHa rank for unet") + loha.add_argument("--unet_alpha", type=int, default=8, help="LoHa alpha for unet") + loha.add_argument("--unet_rank_dropout", type=float, default=0.0, help="LoHa rank_dropout probability for unet") + loha.add_argument( + "--unet_module_dropout", type=float, default=0.0, help="LoHa module_dropout probability for unet" + ) + loha.add_argument( + "--unet_use_effective_conv2d", + action="store_true", + help="Use parameter effective decomposition in unet for Conv2d 3x3 with ksize > 1", + ) + loha.add_argument( + "--te_r", type=int, default=8, help="LoHa rank for text_encoder, only used if `train_text_encoder` is True" + ) + loha.add_argument( + "--te_alpha", + type=int, + default=8, + help="LoHa alpha for text_encoder, only used if `train_text_encoder` is True", + ) + loha.add_argument( + "--te_rank_dropout", + type=float, + default=0.0, + help="LoHa rank_dropout probability for text_encoder, only used if `train_text_encoder` is True", + ) + loha.add_argument( + "--te_module_dropout", + type=float, + default=0.0, + help="LoHa module_dropout probability for text_encoder, only used if `train_text_encoder` is True", + ) + + # LoKr adapter + lokr = subparsers.add_parser("lokr", help="Use LoKr adapter") + lokr.add_argument("--unet_r", type=int, default=8, help="LoKr rank for unet") + lokr.add_argument("--unet_alpha", type=int, default=8, help="LoKr alpha for unet") + lokr.add_argument("--unet_rank_dropout", type=float, default=0.0, help="LoKr rank_dropout probability for unet") + lokr.add_argument( + "--unet_module_dropout", type=float, default=0.0, help="LoKr module_dropout probability for unet" + ) + lokr.add_argument( + "--unet_use_effective_conv2d", + action="store_true", + help="Use parameter effective decomposition in unet 
for Conv2d 3x3 with ksize > 1", + ) + lokr.add_argument( + "--unet_decompose_both", action="store_true", help="Decompose left matrix in kronecker product for unet" + ) + lokr.add_argument( + "--unet_decompose_factor", type=int, default=-1, help="Decompose factor in kronecker product for unet" + ) + lokr.add_argument( + "--te_r", type=int, default=8, help="LoKr rank for text_encoder, only used if `train_text_encoder` is True" + ) + lokr.add_argument( + "--te_alpha", + type=int, + default=8, + help="LoKr alpha for text_encoder, only used if `train_text_encoder` is True", + ) + lokr.add_argument( + "--te_rank_dropout", + type=float, + default=0.0, + help="LoKr rank_dropout probability for text_encoder, only used if `train_text_encoder` is True", + ) + lokr.add_argument( + "--te_module_dropout", + type=float, + default=0.0, + help="LoKr module_dropout probability for text_encoder, only used if `train_text_encoder` is True", + ) + lokr.add_argument( + "--te_decompose_both", + action="store_true", + help="Decompose left matrix in kronecker product for text_encoder, only used if `train_text_encoder` is True", + ) + lokr.add_argument( + "--te_decompose_factor", + type=int, + default=-1, + help="Decompose factor in kronecker product for text_encoder, only used if `train_text_encoder` is True", + ) + + if input_args is not None: + args = parser.parse_args(input_args) + else: + args = parser.parse_args() + + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + if args.with_prior_preservation: + if args.class_data_dir is None: + raise ValueError("You must specify a data directory for class images.") + if args.class_prompt is None: + raise ValueError("You must specify prompt for class images.") + else: + # logger is not available yet + if args.class_data_dir is not None: + warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") + if 
args.class_prompt is not None: + warnings.warn("You need not use --class_prompt without --with_prior_preservation.") + + return args + + +# Converting Bytes to Megabytes +def b2mb(x): + return int(x / 2**20) + + +# This context manager is used to track the peak memory usage of the process +class TorchTracemalloc: + def __enter__(self): + gc.collect() + self.device_type = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda" + self.device_module = getattr(torch, self.device_type, torch.cuda) + self.device_module.empty_cache() + self.device_module.reset_peak_memory_stats() # reset the peak gauge to zero + self.begin = self.device_module.memory_allocated() + self.process = psutil.Process() + + self.cpu_begin = self.cpu_mem_used() + self.peak_monitoring = True + peak_monitor_thread = threading.Thread(target=self.peak_monitor_func) + peak_monitor_thread.daemon = True + peak_monitor_thread.start() + return self + + def cpu_mem_used(self): + """get resident set size memory for the current process""" + return self.process.memory_info().rss + + def peak_monitor_func(self): + self.cpu_peak = -1 + + while True: + self.cpu_peak = max(self.cpu_mem_used(), self.cpu_peak) + + # can't sleep or will not catch the peak right (this comment is here on purpose) + # time.sleep(0.001) # 1msec + + if not self.peak_monitoring: + break + + def __exit__(self, *exc): + self.peak_monitoring = False + + gc.collect() + self.device_module.empty_cache() + self.end = self.device_module.memory_allocated() + self.peak = self.device_module.max_memory_allocated() + self.used = b2mb(self.end - self.begin) + self.peaked = b2mb(self.peak - self.begin) + + self.cpu_end = self.cpu_mem_used() + self.cpu_used = b2mb(self.cpu_end - self.cpu_begin) + self.cpu_peaked = b2mb(self.cpu_peak - self.cpu_begin) + # print(f"delta used/peak {self.used:4d}/{self.peaked:4d}") + + +class DreamBoothDataset(Dataset): + """ + A dataset to prepare the instance and class images with the 
prompts for fine-tuning the model. + It pre-processes the images and the tokenizes prompts. + """ + + def __init__( + self, + instance_data_root, + instance_prompt, + tokenizer, + class_data_root=None, + class_prompt=None, + size=512, + center_crop=False, + ): + self.size = size + self.center_crop = center_crop + self.tokenizer = tokenizer + + self.instance_data_root = Path(instance_data_root) + if not self.instance_data_root.exists(): + raise ValueError("Instance images root doesn't exists.") + + self.instance_images_path = list(Path(instance_data_root).iterdir()) + self.num_instance_images = len(self.instance_images_path) + self.instance_prompt = instance_prompt + self._length = self.num_instance_images + + if class_data_root is not None: + self.class_data_root = Path(class_data_root) + self.class_data_root.mkdir(parents=True, exist_ok=True) + self.class_images_path = list(self.class_data_root.iterdir()) + self.num_class_images = len(self.class_images_path) + self._length = max(self.num_class_images, self.num_instance_images) + self.class_prompt = class_prompt + else: + self.class_data_root = None + + self.image_transforms = transforms.Compose( + [ + transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def __len__(self): + return self._length + + def __getitem__(self, index): + example = {} + instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + example["instance_images"] = self.image_transforms(instance_image) + example["instance_prompt_ids"] = self.tokenizer( + self.instance_prompt, + truncation=True, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ).input_ids + + if self.class_data_root: + class_image = 
def collate_fn(examples, with_prior_preservation=False):
    """Assemble a list of per-sample dicts into a single training batch.

    Args:
        examples: Sequence of dicts holding ``"instance_prompt_ids"`` and ``"instance_images"``
            tensors, plus ``"class_prompt_ids"`` and ``"class_images"`` when prior preservation
            is enabled.
        with_prior_preservation: When true, the class samples are appended after all instance
            samples so both can go through the model in one forward pass.

    Returns:
        A dict with ``"input_ids"`` (prompt ids concatenated along dim 0) and ``"pixel_values"``
        (images stacked into one contiguous float tensor).
    """
    prompt_ids = []
    images = []
    for sample in examples:
        prompt_ids.append(sample["instance_prompt_ids"])
        images.append(sample["instance_images"])

    if with_prior_preservation:
        # Instance samples come first, class samples second; one pass instead of two.
        prompt_ids.extend(sample["class_prompt_ids"] for sample in examples)
        images.extend(sample["class_images"] for sample in examples)

    stacked_images = torch.stack(images)
    stacked_images = stacked_images.to(memory_format=torch.contiguous_format).float()

    return {
        "input_ids": torch.cat(prompt_ids, dim=0),
        "pixel_values": stacked_images,
    }
def _ensure_gitignore_entries(output_dir, patterns=("step_*", "epoch_*")):
    """Append any of ``patterns`` missing from ``output_dir/.gitignore``, keeping existing content."""
    os.makedirs(output_dir, exist_ok=True)
    gitignore_path = os.path.join(output_dir, ".gitignore")
    try:
        with open(gitignore_path, encoding="utf-8") as gitignore:
            current = gitignore.read()
    except FileNotFoundError:
        current = ""

    present = set(current.splitlines())
    missing = [pattern for pattern in patterns if pattern not in present]
    if not missing:
        return

    with open(gitignore_path, "a", encoding="utf-8") as gitignore:
        # Do not glue the first new pattern onto an unterminated last line.
        if current and not current.endswith("\n"):
            gitignore.write("\n")
        gitignore.writelines(f"{pattern}\n" for pattern in missing)


def _empty_device_cache():
    """Release cached accelerator memory on CUDA, or on XPU when this torch build exposes it."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        torch.xpu.empty_cache()


def main(args):
    """Run DreamBooth fine-tuning of the UNet (and optionally the text encoder).

    In order: create the ``Accelerator`` and configure logging, optionally generate class images
    for prior preservation, load tokenizer/scheduler/models, wrap trainable models with a PEFT
    adapter unless ``args.adapter == "full"``, build optimizer/dataloader/LR scheduler, run the
    training loop with periodic validation, then save the result and optionally upload it.

    Args:
        args: Parsed command-line namespace as produced by ``parse_args``.

    Raises:
        ValueError: If the text encoder is trained with gradient accumulation in a multi-process
            run, if xformers is requested but unavailable, or if the scheduler reports an unknown
            prediction type.
        ImportError: If ``--use_8bit_adam`` is set and ``bitsandbytes`` is not installed.
    """
    logging_dir = Path(args.output_dir, args.logging_dir)

    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
        log_with=args.report_to,
        project_dir=logging_dir,
    )
    if args.report_to == "wandb":
        import wandb

        wandb.login(key=args.wandb_key)
        wandb.init(project=args.wandb_project_name)
    # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
    # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
    # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
    if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
        raise ValueError(
            "Gradient accumulation is not supported when training the text encoder in distributed training. "
            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
        )

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state, main_process_only=False)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_warning()
        diffusers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
        diffusers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Generate class images if prior preservation is enabled.
    if args.with_prior_preservation:
        class_images_dir = Path(args.class_data_dir)
        if not class_images_dir.exists():
            class_images_dir.mkdir(parents=True)
        cur_class_images = len(list(class_images_dir.iterdir()))

        if cur_class_images < args.num_class_images:
            torch_dtype = torch.float16 if accelerator.device.type in ["cuda", "xpu"] else torch.float32
            if args.prior_generation_precision == "fp32":
                torch_dtype = torch.float32
            elif args.prior_generation_precision == "fp16":
                torch_dtype = torch.float16
            elif args.prior_generation_precision == "bf16":
                torch_dtype = torch.bfloat16
            pipeline = DiffusionPipeline.from_pretrained(
                args.pretrained_model_name_or_path,
                torch_dtype=torch_dtype,
                safety_checker=None,
                revision=args.revision,
            )
            pipeline.set_progress_bar_config(disable=True)

            num_new_images = args.num_class_images - cur_class_images
            logger.info(f"Number of class images to sample: {num_new_images}.")

            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)

            sample_dataloader = accelerator.prepare(sample_dataloader)
            pipeline.to(accelerator.device)

            for example in tqdm(
                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
            ):
                images = pipeline(example["prompt"]).images

                for i, image in enumerate(images):
                    # The content hash keeps file names unique across processes and re-runs.
                    hash_image = hashlib.sha1(image.tobytes()).hexdigest()
                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
                    image.save(image_filename)

            del pipeline

            _empty_device_cache()

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
            api = HfApi(token=args.hub_token)

            # Create repo (repo_name from args or inferred)
            repo_name = args.hub_model_id
            if repo_name is None:
                repo_name = Path(args.output_dir).absolute().name
            repo_id = api.create_repo(repo_name, exist_ok=True).repo_id

            # Creates output_dir if needed and appends only the missing ignore patterns.
            _ensure_gitignore_entries(args.output_dir)
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)

    # Load the tokenizer
    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False)
    elif args.pretrained_model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            args.pretrained_model_name_or_path,
            subfolder="tokenizer",
            revision=args.revision,
            use_fast=False,
        )

    # import correct text encoder class
    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)

    # Load scheduler and models
    noise_scheduler = DDPMScheduler(
        beta_start=0.00085,
        beta_end=0.012,
        beta_schedule="scaled_linear",
        num_train_timesteps=1000,
    )  # DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
    text_encoder = text_encoder_cls.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
    )
    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
    unet = UNet2DConditionModel.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
    )

    if args.adapter != "full":
        config = create_unet_adapter_config(args)
        unet = get_peft_model(unet, config)
        unet.print_trainable_parameters()
        print(unet)

    vae.requires_grad_(False)
    if not args.train_text_encoder:
        text_encoder.requires_grad_(False)
    elif args.adapter != "full":
        config = create_text_encoder_adapter_config(args)
        text_encoder = get_peft_model(text_encoder, config)
        text_encoder.print_trainable_parameters()
        print(text_encoder)

    if args.enable_xformers_memory_efficient_attention:
        if accelerator.device.type == "xpu":
            logger.warning("XPU hasn't support xformers yet, ignore it.")
        elif is_xformers_available():
            unet.enable_xformers_memory_efficient_attention()
        else:
            raise ValueError("xformers is not available. Make sure it is installed correctly")

    if args.gradient_checkpointing:
        unet.enable_gradient_checkpointing()
        # Text-encoder checkpointing is only enabled for full fine-tuning (no adapter wrapper).
        if args.train_text_encoder and args.adapter == "full":
            text_encoder.gradient_checkpointing_enable()

    # Enable TF32 for faster training on Ampere GPUs,
    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
    if args.allow_tf32 and torch.cuda.is_available():
        torch.backends.cuda.matmul.allow_tf32 = True

    if args.scale_lr:
        args.learning_rate = (
            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
        )

    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB accelerators
    if args.use_8bit_adam:
        try:
            import bitsandbytes as bnb
        except ImportError as err:
            raise ImportError(
                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
            ) from err

        optimizer_class = bnb.optim.AdamW8bit
    else:
        optimizer_class = torch.optim.AdamW

    # Optimizer creation
    params_to_optimize = (
        itertools.chain(unet.parameters(), text_encoder.parameters()) if args.train_text_encoder else unet.parameters()
    )
    optimizer = optimizer_class(
        params_to_optimize,
        lr=args.learning_rate,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        eps=args.adam_epsilon,
    )

    # Dataset and DataLoaders creation:
    train_dataset = DreamBoothDataset(
        instance_data_root=args.instance_data_dir,
        instance_prompt=args.instance_prompt,
        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
        class_prompt=args.class_prompt,
        tokenizer=tokenizer,
        size=args.resolution,
        center_crop=args.center_crop,
    )

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.train_batch_size,
        shuffle=True,
        collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
        num_workers=1,
    )

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True

    lr_scheduler = get_scheduler(
        args.lr_scheduler,
        optimizer=optimizer,
        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
        num_cycles=args.lr_num_cycles,
        power=args.lr_power,
    )

    # Prepare everything with our `accelerator`.
    if args.train_text_encoder:
        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
        )
    else:
        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
            unet, optimizer, train_dataloader, lr_scheduler
        )

    # For mixed precision training we cast the text_encoder and vae weights to half-precision
    # as these models are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    # Move vae and text_encoder to device and cast to weight_dtype
    vae.to(accelerator.device, dtype=weight_dtype)
    if not args.train_text_encoder:
        text_encoder.to(accelerator.device, dtype=weight_dtype)

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # We need to initialize the trackers we use, and also store our configuration.
    # The trackers initializes automatically on the main process.
    if accelerator.is_main_process:
        accelerator.init_trackers("dreambooth", config=vars(args))

    # Train!
    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len(train_dataset)}")
    logger.info(f" Num batches each epoch = {len(train_dataloader)}")
    logger.info(f" Num Epochs = {args.num_train_epochs}")
    logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
    logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Total optimization steps = {args.max_train_steps}")
    global_step = 0
    first_epoch = 0

    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        if args.resume_from_checkpoint != "latest":
            path = os.path.basename(args.resume_from_checkpoint)
        else:
            # Get the most recent checkpoint
            dirs = os.listdir(args.output_dir)
            dirs = [d for d in dirs if d.startswith("checkpoint")]
            dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
            path = dirs[-1]
        accelerator.print(f"Resuming from checkpoint {path}")
        accelerator.load_state(os.path.join(args.output_dir, path))
        global_step = int(path.split("-")[1])

        # `global_step` counts optimizer updates while `step` in the loop below counts dataloader
        # batches, so epochs are derived from updates and the in-epoch offset from batches.
        resume_global_step = global_step * args.gradient_accumulation_steps
        first_epoch = global_step // num_update_steps_per_epoch
        resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
    progress_bar.set_description("Steps")

    for epoch in range(first_epoch, args.num_train_epochs):
        unet.train()
        if args.train_text_encoder:
            text_encoder.train()
        with TorchTracemalloc() as tracemalloc:
            for step, batch in enumerate(train_dataloader):
                # Skip steps until we reach the resumed step
                if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
                    if step % args.gradient_accumulation_steps == 0:
                        progress_bar.update(1)
                        if args.report_to == "wandb":
                            accelerator.print(progress_bar)
                    continue

                with accelerator.accumulate(unet):
                    # Convert images to latent space
                    latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
                    latents = latents * 0.18215

                    # Sample noise that we'll add to the latents
                    noise = torch.randn_like(latents)
                    bsz = latents.shape[0]
                    # Sample a random timestep for each image
                    timesteps = torch.randint(
                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
                    )
                    timesteps = timesteps.long()

                    # Add noise to the latents according to the noise magnitude at each timestep
                    # (this is the forward diffusion process)
                    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

                    # Get the text embedding for conditioning
                    encoder_hidden_states = text_encoder(batch["input_ids"])[0]

                    # Predict the noise residual
                    model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

                    # Get the target for loss depending on the prediction type
                    if noise_scheduler.config.prediction_type == "epsilon":
                        target = noise
                    elif noise_scheduler.config.prediction_type == "v_prediction":
                        target = noise_scheduler.get_velocity(latents, noise, timesteps)
                    else:
                        raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")

                    if args.with_prior_preservation:
                        # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
                        model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
                        target, target_prior = torch.chunk(target, 2, dim=0)

                        # Compute instance loss
                        loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")

                        # Compute prior loss
                        prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")

                        # Add the prior loss to the instance loss.
                        loss = loss + args.prior_loss_weight * prior_loss
                    else:
                        loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")

                    accelerator.backward(loss)
                    if accelerator.sync_gradients:
                        params_to_clip = (
                            itertools.chain(unet.parameters(), text_encoder.parameters())
                            if args.train_text_encoder
                            else unet.parameters()
                        )
                        accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad()

                # Checks if the accelerator has performed an optimization step behind the scenes
                if accelerator.sync_gradients:
                    progress_bar.update(1)
                    if args.report_to == "wandb":
                        accelerator.print(progress_bar)
                    global_step += 1

                    # if global_step % args.checkpointing_steps == 0:
                    #     if accelerator.is_main_process:
                    #         save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
                    #         accelerator.save_state(save_path)
                    #         logger.info(f"Saved state to {save_path}")

                logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
                progress_bar.set_postfix(**logs)
                accelerator.log(logs, step=global_step)

                if (
                    args.validation_prompt is not None
                    and (step + num_update_steps_per_epoch * epoch) % args.validation_steps == 0
                ):
                    logger.info(
                        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
                        f" {args.validation_prompt}."
                    )
                    # create pipeline
                    pipeline = DiffusionPipeline.from_pretrained(
                        args.pretrained_model_name_or_path,
                        safety_checker=None,
                        revision=args.revision,
                    )
                    # set `keep_fp32_wrapper` to True because we do not want to remove
                    # mixed precision hooks while we are still training
                    pipeline.unet = accelerator.unwrap_model(unet, keep_fp32_wrapper=True)
                    pipeline.text_encoder = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True)
                    pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
                    pipeline = pipeline.to(accelerator.device)
                    pipeline.set_progress_bar_config(disable=True)

                    # Set evaluation mode
                    pipeline.unet.eval()
                    pipeline.text_encoder.eval()

                    # run inference
                    if args.seed is not None:
                        generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
                    else:
                        generator = None
                    images = []
                    for _ in range(args.num_validation_images):
                        image = pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
                        images.append(image)

                    for tracker in accelerator.trackers:
                        if tracker.name == "tensorboard":
                            np_images = np.stack([np.asarray(img) for img in images])
                            tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
                        if tracker.name == "wandb":
                            import wandb

                            tracker.log(
                                {
                                    "validation": [
                                        wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
                                        for i, image in enumerate(images)
                                    ]
                                }
                            )

                    # Restore training mode. The pipeline shares the live modules, so a frozen
                    # text encoder must stay in eval mode.
                    pipeline.unet.train()
                    if args.train_text_encoder:
                        pipeline.text_encoder.train()

                    del pipeline
                    _empty_device_cache()

                if global_step >= args.max_train_steps:
                    break
        # Printing the accelerator memory usage details such as allocated memory, peak memory, and total memory usage
        accelerator.print(
            f"{accelerator.device.type.upper()} Memory before entering the train : {b2mb(tracemalloc.begin)}"
        )
        accelerator.print(
            f"{accelerator.device.type.upper()} Memory consumed at the end of the train (end-begin): {tracemalloc.used}"
        )
        accelerator.print(
            f"{accelerator.device.type.upper()} Peak Memory consumed during the train (max-begin): {tracemalloc.peaked}"
        )
        accelerator.print(
            f"{accelerator.device.type.upper()} Total Peak Memory consumed during the train (max): {tracemalloc.peaked + b2mb(tracemalloc.begin)}"
        )

        accelerator.print(f"CPU Memory before entering the train : {b2mb(tracemalloc.cpu_begin)}")
        accelerator.print(f"CPU Memory consumed at the end of the train (end-begin): {tracemalloc.cpu_used}")
        accelerator.print(f"CPU Peak Memory consumed during the train (max-begin): {tracemalloc.cpu_peaked}")
        accelerator.print(
            f"CPU Total Peak Memory consumed during the train (max): {tracemalloc.cpu_peaked + b2mb(tracemalloc.cpu_begin)}"
        )

    # Create the pipeline using the trained modules and save it.
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        if args.adapter != "full":
            # Adapter runs save only the PEFT weights of each trained model.
            unwrapped_unet = accelerator.unwrap_model(unet)
            unwrapped_unet.save_pretrained(
                os.path.join(args.output_dir, "unet"), state_dict=accelerator.get_state_dict(unet)
            )
            if args.train_text_encoder:
                unwrapped_text_encoder = accelerator.unwrap_model(text_encoder)
                unwrapped_text_encoder.save_pretrained(
                    os.path.join(args.output_dir, "text_encoder"),
                    state_dict=accelerator.get_state_dict(text_encoder),
                )
        else:
            pipeline = DiffusionPipeline.from_pretrained(
                args.pretrained_model_name_or_path,
                unet=accelerator.unwrap_model(unet),
                text_encoder=accelerator.unwrap_model(text_encoder),
                revision=args.revision,
            )
            pipeline.save_pretrained(args.output_dir)

        if args.push_to_hub:
            api.upload_folder(
                repo_id=repo_id,
                folder_path=args.output_dir,
                commit_message="End of training",
                run_as_future=True,
            )

    accelerator.end_training()


if __name__ == "__main__":
    args = parse_args()
    main(args)
a/peft/examples/token_classification/peft_lora_ner.ipynb b/peft/examples/token_classification/peft_lora_ner.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..e2a6c048daef2200a4c08f55d1c33ba2732a5ee2 --- /dev/null +++ b/peft/examples/token_classification/peft_lora_ner.ipynb @@ -0,0 +1,554 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Named Entity Recognition with Peft Model 🤗\n", + "\n", + "##### In this notebook, we will learn how to perform Named Entity Recognition(NER) on the CoNLL-2003 dataset using the Trainer class\n", + "\n", + "##### This notebook has been adapted from the main NLP course here - https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt#fine-tuning-the-model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#install the required libraries\n", + "!pip install -q datasets evaluate transformers seqeval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import required libraries\n", + "from datasets import load_dataset\n", + "from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer, pipeline\n", + "from peft import get_peft_model, LoraConfig, TaskType\n", + "import evaluate\n", + "import numpy as np\n", + "from huggingface_hub import notebook_login" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n", + " num_rows: 14041\n", + " })\n", + " validation: Dataset({\n", + " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n", + " num_rows: 3250\n", + " })\n", + " test: Dataset({\n", + " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 
'ner_tags'],\n", + " num_rows: 3453\n", + " })\n", + "})\n" + ] + } + ], + "source": [ + "raw_datasets = load_dataset(\"conll2003\")\n", + "print(raw_datasets)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Look at the tokens of the first training example\n", + "raw_datasets[\"train\"][0][\"tokens\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[3, 0, 7, 0, 0, 0, 7, 0, 0]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Look at the NER tags of the first training example\n", + "raw_datasets[\"train\"][0][\"ner_tags\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Get the label names for the NER tags\n", + "ner_feature = raw_datasets[\"train\"].features[\"ner_tags\"]\n", + "label_names = ner_feature.feature.names\n", + "label_names" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EU rejects German call to boycott British lamb . 
\n", + "B-ORG O B-MISC O O O B-MISC O O \n" + ] + } + ], + "source": [ + "words = raw_datasets[\"train\"][0][\"tokens\"]\n", + "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", + "line1 = \"\"\n", + "line2 = \"\"\n", + "for word, label in zip(words, labels):\n", + " full_label = label_names[label]\n", + " max_length = max(len(word), len(full_label))\n", + " line1 += word + \" \" * (max_length - len(word) + 1)\n", + " line2 += full_label + \" \" * (max_length - len(full_label) + 1)\n", + "\n", + "print(line1)\n", + "print(line2)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the tokenizer\n", + "model_checkpoint = \"bert-base-cased\"\n", + "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['[CLS]',\n", + " 'EU',\n", + " 'rejects',\n", + " 'German',\n", + " 'call',\n", + " 'to',\n", + " 'boycott',\n", + " 'British',\n", + " 'la',\n", + " '##mb',\n", + " '.',\n", + " '[SEP]']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Tokenize the first training example\n", + "inputs = tokenizer(raw_datasets[\"train\"][0][\"tokens\"], is_split_into_words=True)\n", + "inputs.tokens()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def align_labels_with_tokens(labels, word_ids):\n", + " new_labels = []\n", + " current_word = None\n", + " for word_id in word_ids:\n", + " if word_id != current_word:\n", + " # Start of a new word!\n", + " current_word = word_id\n", + " label = -100 if word_id is None else labels[word_id]\n", + " new_labels.append(label)\n", + " elif word_id is None:\n", + " # Special token\n", + " new_labels.append(-100)\n", + " else:\n", + " # Same word as previous token\n", + " label = labels[word_id]\n", + " # If the 
label is B-XXX we change it to I-XXX\n", + " if label % 2 == 1:\n", + " label += 1\n", + " new_labels.append(label)\n", + "\n", + " return new_labels" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[3, 0, 7, 0, 0, 0, 7, 0, 0]\n", + "[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]\n" + ] + } + ], + "source": [ + "labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n", + "word_ids = inputs.word_ids()\n", + "print(labels)\n", + "print(align_labels_with_tokens(labels, word_ids))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def tokenize_and_align_labels(examples):\n", + " tokenized_inputs = tokenizer(\n", + " examples[\"tokens\"], truncation=True, is_split_into_words=True\n", + " )\n", + " all_labels = examples[\"ner_tags\"]\n", + " new_labels = []\n", + " for i, labels in enumerate(all_labels):\n", + " word_ids = tokenized_inputs.word_ids(i)\n", + " new_labels.append(align_labels_with_tokens(labels, word_ids))\n", + "\n", + " tokenized_inputs[\"labels\"] = new_labels\n", + " return tokenized_inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "tokenized_datasets = raw_datasets.map(\n", + " tokenize_and_align_labels,\n", + " batched=True,\n", + " remove_columns=raw_datasets[\"train\"].column_names,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]\n", + "[-100, 1, 2, -100]\n" + ] + } + ], + "source": [ + "for i in range(2):\n", + " print(tokenized_datasets[\"train\"][i][\"labels\"])" + ] + 
}, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "metric = evaluate.load(\"seqeval\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# Create label mappings\n", + "id2label = {i: label for i, label in enumerate(label_names)}\n", + "label2id = {v: k for k, v in id2label.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the pre-trained model\n", + "model = AutoModelForTokenClassification.from_pretrained(\n", + " model_checkpoint,\n", + " id2label=id2label,\n", + " label2id=label2id,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.config.num_labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configure LoRA (Low-Rank Adaptation) for fine-tuning\n", + "peft_config = LoraConfig(target_modules = [\"query\", \"key\"], task_type = TaskType.TOKEN_CLS)\n", + "\n", + "model = get_peft_model(model, peft_config)\n", + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_metrics(eval_preds):\n", + " logits, labels = eval_preds\n", + " predictions = np.argmax(logits, axis=-1)\n", + "\n", + " # Remove ignored index (special tokens) and convert to labels\n", + " true_labels = [[label_names[l] for l in label if l != -100] for label in labels]\n", + " true_predictions = [\n", + " [label_names[p] for (p, l) in zip(prediction, label) if l != -100]\n", + " for prediction, 
label in zip(predictions, labels)\n", + " ]\n", + " all_metrics = metric.compute(predictions=true_predictions, references=true_labels)\n", + " return {\n", + " \"precision\": all_metrics[\"overall_precision\"],\n", + " \"recall\": all_metrics[\"overall_recall\"],\n", + " \"f1\": all_metrics[\"overall_f1\"],\n", + " \"accuracy\": all_metrics[\"overall_accuracy\"],\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "notebook_login()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "args = TrainingArguments(\n", + " \"bert-finetuned-ner-lora\",\n", + " eval_strategy=\"epoch\",\n", + " per_device_train_batch_size=32, # decrease this for OOM error\n", + " per_device_eval_batch_size=64,\n", + " save_strategy=\"epoch\",\n", + " learning_rate=2e-3,\n", + " num_train_epochs=5,\n", + " weight_decay=0.01,\n", + " load_best_model_at_end=True,\n", + " do_eval=True,\n", + " do_predict=True,\n", + " metric_for_best_model=\"accuracy\",\n", + " label_names=[\"labels\"],\n", + " push_to_hub=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trainer = Trainer(\n", + " model=model,\n", + " args=args,\n", + " train_dataset=tokenized_datasets[\"train\"],\n", + " eval_dataset=tokenized_datasets[\"validation\"],\n", + " data_collator=data_collator,\n", + " processing_class=tokenizer,\n", + " compute_metrics=compute_metrics\n", + ")\n", + "trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it 
for predictions and inference.\n", + "Device set to use xpu:0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "entity_idx: 0, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n", + "entity_idx: 0, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n", + "entity_idx: 0, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n", + "entity_idx: 1, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n", + "entity_idx: 2, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n", + "entity_idx: 0, id2label: {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}\n" + ] + }, + { + "data": { + "text/plain": [ + "[{'entity_group': 'PER',\n", + " 'score': 0.9702984,\n", + " 'word': 'Jino',\n", + " 'start': 11,\n", + " 'end': 15}]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from peft import PeftModel\n", + "\n", + "# Replace this with your own checkpoint\n", + "lora_checkpoint = \"./bert-finetuned-ner-lora\"\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n", + "base_model = AutoModelForTokenClassification.from_pretrained(\n", + " model_checkpoint,\n", + " id2label=id2label,\n", + " label2id=label2id,\n", + ")\n", + "lora_model = PeftModel.from_pretrained(base_model, lora_checkpoint)\n", + "token_classifier = pipeline(\n", + " \"token-classification\", model=lora_model, tokenizer=tokenizer, aggregation_strategy=\"simple\"\n", + ")\n", + "\n", + "token_classifier(\"My name is Jino.\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + 
"name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/peft/examples/token_classification/peft_lora_token_cls.ipynb b/peft/examples/token_classification/peft_lora_token_cls.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..0c6c686ae5b40f04fa4edb8aa5a31ac4ef253942 --- /dev/null +++ b/peft/examples/token_classification/peft_lora_token_cls.ipynb @@ -0,0 +1,1430 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ngqdEv0rP01q" + }, + "source": [ + "## Introduction\n", + "\n", + "In this notebook, we are going to fine-tune the LayoutLM model by Microsoft Research on the [FUNSD](https://guillaumejaume.github.io/FUNSD/) dataset, which is a collection of annotated form documents. The goal of our model is to learn the annotations of a number of labels (\"question\", \"answer\", \"header\" and \"other\") on those forms, such that it can be used to annotate unseen forms in the future.\n", + "\n", + "* Original LayoutLM paper: https://huggingface.co/papers/1912.13318\n", + "\n", + "* Original FUNSD paper: https://huggingface.co/papers/1905.13538\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6K4S2s33ebY0" + }, + "source": [ + "## Install libraries\n", + "\n", + "Currently you have to first install the `unilm` package, and then the `transformers` package (which updates the outdated `transformers` package that is included in the `unilm` package). The reason we also install the `unilm` package is because we need its preprocessing files. 
I've forked it, and removed some statements which introduced some issues." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "5cngOTr6SqEf", + "outputId": "6c7a2f76-682b-4f93-a3db-59ab010e5ffe" + }, + "outputs": [], + "source": [ + "! rm -r unilm\n", + "! git clone https://github.com/microsoft/unilm.git" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RGMkEG5aRB0D" + }, + "source": [ + "## Getting the data\n", + "\n", + "Here we download the data of the [FUNSD dataset](https://guillaumejaume.github.io/FUNSD/) from the web. This results in a directory called \"data\" being created, which has 2 subdirectories, one for training and one for testing. Each of those has 2 subdirectories in turn, one containing the images as PNG files and one containing the annotations in JSON format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DTFnEZagQm4v", + "outputId": "97ce03ba-a6bb-4444-8eba-77eceece44e0" + }, + "outputs": [], + "source": [ + "! wget https://guillaumejaume.github.io/FUNSD/dataset.zip\n", + "! unzip dataset.zip && mv dataset data && rm -rf dataset.zip __MACOSX" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UrNMR64LsJXm" + }, + "source": [ + "Let's take a look at a training example. For this, we are going to use PIL (Python Imaging Library)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "eG-eGcj3sNPs", + "outputId": "69ead0ea-15d6-4d5e-af61-a99a7533d31b" + }, + "outputs": [ + { + "data": { + "image/jpeg": "", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAvoAAAPoCAIAAAAgFM2mAAEAAElEQVR4AezdeaB91fw+cA2SIWODsRBCipSkJJmJQgNFmTIlVMgQZShziQyJZMxchkSGMoRKRBoQSUSSKaGMvxfvb8v+nXPv+dx7P+feu++9z/nj3nXWXnvttZ61z34/63m/19pXu1o+QSAIBIEgEASCQBAYKwKf+9zn/t2nz4pj7V0qCwJBIAgEgSAQBIJA7xAI3endkKRBQSAIBIEgEASCwHgRWPl73/veeGtMbdNFYKWVVtp7773vf//777DDDu94xztWXXXV9ddf/8c//vE73/nOpz3taZ///OcvueSSFVf8HzG98sort9hiixe96EUf+9jH/va3v62yyiprr732+eeff9xxxz31qU/dZJNNZLY2qNzpe+2117rrrnvQQQf9/e9/r0PXuMY1jj32WH+PPPLILbfc8upXv/qJJ57oax1Vw1ZbbSX9la98hRi5yy67fOtb3/rpT3+qtlZzEkEgCASBILAIEPj+97/fTMMi6M5kXViBMZvsWPKDQBAIAkEgCASBRYwADmDSe9lll3Un1TPur5nzueee+5a3vEUNYnce+MAHzriqsZ+48thrTIVBIAgEgSAQBILAgkBghRVW2H333cfY1C9/+ctFd8ZY51iq+p+LZCzVpZIgEASCQBAIAkEgCPQNgdCdvo1I2hMEgkAQCAJBIAiMGYHQnTEDmuqCQBAIAkEgCASBviEQutO3EUl7gkAQCAJBIAgEgTEjELozZkBTXRAIAkEgCASBINA3BEJ3+jYiaU8QCAJBIAgEgSAwZgRCd8YMaKoLAkEgCASBIBAE+oZA6E7fRiTtCQJBIAgEgSAQBMaMQOjOmAFNdUEgCASBIBAEgkDfEAjd6duIpD1BIAgEgSAQBILAmBEI3RkzoKkuCASBIBAEgkAQ6BsCoTt9G5G0JwgEgSAQBIJAEBgzAqE7YwY01QWBIBAEgkAQCAJ9QyB0p28jkvYEgSAQBIJAEAgCY0YgdGfMgKa6IBAEgkAQCAJBoG8IhO70bUTSniAQBIJAEAgCQWDMCITujBnQVBcEgkAQCAJBIAj0DYHQnb6NSNoTBIJAEAgCQSAIjBmB0J0xA5rqgkAQCAJBIAgEgb4hELrTtxFJe4JAEAgCQSAIBIExIxC6M2ZAU10QCAJBIAgEgSDQNwRCd/o2ImlPEAgCQSAIBIEgMGYEQnfGDGiqCwJBIAgEgSAQBPqGQOhO30Yk7QkCQSAIBIEgEATGjEDozpgBTXVBIAgEgSAQBIJA3xAI3enbiKQ9QSAIBIEgEASCwJgRCN0ZM6CpLggEgSAQBIJAEOgbAqE7fRuRtCcIBIEgEASCQBAYMwKhO2MGNNUFgSAQBIJAEAgCfUMgdKdvI5L2BIEgEASCQBAIAmNGIHRnzICmuiAQBIJAEAgCQaBvCITu9G1E0p4gEASCQBAIAkFgzAiE7owZ0FQXBIJAEAgCQSAI9A2B0J2+jUjaEwSCQBAIAkEgCIwZgdCdMQOa6oJAEAgCQSAIBIG+IRC607cRSXuCQBAIAkEgCASBMSMQujNmQFNdEAgCQSAIBIEg0DcEQnf6NiJpTxAIAkEgCASBIDBmBEJ3xgxoqgsCQSAIBIEgEAT6hkDoTt9GJO0JAkEgCASBIBAExoxA6M6YAU11QSAIBIEgEAS
CQN8QCN3p24ikPUEgCASBIBAEgsCYEQjdGTOgqS4IBIEgEASCQBDoGwKhO30bkbQnCASBIBAEgkAQGDMCoTtjBjTVBYEgEASCQBAIAn1DIHSnbyOS9gSBIBAEgkAQCAJjRiB0Z8yAprogEASCQBAIAksWgTPPPLOffQ/d6ee4pFVBIAgEgSAQBBYeAoceemg/Gx26089xSauCQBAIAkEgCASBsSEQujM2KFNREAgCQSAIBIEg0E8EQnf6OS5pVRAIAkEgCASBIDA2BEJ3xgZlKgoCQSAIBIEgEAT6iUDoTj/HJa0KAkEgCASBILDwENhtt9362ejQnX6OS1oVBIJAEAgCQWDhIXDve9+7n40O3ennuKRVQSAIBIEgEAQWHgKHHXZYPxsdutPPcUmrgkAQCAJBIAgsPATOOOOMfjY6dKef45JWBYEgEASCQBBYeAjst99+/Wx06E4/xyWtCgJBIAgEgSCw8BBYd911+9no0J1+jktaFQSCQBAIAkFg4SGw995797PRoTv9HJe0KggEgSAQBILAwkNg22237WejQ3f6OS5pVRAIAkEgCASBhYfA1ltv3c9Gh+70c1zSqiAQBIJAEAgCQWBsCITujA3KVBQEgkAQCAJBIAj0E4HQnX6OS1oVBIJAEAgCQSAIjA2B0J2xQZmKgkAQCAJBIAgEgX4iELrTz3FJq4JAEAgCQSAIBIGxIRC6MzYoU1EQCAJBIAgEgSDQTwRCd/o5LmlVEAgCQSAIBIEgMDYEQnfGBmUqCgJBIAgEgSAQBPqJQOhOP8clrQoCQSAIBIEgEATGhkDoztigTEVBIAgEgSAQBIJAPxEI3ennuKRVQSAIBIEgEASCwNgQCN0ZG5SpKAgEgSAQBIJAEOgnAqE7/RyXtCoIBIEgEASCQBAYGwKhO2ODMhUFgSAQBIJAEAgC/UQgdKef45JWBYEgEASCQBAIAmNDIHRnbFCmoiAQBIJAEAgCQaCfCITu9HNc0qogEASCQBAIAkFgbAiE7owNylQUBIJAEAgCQSAI9BOB0J1+jktaFQSCQBAIAkEgCIwNgdCdsUGZioJAEAgCQSAILHEE3vjGN/YTgdCdfo5LWhUEgkAQCAJBYOEhcJ3rXKefjQ7d6ee4pFVBIAgEgSAQBBYeApdffnk/Gx26089xSauCQBAIAkEgCCw8BL73ve/1s9Er97NZaVUQCALLj8BZZ531l7/8ZYUVVhiu6t///vfVr371u9zlLhMeHS6fnCAQBILAVBB4zGMec+CBB06l5ByXCd2ZY8BzuSAwKwj885//XGmllfCbN73pTdKuseKKK37sYx/7wx/+MNn1Vl111Uc96lHXvva1MZ4Xv/jFa621VqjPZFglPwgEgSkiYCo1xZJzXGyF3rZsjoHI5YLAgkPgH//4x5e+9CV/EZ399tvvRz/6kfTf//73zTffXI7uXPOa10R6JuuX3/4VV1xx+umnX3bZZUjP9a9//de//vXXu971VLLlllv6OtmJyQ8CQSAITIbAl7/85a233trRz33ucw984AMnKzb3+VF35h7zXDEILC8CGMmpp55KMf7iF78ojdysvPLKu+2220Me8hD8Zptttim6M5XLnHTSSRSgc889V21qwJachTDtueee22+//SqrrDKVSlImCASBINBzBKLu9HyA0rwg8P8hwF2F6HzkIx/5/Oc/L/hml112IdI8+MEP5pbiw0J6/r/SU/7yr3/9S9nTTjvtrW9967Wuda1jjjnmN7/5zR3ucIfnPve5t7zlLe9zn/tMuaYUDAJBYEkj0Ft1J3RnSd+X6fwCQuBvf/vb3nvv/clPfvKXv/zl2muvfeihh970pjfdeOONh4WcP/3pT7/61a/IPKJz3vnOdx5//PHXuMY1Wk85sJ7ylKc86EEPuvLKK1Elh9TWjlbiBz/4wdve9rYPfOADv/3tb294wxve8573VM8aa6wxUCxfg0AQCAIDCBCJ3/e+98nsmzMrdGdgpPI1CPQ
Rgc9+9rNvectbPvOZz2icsOJXvOIVw62k93zta1+zxxf559hjjx0uMGHO6quvzm9Fwnnc4x43UOCiiy56wAMecM4558jfdNNNtQH1GSiTr0EgCASBLgI87Pe///3lhO50YUk6CASBUQj89a9//e53v7vvvvvayuImN7nJK1/5yo022oioQ7ZxGu/V97///Ze//OXKyLn44ot///vfj6pu8mOCmtdZZx0V3uAGNxCwfNe73lXwsuLqFMj8xz/+caeddhLWI5PMI5B58ppyJAgEgSWNQG+dWTP09C/pwUzng8BcIbD//vsjH672pCc9Cc/oXlYQz2tf+9oPfehDFVzcPTSDNF7FgVUn3ute93roQx/6jGc8g8Prxv/9yOcRE8Fj8Zf4ZSLT3e52txlcJacEgSCw6BGw11c/+zjpItV+NjetCgJLBAGCzR577MGBtd1225199tkiaarjP/nJT7AN3nGEg4N8LFxnGNLjjjvOdak4HGR1CWHLLn344Yff/OY35+E6+uijL7300uETkxMEgsASR+ATn/hEPxFI7E4/xyWtWtII2E2HiMKFdOSRRz7xiU8sLIQVH3DAAR/+8IdJLFNEx0It0TZOFJVs9dYll1yCRfk6Yu/BgZrFQd/73vd23ebAEge94447nnDCCRxee+2116677jpwSr4GgSCwlBGwEuId73gHBBK7s5Rvg/Q9CCwbAYF+ts+xmw7xxnbs7YRnPvOZb37zm9vX0YnrXve6om1ufetbv+AFLyh5BnFBdCzX+vGPf8wLZrX5pz/96QsuuGB0PXVUODPG8+hHP1pCDvL0yEc+stxbVor19gXIU+laygSBIDBeBMyF+MHV2Te6czVPwHyCQBDoCQK2z8EedthhBzHI9sKpViEWXm41lT11RBwTXU488cQzzjhjmT0SrGOjnfvd735TfNjd9ra35cOqai1Q/9SnPmULZszMC5CXea0UCAJBYIkgIKCwHinoTq+6HLrTq+FIY5Y0AmiKnQMf8YhHkHYaEF/4whcoMVNkJEJ82olTTFh4JTZ5ivVvtdVWNuNpNVOJnGifeJHOLTOJIBAEljICt7rVrfpJdxKqPMXnfIoFgdlFQFCw8Bpc54Mf/GDtHCjUxjxJEM9UVjoIJcZ1nva0p023lauttprA5CnuqfOVr3xFsI4dDusqT3/6023rTCJ6+MMfLqZnupdO+SAQBILAnCEQujNnUOdCQWBSBE4++WTxv2Ji7KxTOyCLU0Z0dt55Z+rLpKd1DnitxAiuI3zHZsrqb2dIy6mwHoyHr93r06cShcPFZnPniiJSntiz++6789YL4mmVJxEEgkAQ6BsCoTt9G5G0ZykiICoZrXnve9+77rrrVv+tbsCBpojFs571rNvc5jYf/ehHP/7fD+rjjVd1LlFd7DPdyBvO73SnO9XmOt5HseGGG9pREMdCUxAd8TdeYkzmkTmVi4qbfsMb3lAl7XOo/qc+9anf+MY3pnJuygSBIBAE5gGBpexiTN+DQB8QeM5znuOX/653vas1hlgiCniKjwOe8vPPP9+2y93yQptRKBWW44l37EY3upECL3nJS2SSduoFWEKFavfke9zjHkiPQ5xT3XpGpC3yetOb3lRtxtWEPGuzLYIqJ3+DQBBYmggkdmfEYzOHgsDSRcD6poMPPpiW84QnPKFQEMRjiz/OrCmCYus/zxfRzVXeNjnWZ9lfxy7MfFVrrrmmfF4nUcbDFT75yU+mA3nP6De/+U3vzMJaNGYqS8BUxatFVbKCTJo7zIZAJCK+reGrJCcIBIGlg8B973vffnY2zqx+jktatSQQ8I6ql770pRxYz3/+86vDlm5yMImqmWL/H//4x+M3rbCAZWvFa2dCr9ZCXKxs9x4I5OmYY46RI7i4FZZAWXAjPix7NP/5z3/25lGFheZ0y4xOe59XBSnbz/CQQw4Re2Qn6NGn5GgQCAKLGIHubmG96mboTq+GI41ZWgiIm7FBzute9zr7AVbPxcR
MMTZZeZoKd9Iqq6zSUDv33HO9XYtfjItKMM0KK6xgL0F75FQBC9q9Z5TTCstpp0igOJiKxE9/+lN/11577e7R0Wn8zNbPVYbYs/nmmx9xxBEzflPp6GvlaBAIAv1HwBOmn40M3ennuKRVix8B/ibbCZJbuK70FgV59rOfbcvj5ey5JeV2weHD4r3yGlHqDpfWne985zve8Y5CfG55y1t6FdfXv/71Wv9V17rwwgtL9VlvvfXk8HBtsMEGU2+GUOWvfe1rVV788plnnmkDoamfnpJBIAgsJgQ222yzfnYndKef45JWLX4EDjvsMJGML3vZy8T86i1/Fqlm6t2m3FSU8cAp3Fv8Sj//+c+PPfZYAT21+PxVr3rVfvvth1H97Gc/4zvjXK98Dfjd737n/eeIkUDj5z73uWrDhKzP4ghziYHKJ/xKy/FOie985zuObrTRRvbgsX7+y1/+8oSFkxkEgsDiRsBbhPvZwdCdfo5LWrXIEbAgnJupiI6u0lcsIZ9iny0pJwvtv//+1JrhnXJsVEhiQXRwDn6lqtMWPpxclT7llFNEJSM6vnJ7cV1VuLHAnbYK3UvXhTDTgXAXq7eW2TBvRz/vvPMUU3jTTTelJ1kXtsyzUiAIBIEgMGcIhO7MGdS5UBD4HwL2yMEnROrUgnBvxaq4mf+VmChlcx0lcRGni3EWZdwEGJJMrajyF7nhyVp11VW/9a1veQcFN5Y3hpKC6ElEF7vvCOIRtiyOx9WF/jzjGc+wNw+e1K6JRSFV9hLkhrdSzEbPt7jFLUbzHtJUbVooUHH99ddHpC6++OJWYRJBIAgEgXlGwCQvnyAQBOYSAS+FsEsN/nHqqae6LiVGhM3oBwGqQZ4RiTyinZaCv/GNb6wC/FaWffn46q+vPnWovv7n2FWfdmhE5Q615WMTNhXNoldVDQQkZWrjn9F15mgQCAKLDIGTTjqpHhF9e0Vo1J0JH93JDAKziMBFF11ETfEucX4flxHbO3otg1hmuwi+/e1vv/3tbz+iWTvttJO1UVWA6sNTVs4yfwktxCRsyVFf29H/lPjv1xHVtkMCgJ73vOe1rwMJpI3vDHOST0zCz7zDy/bNA8XyNQgEgcWNwC9+8Yt+djB0p5/jklYtZgS85EH3LGiqTopWprNM1mFvs+JLsqFOvTd0smIj8kXVeGn5Nttsg2CNKLbMQ0iSV6DXHtATFqbufPWrX3VIa+udEnImLJnMIBAEFisCwgr72bXQnX6OS1q1aBEQ1fue97zHVjckEJ0UWzPi5ZqCdWwPKEZnxnCQW4Qbe7eD615wwQXCdEZQq6lc5fWvf/0LXvCCyUo2+ed2t7udMlT6yUomPwgEgSAwlwiE7swl2rlWELjaJz7xiR/96Ef2thEsDA5xPCNA2WuvvZaH61i6JSrIXj6uVarMW9/61sZIRlx39CFereY1GyjJZVYOLBv5WA/f3FsDxfI1CASBxYqAV+L0s2uhO/0cl7RqqSAgWnmyrnpqEGMmOzqVfPspiwrCPwTotPL247H7Tvs6s4Q3iVq6NXyumpGhyud9O+200xK+M4xScoLAIkYg2wwu4sFN14LANBAQ0mtNVr3WChXwIs8JT/bWz2m9u2q4Ensrv/jFL0aYrDyXbsvCubS8V2u4/EDOH/7wB2vjvUhrIL++3v3ud3/AAx4w4aHawBDB8g52S9NtYzhhsWQGgSCwKBHo7U/+f3O+RYl7OhUEeoWASBrBOrw8XuYwumH2yJlsHRaGZNOdinceUYnNBm2Zg7XYwNCuOe09D0550Yte9P73v3+yc+2PLEDHRsne5GWDn8mK2Vmn+3bSgWLojkq46g444ICBQ/kaBILAIkagNmfvYQdDd3o4KGnSokUA+RCY3F7q6VUPE/p66jUOwygQS+yAbJtBTOIJT3iCNzagRIiLpez0m275YjlyPvCBDxx44IF2F3zsYx/bLTCw9P3yyy9Xs1AbexI+7GEPE99j84xHP/rRlCHUqntiS9O
olnOpV6sqiSAQBBYNApaa9rMvoTv9HJe0akkggLX88Y9/HO6ql1h5tfhw/iWXXLLhhhtusskm++yzD+HES0Z/+MMf7rrrrt4A6r0QXf1GYA3usvrqq6vEdn/ilLvhOzK//e1vO7cuQQRChtRs9bi9eX75y1/Kt57LayW8raJeMVElB/62PZ27+SeccEJ7B3s3P+kgEASWAgK05H52M3Snn+OSVgWBiRHAMETkePmDV2J1S/A62VxHPE05zh/84Afzmnk7epVBQQbkH1uBeW+Xo5/61KdsYziwGJ4nS4XUHaHHI94d4SUYwx43vjCbKKrZYntbLS/nuvduH5MOAkEgCMwYgdCdGUOXE4PADBHwyqrRZ44oUIKK8J3yOnXrQWhQnPvf//7cW1ZF2dTY/oS27VlzzTV5wQbojhN/9atfiZj2ueKKK7r1qOFDH/qQV1LYyrmbP5zmdGuOue7Rgw8+2NeNN97YuzIGKu8WSzoIBIEgMGcIhO7MGdS5UBD4DwLUjmIDk8Hh1Z6vec1rJjta+YJ1rK7yhiwqzkBJ4srhhx/u7Z48WVjLPe95TzHFmI0gnpvf/Obdwvvttx932IDjic9r9913R5i4tywi65ZPOggEgSCwTAS+/vWvL7PMvBQI3ZkX2HPRJY0AUWR0/ycrcLOb3cyOxrZaXm+99X7/+9+LyLFBMzFmoDaPmz322IN/ijvpHve4hzXtXqflJecDoTarrbbawG6BNCH7ECrs5Q+2RRanfJe73GX0toSj3xs60LB8DQJBYNEjcOSRR/azjyv3s1lpVRAIAhMicNBBB62zzjqIiFhmr2jAYGzTbC3ViSee2C0vascuhUKh73Of+wg6Fkyz9tprb7HFFtaCtWKCnR//+Me/4x3v+MY3vlGZ0gKfaUt3u9vdvL5U/cSe9ddfv50ynBiO3VFGqPVwyeQEgSAQBOYRgdCdeQQ/lw4CM0GAACOC2PIHu/jY0+91r3vd0572tAG6o16MxwsorEIX2mzF+FlnnTX6JaOI1G677SY8ubtGfd11191zzz2n28rtt9/eKVaKfe5znxMbNN3TUz4IBIEgMHYE4swaO6SpMAgsAwE0ZRklRh62VMq2OtaHe/Gn92F5K5ZwH0HKw5v+2W5nq622outYh6XKAWcWl5kXmNf6LA6vHXfcUbxOl+s4BVWyv86I5ggJmuzoZZdd5tDAAvjJCic/CASBIDCrCITuzCq8qTwIDCJAYhFzM5g7ne+HHHJIt7iNCh/xiEf86U9/Ov7449daa63uIYvSqTXCfRw955xzBtQdtEmosjKCgU4++WT79Aht7p4+lXStOe+WtGH0hK/T6pZJOggEgSAwxwiE7swx4LlcELha21qQo2d4zbmImcneVFXYeQeFjwge/izvpZImwIjUsXOgxVYD+F500UVewC7i2DKugQhomhAaJGDZNoOEH74nL04nBfmoXP6hhx5qJflAhcv8akMgIdXLLJYCQSAIBIG5RCCxO3OJdq611BHAS4QYX3jhhQWE3ZDlCBnu4iIcZ99997VlTjezmxY7bFMc5OYGN7iBWOPa+YaDzMeycxLOwHvUrULnqxre7s8mgVZdeRkFdmUpFqdY21EQMfJOLvsWDqxd7zZjmemvfOUrylgatsySKRAEgkAQmG0EQndmG+HUHwT+hwAascsuu3hDp+CbER4fuwL+75yJUgQYL3w4//zz3/ve99q5mCCkFJ5017ve9ZWvfCX+hDBVpnz+JgqQJVcVwdPq05jb3va23m2+7bbbevl5y68EuWjCVVfdYnjSZLsI2rPn3e9+9/Wudz397Z6SdBAIAkFgXhAI3ZkX2HPRpYsAdmLLHPE3L3/5y2eMgk1xhCF7ZcTAjl6nnnoqWUg4jmVZba2WjQTf/va3Wy215ZZbtivSe0Qx86ZRm4a5DppilVYrPFnCGjGesu5RvjlL37s5SQeBIBAE+oBAYnf6MAppwxJC4CY3uYnecjz5y/Fkpfdw522lY83
UcH43R+COtVTdnErTjfineLu8RnT4aMvxViyXfvWrX33KKae0zEqQgmxRaJOegfzhr9bDD2SKlW7hPtQmlE6E0ECZfA0CQWARI+D9Nv3sXehOP8clrVq0CGyzzTYbbbRRvcEK3eHYGu6qOBvCyXD+QM5LXvISzixbDgo0FsHTPrZa9jJz74Kwq3KdUsvF20J0C87tQOhFEy984Qu1oZ1IeaLrHHvssYSfgWsNf+VKe8tb3jKQr4bK8YZRC9GL2w2UydcgMGcIEDsn87fOWRuW2oWszexplzn48wkCQWAuEaiN+2yZUxd90IMeNPx0sDDqjDPOmEqruMa4q4TRdD8ynXvSSSdVzfYbtE/P1ltv7St/k2geRy0Qu/zyy9tZ0s669NJLp3JRZax+H26296s7hMx5tYXYIM61KdaWYkFgNhD4zW9+Y/IwGzWnzskQMNWpJ4NdRicrMy/5id0ZfmInJwjMLgLPfvazKSsibMqTxet0wgkn+P13r2qJuDLefN7NnDAt5Bl9ee1rX8tthGFUPf6Kd7ZJ4Gc+8xl7Lnt3hG11HFUDL5XwZAIPokPaqTpFFluWZSkWTWjCqwxk2qdneMcdZSyM9xe1su2hLX+8iWLgxHwNAnOJgBflzuXlci0I9Bbz0J3cn0FgrhHACSxBp45QX6yNsikOqmG3wIF2eNPe05/+9I033nggf/ir/ZSt+rZ0fCBw2IsjxDJ7E1YJSAiQ2rCr6173ugPsig/L6VOPMn7f+96HkA20BKMS+CyzoqS93WKgQL4GgSCw6BGwj1c/+5jYnX6OS1q1mBGwX451UjYA/NKXvqSfa6yxxt577z1hh0X2DLOK4ZIiZgTrIE9WYJWEU2UsubrXve5l0z+Xo+XQdYQy2G+wy3UEF3ur+emnn26dV4u8Gb5EN4cbDt3p5kgTmWrNuZCdww8/XKd40AbK5GsQCAKLHoHl2axrdsHx4MsnCASBOUaAM4jvyZs7iTou/b3vfW/g/Q/tZy/gRlTN1Jv38Y9/3Hoo68xbDVxmwoBsmmypVFe/wU6QJH6uqVeupD0SJ9yPx74+VY+Nf1z6sMMOm1a1KRwEgsDiQKCFDPYtdudqiwPf9CIILDgEHvrQh6IFl1xySbV8xD43FdkzrQ6KOLZyyvscbGrsJZ3eimUXZuHDrigI2lpxX+1GOK06q/Amm2zSiFRLoG6ecQrYyZDMYwnYMcccM4PKc0oQCAILHQEzn3oyhO4s9KFM+4PAeBCoDW8sCK/qfvjDHzb2MJDgkJr6JUX8PPzhD/da0FqQIl6nLUevahGgqs2lt9tuO+8rnXrlfGHtRRPdRqJuVYm18fLf9ra3Tb3OlAwCQWAxIdBbdSexO92HdtJBYO4QsDyK+iKO2BItV7VR8lFHHTUhmRBuzNVlG+WpNK4kHFHDFBef61znOr/+9a+7J1ocLt+FvP/civH2vtJumQnT9vh5zGMegz8NHBV27XIyL774Yl3YY489rAUbKJOvQSAIBIH5RSArs+YX/1x9SSPgDZ2bbbYZMcYbOq2N8r5Pi8+5mQZAodPwee2www48RJSegaMDX+22bJ/A73znO4JsuLFsIfjd735XyLPpo5JPfvKTd9ppJ24sdZJ5CD/oi9ggrGignoGvRx99tLdo1WbQA4e8kdR2gjZzoxVxZvU3UHGg3fkaBILAUkJghXoILqUup69BoEcI2JzGCx8oIrVD8QUXXGCF1HnnnTdhE4Xd2Een++qrgWJOVxvlxmosbzhvR1ElIcwieLo1H3/88T/5yU+sXbcw/qUvfWkrPJywDmuy0KLnPve5teDciy8srV9nnXUsB2vbNw9XlZwgEAQWNwJ06NrRVOyOrbz609moO/0Zi7RkKSLA+0No8ZdG4l0z9l+n9AgHprgMw2FROikIcbHRzvBROV5i9YY3vMGegVxUL3vZy+SYz1iabvmVNOmFB82KdCIN4Yf8o0KRxa961asmrE2mkGftsf3PhAX4torrqLDeeiE
eKFxnQqySGQSCwPwiELozv/jn6ksdAZvTIAprrrmmEOODDz4YHOuttx5hxtcJoRHo46iXgIr18T7zgTLedSWA5txzz/3Yxz6G9DhK6TnnnHMsdJfmbzrttNMoSSKBHKUVIT2Wo0/2Zis79AgtOvvssweuUl+tv+Aak7armNhk2yjztU34ZokJT09mEAgCQWBOEVhMAeHpSxBYoAgceOCBfvZ77bVXLafCRZ761KeOfhAQUQ466CDeqG6XDz30UDvuWLhOtmn5xx13XLeqH/zgB3WIG+tb3/qWt4faZHlg3bjtmLGi7lkDaWE6rX7NdpRKNK39gdrpSQSBILCYEGivjunbQvSoOwOP8XwNAvOAgCVa4mOQFdHKFjp5lRXJRxDxu971rsla4/m43377UV+EHu+///6KXfva18ZyfvWrX/GX83nRbCy8slGygJ5uJd6CjpooTJXBeG5605vSZrw3FNOy5+Ef/vCHAw44APvx2tHuWQNp5VtO7ePsXGqTyrU//qwGThJBYKkh4LWsPe3yYiKV6UsQWLgICH+xUMtjglhSvcA/rKjyUq3Rzw4OKfSFU8zKKfHCvGAWXqmttsCZ7FwuM/sQupC3oHuNuXnYNttsoxJKz2SndPMRMlHJ1nypweV8ajNl7Iqvjdds4Q5EWh4EgsDyINC2dO+bupOVWd1neNJBYD4RsKaJMGPxFLWmAo21xguqvGuiu6JqdBMF63hTBMXFvoUjti5UideFWjROzvF60dF1TnbU6XSgDTbYwEY+ynh5BWVIQjzQu9/9bi2Z7MTkB4EgsFgRuPWtb+1Rpnd9W5kVurNYb7n0a0EiQHERxyO42AJ1b0qvFVi8UTYPxHv62aWHPexh9hXEyTTPO0o54AREP/KRj7RFkMxlqlP97FRaFQSCwMwQCN2ZGW45KwgsRQQsJt9nn30wHsu8a9c+ETkcVfxcQmR6iIjNDBEdgUfaJujHxhvieLwaff3117dRELdXD9ucJgWBIDAbCPSW7uQlErMx3KkzCCwXAvQSL73ykgc78ZSnSdyxzW8++tGPPvjBD16uqmfnZCSMwLPnnntSp2xauP3222sqD5dF7Bp80UUXzc5lU2sQCAK9Q8Dkp3dt+m+DQnf6OS5p1ZJGwPbHtv57z3veI+ZXNI+P12ZZnb7tttvaNJnSc8ghhxBOeoXRlVdeaWNoLfziF79oJ0N7Q9vVkEZlEyBNxYREXveqwWlMEAgCs4GA3Udno9ox1Lk8Adg5NwgEgVlF4MQTT7SkvH7nr3nNa7rXsp3g4YcfvsYaa1iZNYYHwdWuZvV4RRwvf202DWpNfeYzn1kV1nu7Wn4SQSAILEoEersyK/vuLP+zPTUEgdlCwKtntthiC5vZeHHE85//fKuoNtxwQ8LJDW94QzsjW3lue+XXvva13//+960h96pzWwhOtylWntszkPTiLaH2RLbfj20JPYWnW08rb7vn1VdfvX2tBK+cpg5k5msQCAJBYM4QyMqsOYM6FwoCM0fAsiyhPLWVjn38vGDL7oJeot6t0Z6B1nDxf3ltp7+115+/NsWpjb9simNTwTrFxjxOFwptzx6vRu/WY/dCGw/aj6ebOfU0FWrfffdVHmd6znOeI+wa1xGw7HVgU68kJYNAEFigCPQ2VDl0Z4HeUWn2UkTAFoJCYd7//vf/+te/tsAb6RElw1M+eumTUyybsjEgQaiIyDKx82auBz3oQV4xscyS3QJ3uMMdnvSkJ6E4Mr/61a/a8VlTZXqdlje0d0smHQSCwGJFwHOpXhicfXcW6xCnX0FgjhDwenP05b3vfW+955yn3CsdvD8L6eGQEuY8xXZYRYXWOPeDH/zgRz7yETsTCjcmID32sY9VwyWXXMLJdcopp0yxNrSGhFNue4+5HXbYwXIt0UVWmQkwmmIlKRYEgsBCR8A+FLzwehG6s9CHMu0PAr1A4MILL7QTz+mnn+5lW61BNunZcccdK/LGSi7kw9oofisFRDRjMFh
RhTY7KkZn4HVainF4HX300SqR5gKzjPzb3/52q3+yBI5Fwil31Ze+9CU8ieykYcWcJjsr+UEgCCw+BEJ3Ft+YpkdBYP4REJdDofG6BgE3uAuygp1ceuml1TIOLHpPayXe4x2ivgro8dbi1VZbTZqb6dWvfrVQZbtl0HK8+uqMM86wdPzYY49de+211e/96qKhxUFPGL/srCOOOMJGOy7kunjSN7/5TSXtrdzf9agNkSSCQBAYNwKhO+NGNPUFgSAwEQLYBi8V3uMg2tHd7QbLQYDkE3hskFNvMh+ogwIkstjOyHe6050++9nPtj2dhQoRbAYKq1A4US25smqM3+rkk0/217tOt9xyy4HC+RoEgsBSQKC3dCcL0ZfC7Zc+LiEELLMaWGk1rc6Ls7Gjj7Vdb3zjG21viPHQeOzpLLjnsMMO44xvtWFURx11VLmr6Dr4E6blJV9eHxGu01BKIggEgZ4gMJ4NynrSmTQjCASB5USAYIPHHHzwwZZXeFMp+mLXH14wL/v89Kc/7U0RVb8yfFjFdcQjb7755riOl3yhR21fxOVsSU4PAkEgCIwRgdCdMYKZqoLAIkGAz+vNb37zoYceat2WyB6vJtUxFMd+PMV46Drlw7I9j5ddnHfeebiOGOdx7cu8SHBMN4JAEOgNAqE7vRmKNCQI9AyBZz/72daWC+XhxsJ4KgzIzoRnnXVWhSG/4hWvcFSoEAcWAkQZ6lkP0pwgEASCwP8hELqTWyEIBIFJEbCXz/HHH7/eeusJ5bGVjnd4WYFl3RZmYw3X/vvvLxpapoDl6DqTgpgDQSAI9ACBhCr3YBDShCDQYwSK8ey999421LG5jkgda9dPOukk0o51W95l0bd3s/cYyzQtCASBeUMgdGfeoM+Fg8BCQcBLcD75yU/ax1m8zq677lrNRnoIPxtssMFC6UXaGQSCwFJGIM6spTz66XsQmAYCu+2224c//OH73Oc+PFk777yzRVjhOtOAL0WDQBCYVwSi7swr/Ll4EFhQCAjfud/97nf22WdvttlmtWPhgmp+GhsEgsDSRSB0Z+mOfXoeBGaAwPWvf/0ttthiBifmlCAQBILAPCIQZ9Y8gp9LB4EgEASCQBAIAnOBQOjOXKCcawSBIBAEgkAQWAoIvPOd7+xnN0N3+jkuaVUQCAJBIAgEgYWHwPC7hHvSh9CdngxEmhEEgkAQCAJBYMEjsNZaa/WzD6E7/RyXtCoIBIEgEASCwMJDYNttt+1no0N3+jkuaVUQCAJBIAgEgYWHgPcK97PRoTv9HJe0KggEgSAQBIJAEBgbAqE7Y4MyFQWBIBAEgkAQCAL9RCB0p5/jklYFgSAQBIJAEFh4CPz973/vZ6NDd/o5LmlVEAgCQSAIBIGFh8CznvWsfjY6dKef45JWBYEgEASCQBBYeAg8/OEP72ejQ3f6OS5pVRAIAkEgCASBhYfAFVdc0c9Gh+70c1zSqiAQBIJAEAgCCw+BTTbZpJ+NDt3p57ikVUEgCASBIBAEFh4C3/nOd/rZ6NCdfo5LWhUEgkAQCAJBYOEhcNppp/Wz0aE7/RyXtCoIBIEgEASCwMJD4Ne//nU/Gx26089xSauCQBAIAkEgCCw8BA499NB+Njp0p5/jklYFgSAQBIJAEFh4COSdWQtvzNLiIBAEgkAQCAJBYHEgEHVncYxjehEEgkAQCAJBYP4ReNnLXjb/jZioBaE7E6GSvCAQBIJAEAgCQWD6CPzsZz+b/klzcUbozlygnGsEgSAQBIJAEAgC84hA6M48gp9LB4EgEASCQBAIAnOBQOjOXKCcawSBIBAEgkAQCALziEDozjyCn0sHgSAQBIJAEFhUCDz3uc/tZ39Cd/o5LmlVEAgC40Hg3//+93gqSi1BIAhMAYE73vGOUyg1D0VWnodr5pJBIAj0CYFf/epXF1100Yor/mfy869//esmN7nJX/7ylz/84Q/Xv/71b3vb28r88Y9//Pvf/37DDTf87W9/+8tf/rK
VvOlNb3qjG93orLPOWmGFFerc9dZbb7XVVpNGMuT/7W9/k1511VXXX399Lw6U+c9//vOvf/2r8te85jWvfvWrq+Hcc891yhVXXPH3v//dBmXXvva1L7300kc/+tG77rrra17zGqdLf+tb33KuelZZZRVtU8mVV155yCGH3O1ud9t+++39ffvb3/7Zz3527733vsENbvC85z3v+c9/vl6o02edddbR5i9+8YsOqS2fIBAEliYCoTtLc9zT67lDAIFwMQbeR6J9bapDOyTHp8iEkl/60peOOeYYR+9whzs84xnPqBYr8IpXvAJBQReYfJ8nPelJn/jEJy677LKVV17ZuXLqkDQ+sdJKKz384Q8/+uijXZfIfJvb3Kbq6f5929veps6Wo9i3v/3tk046aauttvryl78sf8899zzhhBN+9KMfvfe97z3wwANbyRe/+MW77bbbJpts0nKe+tSnHn744b7+4x//eMADHnDxxRdLu+h555137LHHOtfMT4P14k1vetOFF17oKl//+tdf+tKX7rzzzpttthl+I/Ob3/zmmmuu+f73v/8pT3nKuuuuizbd6la3euQjH/nOd77zjDPO2H333e90pzshSW95y1u+8pWvAOeII45AiR784Aefeuqpr371q3Ey/MzR888//+lPfzoO9OQnP1l7WiOTCAJBYCki8J/naz5BYGkjQHj41Kc+RR74yEc+cuKJJ3bBYPiRCWa1MikQn/nMZ44//viPfexj8n/yk590Cw+njzvuOHIF/rHvvvs6+rvf/e7mN785MYNdRyboDdLK1InYAIpQ6T//+c9bbLFFPZLufe97t5q/8IUvOJEo4tDtbnc7ZT7+8Y/TPDbeeGOZ17ve9eqsO9/5zre+9a2Vka9H8l0Ib2j1dBP0Eh2k0yjzuc99zlefPfbYw+naRheRKI7yxz/+8cY3vjGVxdYamM1ee+2lnsc85jGbbropZF73utehKXJoKjvttJMa/vSnP334wx9GvF772tfKR1nuf//716VpRViU9OWXX64j3/3udwlIro57yXz84x/vol/96lc/8IEPSNTpmJD01772NZnve9/7KDpKIjoysaKf//znBksaF3zc4x7n0kiPr5jW0572tAsuuKCum79BIAjMKgKHHXaY352Pn/OsXmi6lf9Hc84nCCxxBBj7cqMceeSRG2ywwXve854C5PTTT0cC/G6xhwYRoYW9f/e7381LojA3UDvUTeArlA9iBmUFj6GyEBsUwBsQIMwDC9l88815fy655BL5XiN8y1veEjtplfz0pz91CVf/0Ic+1DIr8exnPxvtKBNem3ox7c961rPuda978R8RhPQIObjFLW5xz3vek4JCX6HTDFQy8JW+8qAHPahlvvCFL3RpT66HPexhEsVsHEXXsDe9e85znlOZeA9hxiGFi+5QWZyCdlRtsEJHUL1TTjll7bXX1veTTz4ZIK961asUwEv0GmtR5hvf+EadUnTnvve9b0lHRXcoWIbpHve4BxcbjadK0q4wPORJe84880zXhSf+Ss1CdHz19/vf/z5SVeXzNwgEgVlFgBbrd+fTN7rzH299PkFgiSOAJXCUeAQ88YlPxC1oA2QbmOAQfEO0CrynQcTcXuMa1xBZgm3gK7vssks71E2gHapCC9CI/fbbjwTCsaIA005EEcuy9dZbYyEYyRprrCH/05/+tArpE8hW1YP9YA/StJPKaX95qfiJTjvtNDnI02Mf+9gb3vCGIlp0wSHtV6eaMS3E4j73uc/tb397fWynT5jgC+Ouwrq6RxEaAk83p1xvSNvBBx/sWg5hP3gMhemZz3wmj1K3cKXREXj+5je/ufvd707CIaF973vfW2uttUTYKOC6cMYa1dPO5XsiVv3gBz+gM+lgeQDrKPBdt/ouR0mkkOTmPcxvfOMbsSg4bLTRRtpZ5RXg/FJJqzyJIBAEliACoTtLcNDT5QkQYESZSYIKTYKEwAYrtOWWWyINBx10EF3BV3o
Ju9vsqBw2nmjhxAlq/K8l3n///euQCjGnCYvJxI0IGDgEDxQ/F5I0WcnK5zkSiktT8ZX+xLPDL8auoztyjjrqKBEtEnxY+iWTWNJtdlUy8Fdf6E+EkMrXWpwMV+OBQptaRNETnvAEscY77rgjdYquo7CS6AXvlbTAYX8RFx8qS1WlRxLVNqKRqBqKFArYmoRQAuoFL3iBfuE9CqsTN/rgBz+IutFy5FdV8oUZiS7Cw37xi1/IVImO05zkEOeqWP4GgSAwXwi0h958NWCy64buTIZM8pcWAlQBphTbeMQjHkGDLS/SS17yEnaUfeVVET8r86Mf/Sgy1KDh1cIPqDgtp5tgicuf0s2cMP2ud72LssJ+YxKCcEUFTVhMzFCtdSJvoGWUmwmLtUy+NlG67evoBB2FM+t+97tfFUPsUBZESpTSXe9610ZNRCujUEJwqFPcTworeZ3rXEdk8UMf+lBRMnJcF0/icZM+55xzdMfiKd43X3n3hBLjdhiSr+3j6gQwIAOcDqSDKA66KeZJgj/OVT7/+c9jliBCubgXMVGxRBpJEoPG61//epU36imoWSyR+qlQfFvtQkkEgSAwqwhM8aE3q22YsPLQnQlhSeaSQ4CZJG+cffbZGI+/+k9yoGqw2TQDvhhUgN/qUY96FMPc0GGDpcsb1TJbAoWiT7SvlSDz8ENROLr5pXwgEAw86oBhdI9Wmr8GmSiVpchHXb2VdIjU1G1eO7TMhO4jGdgDN1NFBCMQNC21oReiguheP/zhDxEyoTZaK8jGSnXVUlaoOMKH+ZKsveJ94xCkEpGXLLkCph5ZWC7UhtSk/DbbbMOtBur6aoU5EonDKaCweSGiY0ma+GukU5gUpcpwUJIE8bzoRS9SAzeikuopEJQ3OuUX0wbMDBtTodcyC2BCs9SGSjoxnyAQBOYAgeGH3hxcdCqX+I9in08QCAJMOCJCHqAcWEqN1qAdMhlO9hsP2GeffcTDAoqBL7iwFuu5cBTLqYYBtGkNeiQQR+yL2BT8gLNGMcusbnazm6FQCISQ3gMOOABrsdTI1Vl0XidRKfIJS2oWvOIUtry2t6E/KSbCV6CP/He84x2WTdlmRrNFQ+MEGAndhY9JlLQC5A2KiIRFTMgBt5T08MdqL/RLVfjZXe5yFz3Fb6yoUhLHIp9UJVaEoUS0GfkuWplCgO1qQ+YhejkdX6FO6Sx/EwRQIiQMBxJdVNfVfvWLpNFmOQJ0hBg7F6pwwF18iEPWk2Ng4pddjmyjZhRHzBM25iwldQdn0jbeLkpPRVDpu09daDKFrI7mbxAIArOEAGX3la985SxVvjzVrlDTyuWpIucGgUWAANrBxHJLieoVb4vr8BzhHNwobDyrzOISaTEMv2RMRQFMiA1WrK1EGMDBj0t4MqohH5VRlbOk1UMrIp/wHBGQGG92nYiCpghkttkMk08x4lbDQjADEosyeINlTVY5KYA/yac/YRtvfvObnSXuR6tkigXmVCpqRd6wpY3auLTwAPxjoIX1FaXAS5zrqzZrJP5RkTdCghALopFDmuSK1B1pBUqekSYCIStipaXzCQLTRcD9g6k/8IEPRF6ne27K9xABfmr7SmgYgdaw9qeFoTv9GYu0ZN4QQFxs/YIoePJahEWNsMZHhCz+wU1jwZS4YIxHOIhFUhQUVp/24IPEOHFEuwUd26CPFkKesSKpNq1RnoOm6M6Ic3MoCCwFBES/0U1FyNm1cin0d9H3UWReOZdDdxb9WKeDCxIBfqJSOrlI6CjUFF8pOmPpDMJkNTglhsBjudNY6kwlQWBxIOCHZv5AIOSsXBw9WuK9sMUXRzwQ+kZ3EruzxO/MdP//EBDf2sUC4+l+Xc60HXRoPNxV4TrLiWROX3wI8JbGjbX4hrWHPQrd6eGgpEmLEIF61+Yi7FhvumQhmNkk20moE28uBquFOYuCEt4kBMqi9Ote97q1Qr43DU9DgsCiQsCb/vrZn9Cdfo5LWhUExoAA95n3Kgg
w4qGbrDpRz9ZVYQkK4Ae2ybF4imdB/LUtcMQnWWYlx+Y3iimsKp4+MdpWmzvLhzNCjp2LnWUtusVTVpWLbraird5BgX9YbiZq29Kwl7/85S5kOx+7AtYuzAKrRT2LibbpjgsNtFNsuLBrUdKioFzFJSy5srG1dgq+dmlB05/85CfJZl6wZXmaNlPmbI9koZldkSxhEyZlKRy6AwoLx2yhFLozAHK+BoExIlBxymOscGxVeV7kEwSCQBcBNlXwshx22sfXOsr2+8rwdwtXpvw6hUn+zzmdTztdgZZdmbbsq5dL4AeWjlvWrmZr1C2rRg5k2qqYwa7L2VTQV8XwDDnePCVtyRWW4F2bEkUyum2r96hjHupUWOS1ZdsCCauMl3H6KtTauZXZ3e/HArQqplqPG5vl2I+nXPKq2nbbbQeeQXbKsTTMzjfoi0OYjXeeW4OmEovSLb+3laJtoKtai9VREMVwlEMOOUSMFApSlxv4i6N4pbmSlphZlWaDAIgpoy8PechDrHoT4uq9Y9qGAOE3CJa1Y7iaRlZVmJm3WEjTeNSDvQ1cIl+DQBAYIwIiFP3QfKitY6x2+avKK0KXH8PUsLAREFVjzRRL743oesIAYxu2n7FNsIVUVmV7R2btrWe5lt+wXXbE4qE1Ctu/jhkmeDC0TpFj9ZZABKuyqQ5CLx1ijyufgRchJFMBNp7Jt0Scz0WdVpiTNxSuHQjL82XXYCTDSnWn89QQMCxfZ8gRFFvdoDtOca6Hi217qCzyMRuF62PzPaoJeiFmyOZ7mkTzcCE771UBvETviDHFJ1Qiv14iAYrG0vADegklxlE4qNM2YqiSq1u+LiyJZKKqer25MrUnEKzqKoQcjaxe2G+QL6neAF9vwNAkfBEsFJoqP/y3Jot2OPTOc5XrL04Domqh9XT6bi9BJ9qxkLtKAlA2/6iqkCENqO0NJUJ3hhFOThAYIwK9pTvjjMf0KMknCCwsBAgG1AIih51v6AEaTzthm1EEvIQ7hj9IJieLd4haXcmoc47QKmxFKJ+Yge540QHXDFsux27C3htqKbstfBwlLbDK8r03ijFmrW2Qo3I5lB5CUTli0AXCCfkHn9hjjz28cZM5Z7NtnMNs22zQ9hVqxkIwLRey1zD2Q95QDwOvjAtpbREgmT6UDzSCCkIW8hYFpG2HHXawS1ARLNsZoxFYgjXANmvGwFzaWauvvrq/uFFblcZzpEfaJh89ko/9oG4cTKiPr9bYQ6CVr5IK10e1WKMW+gpSaFiGc9XB/wCoNsgPnNUKSGAz/qrBNkVe8A58NEtOXdFmQqgYMOXoBSYETByo67HCI0GXV0mAKJ8gsGQRGPSUL1kg0vEliIBIFDGtDCQSQ5WRZoxZd1Cw5fw1PnQU+/hx0/D4oB2kAtxIPmLBU/PWt77VXy82dwphBo9hcX3FY+xViPQQjSxBt8UOboEtCTfxEUzDeFudbsNAtAZ1cCJfD5JB/FCVhEyyRw2KmrGusu7kGeyBROEQhclXYTG2Q6xNAqu8vyrnn7Lfj6qwBH4c+zsjKzxTPGgK8P6QgihS0mQhTKhoma/L/Ij55VarDcRMCjXemyI4jOpEQpeE/NH1iMXxulBsT1W2dcZRRpfXayBMWK09r4sSAQT3UgyH84bRqtCFbCBpo2ojawgmrGH0pXM0CASBRYBA1J1FMIjpwgwRIH5wOQmGZfsZfmG2VBx8BedgOKtSJMOnbCSRQCaygmFQg1h9XytTgtNKhRLsvc0JbS1KYyBdyLFLIX6D3BAnBKMgQPLpMQQYdtq1MBtKiZZgSMoPfOrqaFbl4zHe7YCsICgq3G677dTDCcU11k4UEawx9cJOmXxP3Gfcc6QRek8r1hLIWetyyxydaB2vYtVTaZoTHatxNR0kvYBiuDZAaSd/FrUGyMMFKqe6j3R63XpVC08ss15z4VwErhrjQt5WQQCjwKGA7XTI8KnxQqKh3m8/2YWSHwSCwCJGIOrOIh7cdG0ZCAi
OYRopNMwzZsAh5QRcxxKkCaUOlpvpZaF5XrxAirKivEw2GNdhSjl9xIj4utZaa5ErhNeUqVbMjrHEleJVOBavVoX9OrTMj0sog6wgJRLIFtZCtkFQXJR0gSfJb4TDdalK3nuFUcm3DTTnl8tRqpAerjphQwNkxXushOAoPPARZ+Oi1YCBQ1P8ipAB09u7RBFxpeGIWI6YJ6FFquWwo8SggCNqQ0Md5Rzcb7/9qhiHIJ9jhUKTvrwzSyXCm37961/rOGHMaLpi8Ug7ZYvNsik2yiWHf9ArCzDOEVfMoSAQBBYfAv+ZeuYTBJYmArww6IigWoRAgAvvDCPK90E5GF4RXRApbwkSy81glyDBZsskUch0omIoCH+WF3my6N52KcAWAeIv47URqiIi2CUEzUwdc8xGe1Rbp4gUtppaWrWYhKO1ZXurUOgP6lOvzZLJZydyCDlg7ElBXgHG3gvlIVPx3wlVxnWQhuqyfjlFa71M44gjjiApuS7py9ouTMJ6MVxQtFNdC/+A2M9+9jOXqBxIiuORAw1rwi3NsECdKuM1n1gIBEhcGKFgRrwHR1R/vZpUX6qG4b+ooUyxON39PLxLFYzedKZVlmsJOXJp4U0EHsFMGuzNo8qLQ8eEKEC6wyOpy7yHrbXD10pOEAgCixYBT+p8gsDSRKDYCWlB9y38IfaI9pUW8sJGSuAHCAFWRA7xCGDdZXKLUICQAOaWxoNVyCSfoB0SPrQHMk+lLXpyljXSTmebZfJzUVZIKVVA3K5oHj4swUNEI5kMNjYgApc2gx/UmiYN48HZddddmXZtY9rZbAFGMrWW74Z0VIvJtYqgUk2tS6AyzLwGFEnCV3TcIWygyTZIAHIms2JoGtsToiQURg0kmXoIEpmqWpwGqSr/HaZYmWQnxRQuzxpqKF9sDW+a151SoYpiAoSoJodyo4C4JcFGVcPwX+0UGK6wXnePGh0BSdWXbn7SQSAIzCMCvV2ZlYXo83hX5NLzjEDRHZIAlw2BBF+phdz8Jqw1UoINUEGqlVZmtUwvCiUqyGduMYNyDAkHllOrposHFBXAbAgqFpDf8Y53lI9U0UiwHIVxJnvSYDCuIhhIRI5MdGrHHXckw/gIN3HU7n9Mu0YiEyiLSyhmnRFPEHaCA/lqPZdte5AJa9eF64qqltk+lkdZz8WZ5S/yUfnqVLNPN5Prx9fK9xcgipFeWo50nd4ylXdWZXZPl4+6tTYkEQSCwFJAoLd05z86fE3a8jcILDUEGGxrmPEVSoOFRXwrdBGMBA5+sUUL6DQCcQoZPiDmXFphykplckvVEm7ncqk4S3SISrAEBfy+LPy2nosygS3hKHQUsSZ17tj/8t3gQBw91KaxV54Kg0AQCALLREBAZAUm9u0VoaE7yxy7FFjMCBQp8ZeQM2Jx0EKBgAKEtInFoQktlDannUEgCCwmBHpLd7IyazHdZunLtBEoLaf+Tvvk/p1AduIRq60C+9e6tCgIBIEgMG8IhO7MG/S5cBAYOwLdXZXHXnkqDAJBIAgsXASyEH3hjl1aHgSCQBAIAkEgCEwJgdCdKcGUQkEgCASBIBAEgsDCRSB0Z+GOXVoeBIJAEAgCQSAITAmB0J0pwZRCQSAIBIEgEASCwMJFIHRn4Y5dWh4EgkAQCAJBIAhMCYHQnSnBlEJBIAgEgSAQBILAwkUgdGfhjl1aHgSCQBAIAkEgCEwJgdCdKcGUQkEgCASBIBAEgsDCRSB0Z+GOXVoeBIJAEAgCQSAITAmB0J0pwZRCQSAIBIEgEASCwMJFIHRn4Y5dWh4EgkAQCAJBoF8IHHXUUf1q0FWtCd25Con8DwJBIAgEgSAQBJYPgcc85jHLV8FsnR26M1vIpt4gEASCQBAIAksNgVVWWaWfXQ7d6ee4pFVBIAgEgSAQBILA2BAI3RkblKkoCASBIBAEgkAQ6CcCoTv9HJe0KggEgSAQBIJAEBgbAqE7Y4MyFQWBIBAEgkAQCAL9RCB0p5/jklY
FgSAQBIJAEAgCY0MgdGdsUKaiIBAEgkAQCAJBoJ8IhO70c1zSqiAQBIJAEAgCQWBsCITujA3KVBQEgkAQCAJBIAj0E4HQnX6OS1oVBIJAEAgCQWDhIXDmmWf2s9GhO/0cl7QqCASBIBAEgsDCQ+DQQw/tZ6NDd/o5LmlVEAgCQSAIBIGFh8DWW2/dz0aH7vRzXNKqIBAEgkAQCAILD4GVVlqpn40O3ennuKRVQSAIBIEgEAQWHgI77bRTPxsdutPPcUmrgkAQCAJBIAgsPARWXnnlfjY6dKef45JWBYEgEASCQBAIAmNDIHRnbFCmoiAQBIJAEAgCQaCfCITu9HNc0qogEASCQBAIAkFgbAiE7owNylQUBIJAEAgCQSAI9BOB0J1+jktaFQSCQBAIAkEgCIwNgdCdsUGZioJAEAgCQSAIBIF+IhC6089xSauCQBAIAkEgCASBsSEQujM2KFNREAgCQSAIBIEg0E8EQnf6OS5pVRAIAkEgCASBIDA2BEJ3xgZlKgoCQSAIBIEgEAT6iUDoTj/HJa0KAkEgCASBIBAExoZA6M7YoExFQSAIBIEgMBkCf/rTnx760Ieus84622+//V//+tfhYuecc86zn/3sK6+8skre737322STTT7/+c8Pl7zsssvucpe7nH766Q49/OEPf9CDHvSQhzzkPe95T5X8wx/+8KEPfei+973vBhts8JGPfGT49OQsTQRCd5bmuKfXQSAILCQE/v3vf+MBWvz3/34WUtOvauuRRx652mqrPe1pT5Px/ve//6rs//1/2cte9olPfAITevvb3/7FL37xwAMPvPDCC3/5y1/+r8RVqb/85S+rrLLKLW95yy9/+cvf//73X/Oa17zlLW9529ve9uc//1mR1772tU95ylPe8IY3YEsu949//OOq8/J/SSMQurOkhz+dDwJLEIELLrjghBNOmNCO9hYNhn/33Xe/3e1uRxq5//3vjxCMbupvfvObr371q//6179GF5uzo7/97W8POOCAvfba64UvfOEee+zxohe96PLLL+9eHYPx+dznPnf9619fT3/84x9vttlmT3jCE9773vf+85//7JZ04pOe9KRtt9129dVX/8IXvnDta1/7zne+861udavb3OY2Z5xxhpIEpEc96lEbbrjhK17xio022uhvf/tb9/SklywCoTtLdujT8SAwiwhQI/gUGLA//vGPl156qWn3hP6LyVpwxRVX7LbbbuWtUOYDH/jAdttt97WvfW2y8hPmM/auO2Dyf/e732EM3B+f/exnJzyrn5lMOGfNJZdc8pjHPObDH/4w6YK/pjWV8IMENFqgy/icPgIf7AaiHQKskgNjcd5552EVrbbZSNCkeKBWWGEFlfuL/QyILo997GNvfOMb3+EOd1AA47n5zW/+1re+lU7z5Cc/eaWVVuo2ycB95Stfee5zn9vNlNY1PZV46lOf+o53vEPixBNP/Na3vkUHks4nCITu5B4IAgsPge985zsV0/DTn/70qKOOqkltr7rBpD396U/nvCBF7LPPPte5znU23XRTEsUUG7nyyiuzwTvssIPyQjrYwi996UtTPLcVo+KsscYaP/nJT1qOxA1veEMaw4tf/OInPvGJ3fyFksYS1lprLfzPp9xbCNy97nUvUL/pTW+qXhx99NF6h9Moea1rXesBD3gAquHQZz7zmXve855KUol+9KMftS5/7GMfe9zjHjerd5EbQCTNySef7KKnnHLKiiuu2K4u8b3vfe/Tn/70lltu2TJ/8YtfICu+fvvb326ZEnjbM5/5zMMOO2zVVVf1la6DSH3wgx+UqbybrRXGfp71rGep073UMpNY0giYhOUTBILAXCLw85///N73vrdQykc84hH3uc99iA1cD1NvAKfAjW50o2222cYpD3vYwzy/GO+pnz5nJUkLd7rTnYg0rmhGbsrOSk396my2rr3rXe869dRTJQSu1rnsPVtYH2mZxx9/PFNKDOABYc7vcY97nHT
SSfK5Qpy48cYbs3n+cu7INPX39SUveYm0D5lhiy22qFAPZQyH0alDtKVHPvKR3CKgJlPJFPdq1LhakAkXeuUrX1kl5+YvS3+9611P1IvLnX/++bp21llnSWseqQMDZvvf+c53VmPEwVA1PvnJT37qU59CHGVy3t3gBjcgC+mXSJeXv/zlVdJfzAATIo20nNlIGDLIP/7xj3f3EnIwsHYVI7Xuuusai5YjQaP65je/ufnmm+tLyz/44IP5p37/+9+3HOE7H/3oRw2NO0SMc8t/1ateBatp/bLauUksDwJ+fW5OH67J5aln7OeG9ta45G8fEeC88Fg375xK40zlPd3MZX3EJ7773e+++OKLb33rW++0005TOX0uy4g28LymOjBFVHdz0zXXXJO1nmIbLEJhAKq80E7W9wUveMEyzz3zzDPxBq6BESWZT46DcigoRozBNoRQbL311m984xtpCQ984AOJNCNq6B667nWvyyVh8i2TY0WvB/xK3cLD6RIAaBg/+9nPukdZ9Ap3lWmU3R4Em9ve9rZ77rknh5dGolnsuqM3u9nN/BXSoQCpQ3t8XW+99fAesS+PfvSj73jHO5r63/Wud33f+96nGJ7E74Y7qlZJbiNcTf24KTZAavrhD3/oNhNF+4xnPINQwXIrNu8fxAUbM76/+tWvmpS1/vrr6xrYmytH0AxYrnGNa7gNfvCDH7jlWstluqna11lKGA7UxLAed9xxxpTIVBfScgqlcGPD1L00gdC9J9Nt2fJRN9MD1LnlbLXVVtLHHnssbtQ669Ghv/jTQJ3trCSWIgJjJ1CpcIoImOuwBMOF/UTpt0z18KGx53jqffeqDymbU3/sl5hWhYwQA8MyOcsEGnFhwKZYgye+ibjfsJmrU8zCpenbUzx9ZsUMYk2dp3u6UE2ChLNM2e9+97uzqVOvwYjp2te//vU6BVAiVKRNZJmNW9ziFkieGbwcRoUJR19e97rXWcbC/iFYJCWiCwaz6667YgkMedVDq+AT4fi4/e1vX9MyTgdygvIGBXExIz/iiCOq8BT/MqIYEpaw7777snam4FM8UTGhKiyWvnBbMI0aXOfuvPPO0BPFgshSCyqz1A5f/XyaPGNoAHXuuecOXBShkd9tDIj2339/xVRL/pEQy6K/1JGzzz4b3SHzyMQn/GQM+k1ucpPDDz98oNrZ/jqs7ugazcYAIV6AuulNb+pXXM3gn/LzQf5aq9BivabunHbaabgaF1g7NGcJMhK/lQVTxxxzTLuogeD3bF8lfv3rXxOrODFNYMQjG4I6SiN06w4INu4u3ZGPmlcxBVBYtzeepB7j1a086dlGoLfqztVmu+epf0IEPIj9IFmpN7/5zSIJnvOc55ib2nMC0fHw8lRqYvuEp48r0xTQtdrHlGhEzYSEroY8ouSMD3k6e0Z7UqvBxE7DpkUmyOPECUGaTi+Xf3vSebhDWORjtc2TEeweslZ2HHLIIXvvvXejegIgGDz6gUmwwqyaCABpZZT3sPYUlnje857HGKMsLB+txXO8ap7i30Z3+F/El1hpMsUTFdMGyKA1uqBTfDHFm0kOKIUGW7RSICiMzZglI4Jmz5QkOZrtdLeZND+IJS0SKAIfATPDPYSgkDpk+oAF2xiwRnVoKn9NxF2LpIRLEU6mckorY90y80ZVssRG9I/7oQ4ZSl9f+tKXAp/vqTK7zp1Ww2R0x9jZtaUVkximOwgiqoccWN1jRbRIqSoPIpbVmufu6XOT7tIdig4vD7UMtuSNaoBO6Vqli+4YTQ12M3NUcfkZC+RYASTJzYMaVmEkD6Ttx1KZY/9LfSGYaYMx7VYu4OYb3/hGN8fvVLH6VIPrqHhtzqxuSWk3sLmNp1PL9/B0rtu+avAbaYeSmAMEQnfmAOSxXcLUlijKQaBGTuXyNLfn3Vgu42FKQPb8qimXOD6zcD/Ogw46qEIW9ttvv7FcaHQl9QS0eIGgwtU9wtWqgKkzEcLslhY92s1PnVZmuuZ
NU7UELMxbeRMAMha6w7V/9atfnT4huEHgC75iHqxyD0omDdugjZe+ggYR24Vl0EJITZqE6yhJXWD5PFWRG/2idshUktkjYCAE3UCE0ZjXUQxJDWau+stuTQurojta7kSuJVbfjaRaZFTvzO9ZMkS2LoQJubus4mmtYlpcumQzRgLaDvHvYIoIt2p5nZj/Ko/ugK4pQK2SKSbcLdMSrrrVojsf//jHtZ9zisUqugMobIxdZzuNwoC6U7EsrZLJ6E6pO11MGt1Rv3AoNOL1r3+9GwNXUBvSgHg5RN0BLPXLRIVfpnGLdsVZTRh3dxouzuobrArTwc6luWw8piTck+hsNYNQpzyNyk2CyrhJ+OPccpx67mQdbK3F59wSZggtZzYSfiPcozhW3a51Cfcq+jhwOU8baCvpbxOoTEuAb2gGCrvtSw9u+YTeOr1qKO2zHZ0soRKjPN0f8mS1LeX83tKdxO74mQ9+6C5WLtDSqeWmj5aEeNQ2r/Bg6el/97tCazy2PJvQGo8tT1tGiAmU78fpScTM+GGXsbnmNa9ZF/HoZ5DYVx/n2k+CqMBeeg6qRyYFokpqvKc597yJVNfPPdBYj0g5ggSZeTM8Zp4qICiEj5/bQoKdoz8Jd+BAof14slAOrOmwDqItAxmo01dkTjCEEIfyKA0XmCzH44kJ118ImNVp1WQlJ8s3TBrmseX53sp4/EHGbh/66NEvrlNPSegAZzKRIVzWEiflmU/BIqI6rBMRpSGH8VAhLcFz2Y1hJgpwSglnh3zzZk4EoLVrTTHBggKZ5sHBpM4aiCmeW8WMLCYhTRliEoBmsPgpDj30UBXWitxlVoj0CHqoYowQc2LQ8Rs5brZKTFiJoy6KSE14tGUC2W3cvraE0/0KGOMRPyv1+y3Ah3l2q2ue040LTkyLMl4IB6tGeOMNdMM4yiK690zrYeur3wgGgCpV1I5BxO3cvbWE2yDylWD5fB+IkdXdhliP3Lc26sVu3fmoj8p1wU/AhMRdzbGCRvB8qUSi9WgOEtBArzXmbne7mxtGI11U71BwRlpnYQWlyneIgkhdQ9pKMJbjVmeKICAf75FTH7+OtddeG8W/KmNK/93DfllNRDGUhtt96AdimLS2HarqtNC4a6ET9UJ5aV4to6M93fukBq7OUsYhXbOdILpJp1RJt30OKeDTMov2ta8OqaR9RUSg596TaJmuCEPj61nqJmn5SSwmBEJ3JhhNDwhPNzNLDz4TOJae46AC4pRmVC666CKPBg87QaZ+ujL9sH08oLn52T85JATRFX5FSg5Yfc9KNMXaWsW6Pzlf62N6zZiJQVG5p61nhyap3Ck0ao88nAAJI4HwMihMe1CMHG2ib3mCJzJfPl5CrfEEb0rJVdUP/vegZ+TYclVpP5cHtuTqbCczo19OEOQBAZfmLpEz8MQZqJHJ4WEZ6PVAmQm/6gXkCfWmpLojEmXCYiMyNYzPXt+RM1hVSRTHQJDrmCgPR5ke03R1XFYal8VXqptGzUyX7RQQQDZvF0KPpIW24Bae45VfglwrM62Ey7H3MLcaaFontsKicIruyGE53KVuD83G1I2gjUlayQkTAJHvPgGUBLkL62UVDByNhBFVFRVE+A5I/RAwUXy60TKGDX1EOLjAJqyfjfELYsw0DO02KKS1VtINxgvDT1RCS8tvCQPkZ4LNaAbMkW+HBNCoEDvh/PUVN9U2to2JUhIable8n/ZjLqGAX6J7G6fRGPEiZcYIWmB3rt+g6QQc/AbV7Mbwc/O3UPWzdWkVKmawyCf+mmwg+jJ9zExGcLXWkTEm3G/Dt5xW1XL94Qv5wfoM5AuB8hnI5EE2QAOZy/xqksAZCv9WEs7a46tEy+wmtMctBHA4m05gro4Od8rpAzUYaPePgfOr6VYoXU/gbqbb2Keb002rym/fTWIQu/la7qHnGdjNTHoxIRC6M/FoFgshp/NlmDzVVzNmvxOeGssdzYNxDr4DSyFqmuJXZFojn4Wg2ZI
NKOFqJ8ULeOw+YjjOCfKmaO3arKzfPDODVch0OQ8Rxh7L8ZYZBsnjGLNRD4JFrfGUEYS7yy67MMwa4Jn+4Ac/GBER4Ox0goRZO4vC5rFnNOrJbGo9m8wCyTasvnN1gXlwCWTOQ1BoSz3TUSi12cajJsqt5RMmFCt7M+HR0ZnVd7aQHS3YR5cfOOoUM3tTzJq8MmaehuwBKkDUwQXLe6jjzJghc7pYEH/N9thXBhjzEwyL69TwVf2MvYT5JZKBAUPG18qU8GxVSa008RBXOcdZ1wxUJd2/5qkMqkc/oQKZbofcA/ynwGdNW+ZAwv0jB+f2FxEx0O4lio67BQVBdwSy8EnB0OjTDuWbtlK8UIG6wZzI5+Wo7iDWvuI6ZtjMOY4ouhPj3HHHHfXF/Y8JuSUQDgJkozvuELNhSthkdAcIBAN+H8igHSW3uFB9DJB7koB3VcZ/jBbywQKxZzVq7m1jJIpIpqsbWd3Ud82u+1YxCeUdQsSrKmbScPgB1lc8tX56LdPpdQ8oUKdTdBB0X3EmXAotluaG00gdx/+A46dH7HGT1KWrcvePscPq6utS+2s4eMO793kDpyW6mBgaD5n2APH7QoWV9Oir3063cNJBYOwIhO5MDKnJpTklwRyN8MSvp6enm583zcMv3E/ds54sTCjGgVgpM12CBF6iRj4Oj1cnkmRI6ExauwyGZL0MGtFyJEwpTFzYpzIAnvuqYqE96D1ziTSqtdaGoeVnYQN4GZgHDw6TTnTHelqPfubh+c9/vocINUiBMjCsRTNR3StWuviEaFmXQHQoRtQdz3SWz1QeZZnj+atWwU2nxNuyN75WCyVgYu6FRGKf1fjhv1W4+1cIkWmrYWK5eT0knEWfA53hQyU9bWHrilgpq+ao6yKazCc2w1obQZmYpfGl6yAZpCMcwj2AFVkSAi4FZJKIiFIOYSqIcteWK9D9oMvoIx6gsLFTeTuKdxroZo9bfjeBzTDJjIeFOZilmw270nj+O54par9D7gdI+svAaKocsPja6iF1DMh+nDs+bjk3T5kr9xiDVHqYE3WQN1CUj7tCGZZe90WVOqVV2024dV1FB+GGphe5bAXAjoRVpoahGnh5eamqTLVWs33KplYvWg3jSvjp+ZlP2AvNQxO1BBR4z8AVIdxdzj1wdNF/RaPhNuNuunXN+mZ8ek4MAtNFIHRnUsQ84F796lfzg5gHi7FQrkgPY8auePLWI1g+28BfbsbP2rENxBhGwjPatFslNNhmoVk4AbCoDEvQvbDYQ7a8m9PS7Irpo9mty9W0vkxRK9BNaJWHiBzqjgkrEibHLFxjJjyrdjRhX0n9SlrRwAOFImAVpvJUHwyMVmR2bjaPD5n9o2tsuUBR5t+FzK35BShP2GFrCecCKqDYsEzdykyY0GYMAKXTDAbStNtXNlhhO9RpD/dTA3O4BlHeoqDMwjFCOjmq568uOFdEix6hERxSzBs/BXUBVhrPfUYKYsi5sfj+cBFmTNAG0AyTvwI7ioSRTFBDJtCJOAclQD2uogDpi0SHa2IbeBXWQgoaMPCtwW4SJFgZ0ogy1JQ6VAYeC5HfMttZEgYR/cJg9IJ5VoNMyMvEQgwQUoiFSNRZA+NeX8sdgNrSgQy6zCqscl91R2dRPRTQuEvXUX91WQv9CurOVwCP4Tib8NZqdUq0S7SqJJzVzVczWMzyh+UivB9Kbumint1Klj+tyygLZXT5q0oNQSAI9BmB0J1JR8eDnoRjn65SPjADcxHPYvEfDHCtYnCyRzZOY76LJbAf0vRzxVgFhIAdxQ94HNgnRohowZp2p0QMc2uBJ28ZPM99Co34we7ckVUTN+CJrx4uMza7nWiWzBFjZs9FwmCw7kqyRibfbLB6hHC2wt1ErQ9qcyztd1RgBLcdlqbx2Ab2RvAwTVc/EuAvDQkfqnq0mVCBEXarZf4JD0xvN3Mqac4mSNIqFNY2DprmfNEjsJRZctEJayNHkTqMiA+5BX2RUFg+NEh
oKCm+IsRYvsm6IePCVxWz7ai9YVyiW7N7gFWuQTGyqiq5S9ooIDcIFpcNS4zFNnnAKUa8W89wuiiCZkx4aML8Kin6QZvBa9TaFbuVCLvpfh1O645e6NowG5PJj6ZaI0gucicPn56cIBAEgsBCROD/e7gvxA7MRptNmvksiCJmzAIbuXWYN84OHi6G3+wZZXFdkRAsPac+44RnEBLIJAwGCYTZ5g1hCNkVIRqiRlhHZYSGiCMRUNyaTcOQdi49QyAFo4ujsM3Ks6A0gAqn1QBmjI2kGCmDcxQnMNF3CaKRSqwj4wvwUZLaRNXQQoyhOcvbRVtCABDb1kwvJckhvXY5woYG4Ell4OVjfiqX6OorCARvmga3OiX4wghg2FI3c5lpvfARhySaBLGAKhC0pMiNRuo4vQ3Cy6yqW0ANFIvaXYYsQcXpHl2eNN4AE/Vje+isGwOF0mz5nJUGGjUZryChcqNpaLDkWQ13EJe2PMjk3CAQBIJA3xD4//TkvjVuvtrDqFuuyb4KyxCwaRKP61A+EJEKRGXPfNhdXIHtqVkyrsAw+/BMmRYLE+HxoRZgHuy0vlBNKta1oj2qd3QFGoMaKEm8KvPV5T5cFxHBHeFZ9GtcTUJHjN103WrjunrqCQJBIAgsKQS4Nbb+7+o/jpHmOugDAqE7czQK4h74sATHCESYo0vmMkEgCASBIBAE5haBWuLqmn2jO/9bqTG3gCy5qxEtOJ56+LrKJTcS6XAQCAJBIAjMGgIj1gLP2jWnVHFid6YE0/IXEu8i7Gb560kNQSAIBIEgEAR6i0B3Q9FeNTLqTq+GI40JAkEgCASBIBAExo9A6M74MU2NQSAIBIEgEASCQK8QCN3p1XCkMUEgCASBIBAEgsD4EQjdGT+mqTEIBIEgEASCQBDoFQKhO70ajjQmCASBIBAEgkAQGD8CoTvjxzQ1BoEgEASCQBAIAr1CIHSnV8ORxgSBIBAEgkAQCALjRyB0Z/yYpsYgEASCQBAIAkGgVwiE7vRqONKYIBAEgkAQCAJBYPwIhO6MH9PUGASCQBAIAkEgCPQKgdCdXg1HGhMEgkAQCAJBIAiMH4HQnfFjmhqDQBAIAkEgCASBXiEQutOr4UhjgkAQCAJBIAgEgfEjELozfkxTYxAIAkEgCASBINArBEJ3ejUcaUwQCAJBIAgEgSAwfgRCd8aPaWoMAkEgCASBIBAEeoVA6E6vhiONCQJBIAgEgSAQBMaPQOjO+DFNjUEgCASBIBAEgkCvEAjd6dVwpDFBIAgEgSAQBILA+BEI3Rk/pqkxCASBIBAEgkAQ6BUCoTu9Go40JggEgSAQBIJAEBg/AqE748c0NQaBIBAEFgQC//znP7/61a+eeOKJX/rSl04++eSBNr/lLW+5853v/POf/3wgP1+DwEJEYOWF2Oi0OQgEgSAwMwT++te//utf/6pzr3Wta62wwgqtnr/85S///ve/W6aS2MDf//73a1zjGte85jW7JdspLfG3v/3tj3/8o9OvfvWrr7rqqsq3Q31OXHHFFQ9/+MN///vfa+TrXve6e97znq21l19++Sc+8Yk//elPl112WcucpQT0fOC28sr/M0mVueKKKxqRWbpuql1SCPzv3lpS3U5ng0AQWJoIPPjBDyZj4C6rrLLKIYccst122934xjcGxfe///373ve+DP8nP/nJhzzkIXK23nrrc845Z5NNNvnGN77xmc98xtHJEPvzn/+88847s8prrLHGWWed9bvf/e573/veZIV7la/NxJuf/OQnRxxxxHOf+9xu277+9a8/4AEPWHfddddff/1u/tlnn3366ac78WEPexiC4hBKBDTUcPPNN7/tbW+LQh177LEQRv7+8d/P7W53uy222EJJzOnTn/70lVdeudJKK8m8xz3uIROt3HHHHSG81157veENb6hrydxhhx2OP/74a1/72p/97Ge33HLLys/fIDBzBNyR+QSBILBwEfjNb35z4YUXmgov3C7MZcv
f8573POpRj+K+YZIpB5/61Kfq6i94wQuOPPLIE0444ZRTTqmc9773vR6sZ5xxxsc+9jFW/KSTTpqsnewx83/JJZco8Na3vhV/kjj//PPRCNTHxwD97Gc/a6f7eu6556JWlfPTn/4UYZLGDX7xi18YUFX98Ic/lPZXSfUQmaowbnHve98bFaivY/n7xCc+cY899hioiuqj+xtttFE3f999973VrW5V2tXzn/98hzTsSU96kpzrXOc6d7jDHV75yldqIdqH7oCXXPTABz4QMapKHv/4x2OZH//4x9EXRLMyH/OYxzz72c/+7ne/+8hHPvJ5z3teZWJFT37yk/Gqj370o7vuumtl5u+CQMAvpRjJ5z73uV41OLE7NS75GwSWF4EPf/jDr3/96w866CAPbobqwAMPJB6YNFe9X/jCF/bff3/Wa3kvM3T+LrvssvbaazOKQ0eSMQECd7zjHekKlJsHPehBHE8ex60QDG9961vf/e53r5wNN9xQ4oY3vOH222+/33773ec+9/na177WCrfEd77znUc84hGGm42X6aynPOUpEjxB22yzzVZbbeVEA+RmkPnLX/7y5S9/OSbh60Mf+tDyIrkx3v/+9zt68cUX3/zmN6c/kYgY/nXWWefNb34z1vXYxz6Wp0kBH7pIdaG+juXvTW5yE9RkoCp0UFPxsJb/ox/9SObBBx9MEEJ6EEGH3vnOd9Jg+AGxHEoYHon3oGs0IXwIYn4XulkBQE6h6DgKlp122qlqRgc1QMm73OUuuvmVr3yFqOauhsa3v/1tVO9973tfs6CtMUkEgekiEGfWdBFL+QWAwEte8hIzck9wz1aP42m12IP1Fa94xY1udCNmjJ177WtfO8XT6fkf/OAHWcdb3OIWu+22G3eGNmy66aZl/Dy4UR/tWW+99aZY4RSLdQ32FE9ZysXQhV//+tf0FZaVMmG8Cg02+F3vetf1r399zIa3ReRKlwHc7GY3g7MhHvaqEIS4bzCnqofXphw3e++9N3b79v9+jDsipQChAiFg9Zl2BegZ7jeGX3scdZU999yTeIONaRv/Dva82mqr4VJHHXXUC1/4QmXoKCKI61rj+qtV73jHO4ZrK4dUy0dE/vCHP+B2cr785S9DSQJK8CTwKEzQuutd71rlffWRFvn0rW99S5f9Lt72trf5afhN6RR6RKMi9kBe2FN1WQ6JiMzmKEVt44035tXCfjC/qjZ/g8CMEVhxxmfmxAWNwJve9CZy8UAXzDg90AcyF9xX8QH8/SaUnpJ3u9vdptV+8Qoe3K9+9asZKc/xH/zgB1M/3VkKM2Ye6BLsh8f9q171KmlhDawd02Xi3q2QB6psBi+GMnXIfFoUxQEHHFARtSbHBksx3panP/3pnC9V7LzzzhPrwDZLdOtMejQCvE54ZwknNJUKzsUkLrjgAn4o6sW73/3uww47zMAp2aq66U1ver3rXa+NUcsfSNAhDIrP5z//eYfEAHGc4bsbbLCBe1IOlxa+ReyRZv65sSSYfFJQVU7dmVBDUmxuPiQcqkzdliOuqAu3uc1tFNDsNddcE0Fxw1vkdeihhw6f5YfgVpev+yQ0rAjbe+Mb31iaDeqD4ZkYoEomDD/+8Y/9Bm9/+9ufeuqpqA/FyE/AiAxXm5wgMD0EeuVa61tjGB5TN455Suzct83jgyw8S35rDxFu+Be96EXiDEzU9M6Dxq3DzM9lT2keQkRd0RPwuOOOM8etqwPcVzr2jBvDwOgOp9K0ahAowBDWWb/97W/NO6d+uke5K97vfvdzCp5EomfPzjzzTF9f9rKXMWOCGLq13ete91KYR2P11VdHy5hbR4VKyBcVaxKP8aBufC7sLnuA2ey+++4sh8pdixKAP1100UUCL1yXmNStPOnJEBAOgk0y56Q72EJYSWaVR0nCjWc4bnnLWypg7ABb96RfijSeNFwtP46xeOpTn+oQ8YMDiGJ06aWX+krSGPj9imJBo90bjopNMfoSKLLKWXe3HFZ
0pzvdSWYRptNOO01aJVooUR/3J6Hlqm/L+99Nrl/QcEVsD49xv3n0aTyuhuf5JfpduNPwb0TEQ0kHcZTXvOY1ro3HPOEJT5B53etel6yFMsqkRekRGueuRuUdpWPJf8YznvG4xz1Ox7n5UEwVyhQsBQeZ4odc1KVlQpKmpQHkLvf28jwK1JbPXCJQLNYN0LfYnavNJQoL7lrUZmPmY8VBLVKdyy6Y07NtHkDTvaj5loc4fZge7iNBbO9WYhLmgW6FhYeL3m277baKmV2x9K0YMd9zUNSk2mR6EKvTpzLrkSSf1KzkPvvsY67WzqWsUDjYEhNl+VWYASDaM/kmglXSPI/PSEk2plw8hx9+eB0iemsYp1Krc7qJmdEdV8E2KDGev1wJ07qox7pnuhmqXjsdYt3wT6QEZelWyEbyShgaoj2DUYf4v775zW9KUx2YlsoUFFKGlp0DCxcJqup0FkiBolkChqpw/o5GgGNF7Iu70XorQgvDrDynCfFMwu3NbAP56KOPRjUk+DRpMNgncjlZzSJslBSjQ3sTlSJsRUkqxVprrWXsOGjIPEiqTPIhEuD2oPP5daP7Mv1e3HUmHoJ2UQ1iibb5kaoTyxHF4nT74tTV0Qts2OjX1+X/65az/Jt2Jcr4Qx/60Ec+8hG6r2q/+MUvAkFYMQ3MLV103I9dAR8EyG+/rs7xRIBRWJcrR0IZNTskR7W67xGKRLrhfTxwdMRRjxT50pUvXZn+IoU+fkcVu/2roY9AqKq/Lpq/PUGgt3QnsTseKRN//MKtiiQyM3uEFr9zj7xuUdNEDyOzMebHk8vkDDMoY1nFPMVKLWDj5bD6vkqIhaQY+xlXcIBnrh+/+T3zX795VtMvuTReMky7qOmmiyrsciZYJmQS7JwnuHmSerAHnnU8RrykYAITI88X8y3W0VSs1SP4kYrgoe9EmR7x97///V2xwgvkWKmrzbQHDyPzMLNh0gKa4pAnL1MBDdf1lVBPe1fSUQ0we5OJJopexJ+oEQDBAGjXuI4eeZTTpQFlMmeNq4mdQ2ZyZrQWo5oiO91HS3AFFqi+zuVf82ZD70n9nOc8x+ND96d4dd03WGwknQwFYQAQoBHnClaAM0DY1zbKGIzxMjpuKphMdjoASUdm3pMVSP5kCFgExMvpJ2agySrQhj8BA1nhjhRoYsUQeI0dwuGmpXf6xTn0rGc9a7I63fZIjJkJk69mt66SajBNUqEfdTtxs80280N75jOfiWGw5UWtBAxRWD1n3EJ+R2iEfPzGV5zDD9lX9VQlvjq9tJ9W7VQSKJSOdze2cZYHiN6JPpYma7miex7n8BOoXXBAoWuu7mcOLo8sNfjoHc+dwk6U6dmoKpBqthwFnO6o34IGi7x+9KMfrXKHpvJxUUzRA0FiRHkPN4873HFEmRwKAv9DwB2ZzzACnlCm1ywK0+6pVO4e1hcp8chQni2khWADMk3LiBPl6WfaPSwUOOaYY0zyzJkcZTvl0DDQEdCTiz2/8BUfOR4NDL8LldubffW8M6FEUDxEhAG25vGpU2VQBBNNps6UyCG8gV30lPEcV63W4lUccB5eHqMIFkWR7WyVcJFoQ622RdE8qmgzcmjLntqeIEriNx4iaBA+ZO8NOUWMPKz1zooJa1twJvnQMKN1UW3Ti7qKhSceVficakt10B5pOj/i5UEGpSpZZTwl6+ty/vUIVmFVMjN1B40joXvIqsROKgSqaTUJWXHPYHX6qL9oXzudVOPmaV8lymKZBGNIpeobLDdSuaVwYo9yE1ol1QY6iVJ3UElTbR3Ewt1UJAHXGu/K5G47F1naPe9uhPlAv8xAEA70gvkfODQvX90eCIR21vNk+dvg+YC11GOn+9cjyLPCp5s5YdrTzA8fq+OSMw+UmOLHI0JUwNQ/Bqg0zuXvdWqYewSi7nggL6SP54JFxX7VngLmUjVds1qS1SdF6AlDaErkecQ+oS90Fz9mhoo7xoOApTQppDlz5ViBjDGY65s
p4jQ0FfMe1osPuxDhsrFIBGNAERg/E0QF0BrMgPnEXRpwqImvntcmTDiKv4QcNk/gi3WhCBAKIgSBfKJ5LCh1ihVsp0uwl7iXFShtta0fA4JFo9KkigJRjNij4+a7pmUaLIf8oFP0KgzMx0SNXi1fnKzAFEDhT9ojx8d0EMsxIUZrqO5yXMVfpytjVW0FOf6n6Fg/hkZfYGhEGC2E1VO7XYEfAe1rfWz53YRJLbpjWHFQ8juO0o76DRtinoVan9zyuwlDoIMvfelLFdMSY1FHVesOcfPIx02FYmBF7itHSXoGRc2uZbAQa9v2W4TMJ4K9uRkMtMYUPWVvBJGIY+AKMY48XCJLIFycu9uSeU+7/fwW+BrctAONMTSODmQOfFUGDpSSgfzl/0p0nLASLkWfCQ/NS6YnAD3pAx/4gEnIWBpmNlUTquXvTm3Js/z1pIYgMJcIxJk1KdoIAbc9174YGpMYBomVMkMqy82iO5O+YuMsTIUGw2vjQ/Jlbnl8TM7MvE2GCAZK4ijojgT5BNFpXEcOl5PZj/L0IcE0VbP84Q8FhbMfmxGOw6LjDaUPc52gO56JBCc2cvjElsMrj6BUSypTdxAazMZXaVdHcdAXQYJUBHaUBt5O7yZwMmRLVXQs/WWh+fhBgWFolbmyeAgLwkkdzrrBDW6AB6ifKkauBylkXJS1dlRhZ+FA6qxLwNDyFkJUc291Lz1Z2jxVJJARKRXEOibMrBXGV7SHm0wMTcscSDidTmbtOoEHzyjfXJUxNJilu2IE3VESX9Flupobpvou0zpkwhgOTbdzz4jnQGQRZWUIhHqtZlwHIUOy0SA5htKJ2C2Bx0xXVWK6tcqd46/bAAfiSQQpcqYkAuF2lZjiR2OQ7FpKg5syq3LauRop7W9V3vKnntAwg6upw6eYNuiOAsOHWg6gADIbdKddoucJP+dxsZOe9zTNCwJzg0DozqQ4swTiKhhIgSl88GbqfNVKdx/Tw5PUZn4kOFZ4iCRMcGvzlbpY1dBKogLMP8cHcYgxK5NfR/31qYsyPKb4uII1PtgAOmKtrAeitiElDtkult2qR6RZNXpBG+AQwWDK6rM9eoEk0XKqJdQgCf1CVtAappqB1192jrpDHBJjqHl64XKsKevLhtGo8AmCMzkKa0EBpcVAQEPzVOVCWAXdok5RjFahKgZM8+xcohlFsEgpGsAf5y81BQOohqFNzDxvWn2d+l86HOcORghww9cGyBCIlSFr4Totc7haS0iwNwV8Sksr/JVUs55CpvirHPxAICee2mWo0rqPdMJfok6X2eUNMkXtYMPGvUoKycJm6HkOOdHV5WONcOa3IrZhRRDGlYfbvJw5QlWGa6joddct3jNcYCo5A72uU8g2SAy/51RqSJkgEASCwLgQWKE9zcdV4+Koh9sFq2CVzcv5IMSyUCDMt5hnFp3VZJ8YJ14qEoj1NRw6nAsEAKdQRwjRPBoy/VUPBoCaMOrEFQK1WEVTczpHGftCTCgfkcY8vr7aiUt0MItbkoAZv01x1Ca8gOsKe2AL+cuIIgwhIcRZJJ+nPe1pTBQb6VykxBTZ1QUP1V55WBFCgxjVJfwlhPBJcfpwPInWtDSDVUabWH3VKoAYMb3qET4CEzk4BIvFbSSfR4btl2YUlXQvuZYuEHvEHLB29CosyiXoUsoL+vFXgJFPtQGtEZpQVMAlwFL5eBWyiGk1gaTyR//VGNt1oHrOUq1rtfLaBkn1y2+ZEybwDPqNAGqCHDJX/ZqwpNGh6mmnxIQFJsusxhhBwFYZOeKx4Anb7lnK+GA5LgHMLmfqFht7mjoFvWmBP/Y2pMIgEAQWIgKsjLmllgsbrWU6PelF6M6kA4GXEAkYSPYP2zARV5S2ISyGbsFTw91ACTC0zLklSCblbBWRBi9REikReYphsJ0V3SL8WQRPE1oYMJ6gZuCRKta67hKnm+uTRjAGhlC1loCJOaiFFZiWNVld18OkfegcEDtiHs9/QWLpZI8
/afmGyBJN5bJB70Q4iUrpeoXGf8mragQUtyPutTy0wHALORIIDOdaeXdV9fkfBIJAEAgCy0AgdGcZAPXwMLOHoDCcCAcHU+MlZuScFL7iKywrIQS/QYY4HWgSJSFUd8yPaQlKln9qfvsoQkhoiNAiDZ7tlnC74Gp6TXCiBtUy19m+aOoPAkEgCASBeUcgdGfeh2C2GsDFQwwQh2Eljrjm7lKg2brkjOrFPHwEOM/o7JwUBIJAEAgCQWDZCPSW7iRUedmDN7qE0Ao+KW4UcTy95Tq6YA2zz+i+5GgQCAJBIAgEgUWJQOjO8g6rqNvuhsXLW13ODwJBIAgEgSAQBMaNwPRWlIz76qkvCASBIBAEgkAQCAKzjkDozqxDnAsEgSAQBIJAEAgC84tA6M784p+rB4EgEASCQBAIArOOQOjOrEOcCwSBIBAEgkAQCALzi0Dozvzin6sHgSAQBIJAEAgCs45A6M6sQ5wLBIEgEASCQBAIAvOLQOjO/OKfqweBIBAEgkAQCAKzjkDozqxDnAsEgSAQBIJAEAgC84tA6M784p+rB4EgEASCQBAIArOOQOjOrEOcCwSBIBAEgkAQCALzi0Dozvzin6sHgSAQBIJAEAgCs45A6M6sQ5wLBIEgEASCQBAIAvOLQOjO/OKfqweBIBAEgkAQCAKzjkDozqxDnAsEgSAQBIJAEAgC84tA6M784p+rB4EgEASCQBAIArOOQOjOrEOcCwSBIBAEgkAQCALzi0Dozvzin6sHgSAQBIJAEAgCs45A6M6sQ5wLBIEgEASCQBAIAvOLQOjO/OKfqweBIBAEgkAQCAKzjkDozqxDnAsEgSAQBJYUAv/85z//9a9/9aTLX/7yl1/72tf2pDFpxjwiELozj+Dn0kEgCASBRYjA17/+9Vve8pZvfOMbp963N73pTTe72c0uv/zy7im/+c1vfvWrX3VzpC+55JKLL754IHOyr3/5y1/23Xffd77znZMVSP7SQSB0Z+mMdXoaBILA/CPw9re/fdttt91pp50+8IEPdFvz73//+6Uvfenhhx/ezVyg6Xvd617Xv/718ZKpt3/11Vd/5Stfueqqq7ZT/vGPf+y2225f+MIX5Fx55ZWV//e//33XXXf97Gc/C66//vWv3cJ//vOf29dKqOFa17rWDjvs8Le//W3gUL4uQQRCd5bgoKfLQWBhI/D973//He94x/vf//6vfvWrC64nTK/P8ccfTwLpNl6nDjvssEMOOWRYz+gWWyjplVde+ayzzvroRz+K1elvNRt3ec973vO5z30OWZH/iU98Qv7vf/97mWefffZ5553nrCpJ5nnIQx6i5De+8Y0PfehD97nPfU455RSHnve8533+859/8pOfvOKKK2688cZV+Mwzz3zWs5615pprHnHEEb/97W8r83vf+94zn/nMFVZY4fnPf/51rnOdyszfpYxA6M5SHv30fZ4R8ND/5S9/2Z2kznODJrk8E/KABzzg8Y9//GWXXTZJkTnNvva1r/2HP/zBLJ8eMKcXHsfF2GBW/G53u1tTLNTKO7PWWmvd4x73oP3c5CY3addxb/zoRz/6yU9+wq1TmWJi8CHM4Pzzz6deyFTPy172Ml+/+c1vbrrppk972tNIIFX46KOP3myzze5+97vvscceLfNd73qXq2+xxRbnnHNOuxCiQJJ58IMfzPXTMidMKPmoRz3qSU96khq+9d+PE9/73vcq7O/mm2/uqDSd5lOf+tQb3vAGnGaTTTZRUOaLXvQiJ2rn9ttvf+qppz7lKU/54x//iOKQgtCX173udX/605/qorqmI2jKNa5xDX+f+9znrrbaag45/epXv7qvmNPb3va2KuwquilG54c//KE6ZZ577rnYko7LfOITn9j/n1h1JH9nFwEP3HyCQBCYewSYjZe//OWe3c94xjPm/uqTXdFUm4UYOPqLX/zCY4i9oaYMHJrHrwzbPe95zxk3gDH+yEc+0j395z//Of7kg3NgFd1D3bR
DCij2ile84mc/+5lDOOtrXvOagw8++NWvfvVBBx105JFHdstPmL73ve/NDLdDKMgGG2xw05ve9DnPeU7LlNh5550f+chHIhZ3utOdKv/jH/+4sXjc4x6HZLioTNRHoIxMDrI999wTe3jrW98qvyjIjjvuqE6Z2izzwgsvRD722WcfBGjDDTf8wQ9+IPOnP/3pXe5yF9rSxz72sfXWW0+FMif8IFUuRDhxdLvttqPQSGg50UUC8ZJ5vetdTxrvURJXk37Qgx709Kc/XeIOd7gD3BAdoovCuvDUpz5Vvs8JJ5yw0kor4dP11V/xzqqi4bUcCaQN+6H3tEzyjwuBQvsRpvvf//4OfeUrX5GJpksbl3XWWaeVT2K2ETjppJOA71My3mxfbur1R92pccnfxYaAed53v/vdNqPtYfeue93rCtTQQka3qf3L306miDGbWcd5EBAacZ2s1BlnnEFU0B7z74suuogJR4O23HLL1kKWTxmfSy+9VCbjxKfgRBEb/A7SpT208gMJlZ922mlKcnl4YJmOs4IcOqeffvpd73rX17/+9b/+9a/rFCyEwaZ5cGB1K2Ey21fl2U42++STT9YGMommVvuPO+445pYqgDeUoKLBHsS4JhtJiiisNJvkwF7qvtpGLCxC/g488EBsVXvQDjoTjviSl7yEdHHMMce4hGs14aG1cHSCUsVI3OY2t+nG6tIwoOr2QHHud7/7VQ3ve9/70JQXvOAFAn0+85nPaLaz7njHOyrwwQ9+kMnHZqpH5enDgZ797GeTWB796EerYf/991977bWRD9oJrva1r31NJrqDTiEKaAHqNuBl67Z8lVVWuf3tb889ZNxRHJqKoxheuZAc1ZHuvVdjRBfUbCV9rZyq85rXvKYrduufcfqAAw4QpuN3xKulEtT8Rje6UQ0iyUqXATXjynPiIkFg6swoJYPAMAKeYiaUnvVXXHHFwFH2wCOVbRjIH/vXD3/4w8N1snx+onNw9eFLTz2Habzhfz8SI85CCCgZjBlNnukiKpT24Ovee+/94he/uJ1rXl724xGPeATRiPGuQ2Ij1GAmbQ4tx+TY0DBXxk4N6qx5MC2h+1yjQFThyvzOd77TLoTNNJ8LV4J8fEgxAadbbbXVQx/6UDN1PpdWfiDBtcEgsdAuXXP6T37ykw984APVwOiq8Ha3ux12UmexxNtss42m8o90Z/+601V30IWa2TsLv9GXOt16H0uEpCGAl0jouNujjooaPuqoo6Rf+MIXunqlv/SlLzGcVWDgL6upbUJJKh+32GuvvaRJNQWXNAEDHRk4ceDrgLpTR3WnqR1ykJJhDMtVVCNy4xvfGEFUsttfoS0CgGSim6SdVhL7lwkibJLggbFhdVimzK233pp7y9fKNFWQOeLz7ne/m45FMSrpxV3nKlUVfQjtcG6pOyUUud9IMjLRsle96lWl7nhiuEsF99SFRqs7CLc7RElc6gY3uIF+SWOrn/70p/m/7nznOxPGCFcItFHm53IUkvirvljzpXnor5J1rfydVQSi7tSPLn/HgICftF+7OfcY6lruKkyg3/zmN3voMACe4N36mGdkaIy6RbfyljZDFaDQvrbEfvvtR9Bm6lrOcMJjt57U2AML1+Z/OqVHBIDuPHX4dDmEBBN6H4ykFfDc51xgivCDlikSExExnbVOxIO+5U8lQUXgmMAn2GYJDdNaxsy6FX1kJ5AMXglVMedEEZdGX0RIVHAGwYCvRANwFAZGmIgyhAE2DwLIhJGiUjhdjvk6g0r/uOCCC/BImdwTTtSjrlrDi6EXjBA1Ba/1gKMiMPPUCO2Enm4iKJP1jvsDHWGZRFc4Bc66JjxIy1k1BIIBa3fOwx/+cM1TABsoeWbCat1+BDOhJ6yaQBCSQxUTzkKNcCuiUxX/gTax/WussQYBgPZTygTZgzJB8jFGhCtOxgmvQmoy6JMdrVPICRow4enTynRDUowGTrGyGhTA12x+HF0eKNC+svTGAkGkVxl
0zw2HoC2kF8v0A6GHlXOKYIMuIL4ySxWb7M7XtV122UV5v/cnPOEJ5S+DKvkEzTJ2SLZfk5sNi3W5L37xi8cee6zIYrKTr5gQhMuD5oahKrm3hSU5URmnuLcf9rCHkY4UhiEShkcaJl4wspNM4hBuR2jUcb3TL0Rq9913F86FwqocQcTnlKR+/e53vzMZIFa5c1AfJeXns2QR+L8w+CXb/9EdZ0rN+TxZcGFzVk/h0eXHftQz2m/bc7lbsxmVKRR92JS3mz/FtL6wGd0Fn1M8ccJi9HZ2xTSLy59Z8ij0XPN4YnKI/B6It7rVrepE17VS1NOWJsGiePCxGTJZNZaGQT300ENve9vblp2wbkX8gbR5uRpYbo8t1sjT2eOMGeDd8LAWjsDW/vjHPxb+yRirVktchdrkMaphyrg6oy7fUc9lj1Rtc7pLO1rtZ+8f+9jHYi3YgIcvYD1V+Qhc3WS0hKLh7msMw4NzeNS6N7QZdXBpE1btgbAHujq5HtACrgQzfvSLfuD5e9/73leFrqIYQIYr7+ZABpgUGlfBSHAFR1kXPg7ylVhOF9IpmcwDyURPeUBaDUwIVmRoIMNYEofe8pa3aImWIyscEMxPLfcl2GiP7hhBn6pBjiuaK7d2uhb9QD1spA8YjSC3CNZlxr/uuus6USVlSlszugl2WiVwoBNoQDukLxrfvkpgTpqnKg12CQW6RwfSfq0QgAx468ZzV/AYso7uJfcDAQCGMv2mDK470G/BoKsH76FYIIJMtZLKEMAG6veVpYdwQXHiiSe6bzEA+YbSKW4qCffk6Ha6511XJUAwKPhHDZ9zixy4kdQAczcSO+1Xo2YtJ8OIuXH3GjtAGSOOM1c3RkaZH4eSgdx8+9vf1kHNc6IoYPxD9A9dR0lER4UYrcFCx1FSzMb94Mfl9+sGQARdVMkJPyZaWuhewoZph1xIigmyhp57CXp+aJqKY2Ebbk499YMi19XV3TN+vBqDJ7muu0XbkBInYiq+4uukwcZ0FbPm3CWQGwRawu/X9IafzqRCg11apl4g5TL9yopmyUSdISCRTxD4PwTcmvlMhoDnGphMCv1y/AgnKzZL+R4ZJqzdoDwX8lxg/NiAgYuyBMIFmu9GSITHn0cJbsTwd90QvPiqLefFQCUz+Eo5YDNIBc6tlaIsiuc+f4EwQ/PFVieD7RHsIethiuvQhBzypPMgM/fyLPPIrmhK0gJjaZaJTFhrit/wfWgzY4CUqMeImM85nc0w/aWa6I5ipnq67Oq+mm6yXnV1D31GwlAye+w9a+T56BBpnRgg4cnr0hV3ia6VJwKAThHQWpUM/8VjypHEbnnOmpIqw9hbJiOhnZDBAqU1iT1jEpgifUc+ZNbHReXoyFUZE/9nZvS6e4xR1zwWkZHj9aiYWQXcqCX1V2FdY00hTE1h5B7zmMewQw6V06Gu23XuwLkbQtuuiPa1Zrszda3Bqw3qVxKwhrhO6Tp3WiWVMBz4mQ9YimbV/YzJGcoqowuMGQz1sRpc/S1nXJUxpje/+c1RyfpafyHslBaDbCzwHvc8/YDjqfxu+u5ecg9gqLgaHulcxFdtDmEValBPt9pumjHmaFPGB6R64ajafK3uu3WlS8PonlhpdwJNy3TFp2Jo5PidIi5ELB3HPModo7z+mvAYU/W3m0Sz8R6/dydWnaibMviZr35ffk1+huZpfhQ+dBG/0yrpB+Isj5FupkM65eczkFmndP8aO9WiaO4BVXUPJR0ECoHeOrOulhEagQBby4wpIPbThEaCUanHqx88P4gpDvMp38NCmu+AH4TYUI8hmXRvqq85Cg28POKmL+Yx5ojmKO1pxep7prsWDaniLRCImg9RcWn+noCqdaHacKJrz2R6Snqm4w2iCjymXYIgYVbHoiBqJFxTcM1T0gdRYKvQo/q6nH+7dMe80FO+2V1uEc/oVr9WmXgVP2DjCdEOmRx7TBeGrFFlsjeIZj3cN9poI6TEE9YDWuV0Gmdx5TB
XKpRmdG9961tLDHxkNnvsUCkNFRXL8pk6yzS+sK0T2Rsfac1Yf/31tZwkjpw1G1/Fun9pUWar3Rzpdl3No1uU/SZjsIv0CfIDEExD21ld2tEyhxNGHFDdfIaQYqFfgEIQmbc66mZQ0pTaiNegk15IZTiZ2wwhJl0oSRkCgnuYASPCAbnMdk39VUjkcKKSOAcriAJiBip0J8vktzIcuu/RRgYzIjJ9xXFN36XdZsYO2ZIe/iBYjrpW3aLuWx2BP5LK6vtxETBk+kFB7Ba3uAVGVVN5PwdtcDnyBo5lgPxkSETtEq5uBbKeVo65gSaJj8Ipxaa0YgKiURYAVpiLfL8vfJQa5/boDlA7pZsAL85BeGgXQm7kkBMUU7lDjWF0T5T2WIDtQGa+BoFFg0DozoIcSl5/Szzwkmq9J6lHvKewr8yGJ6+vHpG+4ijS/NYyPZ09u2WiNZ7Inte4iEl/8RgFBPExJKaSptpVM8PJlrA3Hr5tbu2Zrk6Ts5pOMUsKe1aK2sEk8Ko6119zShNrCfaGmE/bUJjQXWIJ/qEeXKSVH2Ni6nSHuWUjES/dN8mmt2tGN8SytQrXYczqK7ah8SxugT9GumNuWja7ZuHEjxLSOG5M9OvqQBthmRSjheiXQSF+mGE7i6BlKMkV5U4yfZdpfGkzyBa2pzvIK+VfPsWCycc+ucBM1uuiw3+xDSqRD08B94FbQhlWH89QW/liqFx1IppFG5PvU3yFIa+FynLcHqWrETz+U+JqV2OYMQYJKqAadEcMja/usWoSBNxggmlcAkcBWl2Ik8gNT0soG4+zivwgNbH6Cqif2IDOVuHhv+pBuUohwIqKbLlvcZ3hTLeZX4FKqqS/copLya9DjmIhiD6VaPhyyQkCQWBuEAjdmRucx3wVEQZkGPN10/3aasIEUbouw8KxCqXDlx4g32y1zcLZJ55ymQySuakEIsI7zkKwVSXeiB+Ub+7OdJlusq/iVOT4kATUX3Plyqm/7AHS0CagrLUrslKOImFOKSZhRlvLTOaM7nDqs4tkmGonWsOWt5YLS+RuZ3opCtwrNatmEYU+mMRrs0/1nYIlXFFhaLCyTDV7VtuUlZOCGdbNMqU8EQbIUUyIVGO2jXwYOBTTtN5MveoszxT7qj0yna5+NIKqAT1MFA1FSggMXHLsvQHVMAmxlq0LwwkihFFDzjSyfCKGA7XFFXBlXKdMMkrhRlKhSxCNaBiiJQDFqeEmkeP0AV9V91qa5ES+MzcJuagxMCPLTanj/tYN4Cy8wWod0HGutUzgK+NTVEkxpFkBOcpoCUHCzVkXRRp8beMo05CVqAOxVmcV7slfrfJ7ccMbUNJUT1qVZgSBJYhA6M4CHnRPf7aQrao+8LgzqNK1bqViO0y7WTg+LyVbtA0hnVllCz2C+SycUi/mxX4YbAbPBBfvqWqFlYgaYc/aAtqiO2V1StqpkrzmHC7cZPWVCMS+4gq+8lOwoLQQaXTHehyJojsVWeIrtsRb1E6XszyfCpUAjkAKfWcm1cb24AE4me4LI6ABsPq8gb7qHZGAZ6G0EP0iccmvD3okx+Se66dyiCXIkDqREjnFNblCpJWplhsIwY86BXAXgg8mik4hMRAuXiUYWWYxEtqbBbclQqiB2oEtSSAlzZYbC1Sg6s/f/iNA0yJtdr1a/W/z/LawJLTxtsFTbiCUarz1V21a3n0ezsYlUufyIBC6szzozdu5bG0xG5SFh6Xa0fR80QmMbjmzah0yFkLDb821QkcB7oMWWUylMDsnQpR3n9Jjam7ebO4uZpZcwWzLNBdXiVgfO0xYY8Kbhh8oKVPEhsAX7g9ra12rQgesJBKpQzRyuXJAqEooD6cGbcOMXD4fXFWLrvnqjlTb8n/wCb4b3hBiRouDwajoIoKW+KqEbfpQDhA7/IPugpMJimyRDRIQoEv5O5xZ6oh2VnhKkT8wClVpaoSjGF4rufydSg1BYHEjYMmn58B4fzImWh4sZnSzDZ1nIO2
2eTBn+3Kpf7oI9JburKAn7tF8JkSAtTaNEOrLSIvSKDmHQsDGi3IlkPAa8F55dpAHqDu8MMI/RdUIYhD0w0nBBYYEWHOEl6iHoiMYhdUXiczVIkH8YMKpI+y9WQt+gxs5t9ojoFgIsweTYRKAKTDT/Ilpt6Dar12MKk+KTA4vjwBeDIqO1UzCeLWnQlI8FzyGuJnoFtiYBa6iRsgbusajNGGvkxkE+oOAm1xUkPag+CP2TfHboepVs/1YlFR+jnvhuiKrzHz8rgmu47q67ltz7kc9gwoRmgqxEpTdPd0MhPjqwdXdBaBbYLpp00KqtqZ6WHnctdPbAq4RI2LFhsesUwCoHp01VasajL6Hrcke7dajVabVDGZ0tGpPMIVdrisPt+smMY8I8FRYOagBQgj46OexJYOXdoflMxkCXB5ibjASq126AQGcRH5+fttkDIIEb4v4VjQFpeD5QoyswaGv0GxkKuZH6ycqdrgu5KtfuHPbdZVRj08301EP8UxiGkpJLFwEOFtxfQvERsdCDXdQVHs9s1pI3HAZORg8ftP27GnB5hMWXv5MSqRpz4A64jlgTiIOffnrbzXolzo9MVrO1BM4hAeXxw6PM/AFqznXE8ZXHt5yf1dtZlAVRChKzM493WUQ5leeXYKi/GXGJry65x6BGR3x3Os+JwWQ0apr+DigJzxXpqkjNoYkUaMF/JmhVUmNMa8Te24/Ho7yyizvOTmZW1yEnynoZNUmf74Q6K268x9Cnc9yImBVlJ+0hS3iQAktIkg8o62KEojjq9AZkZ74kFDT5bxQTu8VAowQw8Ag8dPNbzABOaECtymOFQ2mYayvr13zM1/omYsLJ2d3TdMFrk2rGbyWNE6GjZ939Il4hhhwPt9iSEK1Wnk/wxajXZlYF5NJnQUXHtCmGUr6tZp+VDHo8QWbddBsOJfLd+yQn7wNBVyLfFs1K+NEOd3NF2T6+QstZ/hrXExpnOKeMUAC/oTey2ntHEigUxY0EGDsFOBjzWZtvmC6BRBURrXtFHFLtF7L+Hnb680MvL3kXrKQBQSc3SWHmI+hIOqkmtS5iA4qaaGoS9CVsRaydM2ykAnisY9K9Neyu3a5bkIXbBSElygGwHaIw53KBV4eru5wtAKVoD070ek0ac3znKzAPrKNQTcztKOEB2wtPoUJoajFRw5Ula99QCB0pw+jMIttsJtZTWL8xXIqSsbq4pYpdMaDbxZbkKrnHAF2q8bXHJRoP+fX/78Lsg0iuNkJ32uPXZqieC9OTA2rWfuEbWPjxVHxok54dFyZeBibp0kqZK1rf8iqnOmyz83AhTwrxbkPxInjEJzFAyUHvlp13zQApr2F/AsUs1aAZiAmvZa7Uym6fjGr/yomjEE1LTGmtV+f1hIeKBZQUjmrT9OtixbO2ulDsZeJ7zLqPERYSGsYQoxn2PDX6drAJ4481WvCaMZ2JWDRBec1dtVObAmuKFtrajb+URTEJl5W+dU2S1wGNb6YjWaX+oJYWOVQNbg3tL/S0Ea/Kk04qc0w6yvu5fRaw2hzBMym7mfytjYrw/Neu1JV+Qn/Dq8ktQcH0LRneJQHauiOr4GGKnKpSe3m5Igv8EN3BqDr4dfQnR4Oyjib5ElkpmgK6OFO6K6qabzDmeO86iKqy3PNYih2ZQH1yaCbuGMVdPj5ojvcEJwdpRyArtbby2F9mfC2eI29ZLdIGs1+aDAxgEVBAoRzmZ2zxGpgoT2tfG111oiImSAbzGDdDQeWvS6rEs2oGjAJTn27UpnTW4rfVArMAJjoETWo65DqmsOqavivKDemHQfFWpzOVFcZUfPaQKchFdAbZFIglCEFkRP8Zqvj/Ds0BhoJCQdVss5LSQVso0WvlZZpNUDVaUeDVn/l1N8uzZJjb0beGRZa2iaiOBb8Iaydtc+nqxsCYk+3km6aSEz86OYIziP5WFeo/c7le3KUBGJCpX4CjKA98NYpXbrTrWSA7miVqsq9JQqw0R2/RzwDi0L
j5HdrGE4P0x182koO0z+CjQUNRn/4rMrpji+6Y42IQRHIWARO71Ri3avCwEShED4JSNqPY4Q8Ntnlkj+rCPSW7vx/8Wvu+HxmhgAFWBSwSbbZIVm4KvHgHs6cWf2L/izWiC/AfJdKMd3O2s7RGnJneZ6OPhcJUMbHr72VFPDI9pdNkqlMhVIpJt9RHw9cXz1eBTFoap1r0E3ZXd25rTYJJT2vrcz3uG/5ysi0E494dqe0/OVJaKodmzz6LferekzfhbCIhDBHF4reYlmYPVdHd8RYoJUKs4jMW62mMa0nElS/hKQgH7ZH0lOEqTUPCSgVpOXMIMGOMpxOtLrQBgpaxZ4hGU0HtVsB1sIHZz0jsz2tS+g15Nl7iwMILZYLOB0tIF0QVlEWXic0hR3l3EFi0Cm23G+24kscZUeRPOfy9eiv040vbkGPkUaArDNAlaTdPy0qxdf28ZPXnfbVtfjvhJ7IEcXMMKPI7iKjgOq1YiMSziXUVQGDK6FtJlTUHdoJqgoomYgO/YyOosEUKfAO1KnBwGmZqnLztK+VaDkKV1pfqCyUNnwRaxwoP/C1TvFL0U13O1HN3W4QjaZtpRC77o+ue66SuBTfq/Wq8v2sqEFGyv5YfkREHSTY6LjVHQWIdbLqNHy8b1CdrNruJZIOAv9BwL2STxCYPQTID7UubMQlzHSZZ/NUNyS6wx7Uw9EpLLRVbAhHnW4yJ9zBo7AkNE/Y2g6ADWBXWKARKgsPY7150ROzfCueyxbzi+Hw3CTX1048Is1FMPA1oAserMIaSCYi1n3levDVhWq6X03qzoblsGcW61mWrx5GqNwKLkRKwS14BGrjxDp3Of/qLALRnXbDigNLp9BuGJbnSAGZjIrL0SdaDFl39+pqiU2kLIrRX1KQIBtdaC3UBePSvk490VV3jGPFGHVVB3pP83BpuRAf03p3QpOmXKs7+5/s0mx8xfdQbpRnp5UsZompsJcQQOkMkHw0CyZNBZHDfKI7LC4moUmkWZloAX5W8bA4k1aVEEW00MhyQrHoLabH+KKzTqyPmCpxMHXUvgwmQtgSENxs5QJbprpjx2pcgfl3c9qVStsoMUJ6/ViQVDDWGMHK6k6/DhNrDcNoqwGu4g5xG4sTMrL66+p+j7x+qAy2JK2P+qtr/H0SNTXnrVMD5icf28OivIxzhGO0wmvcdYaS9802V7gySkfuUpWfWHsL2P9B0/mHW/PK6YiHgGw8lSRWTjTjhQq7ZyTaGUZQO82OnIhdtfwkeoJAb9Wd0J2e3CELtRnmr2bkAifFUdpO0MNUT4RSMvlmZjwgtRCxAhWRDIW9/0hmcQuFPXA9iGkSWAjWwoQLomymjqfD87q22GF1FHCIWbJAw7nkE0bOg9IhT1irSEqkGUYT6yqe4afoeYowKeOJqXKNl/ZQrvhHa1j4KfhENMnDlFTA8nFaee6X08F0UwQoplVXGaA7jJDeedYzcjiTObFiOkUkEJGgweJ2W9+rhhn/nZDuaDYDpg3In78qN0s2Yy77rXmkhbqiJhmm7tVrp6gaDiYZyKrqFphBGt2hphRVwmUxTnKLkBQerhosaNfu2+4EPguMjcKEk3HJ1eXIEkaHEoNeTGbelDHE+Cha5m5hnokBTicDsNPojl2kxczqbw0iHNxL7CjCgYniEOgRxLATppTKKPAZpXY5aNROlcbdPVBvzMDCcY665zWsOBzXFTec28ydr8HFL91s7k86B7JLwNAkshkejGsaF+KEZnTDaIYRRrVdFzKIvkEEGrLlKjJBV93xyyKB8DbK1NpumKAGC/fxMbh+nmqQ9mNRG5z9fFQo0y/CunQQQU8mZqPvWJG4bz8uNzlk6gWCwy2UA0a1GdkJjyZzSSEQurOkhnsJddaz2xPWo5YhtwcGy1E2DGnAJIRtil7ykHWUvfFkZ9J8RVlQATB5kgqzqLci1I8ESzBp8zRngRRgV9TvyevpjPogKDJNwRWWqI/rTsUq282Z3TX/tqS
26nE6s+cVS+VZ0DY5wjXqVVNMFNYyTHfUgAqMoDvMG8tHR+E/kqhGssfoDgMJhK44VEdn9ncyusN6mQ0jFiJj1EyzgSHEtJk6xTzX5Yya/Nq+UswKZmCw+MVqIi6gB0kqWqm8Q5rd9uaeeoPVg3XRHuDm0kQO58rk9xGGwiOjDWyqTFKEvRtQZJ6mcgmVBMLQKlMfPTIow1dn9asA0uwWcluy/XQRJZlw5MNwEB7clu1cF6LEyHQ/VCabzaIbdx6xWmFEbaJVGEc3GAEDWS/iDkncC8gInBu7bj9EAdlyr8p0yIirVkmaipySguQoIy2zCksbx9aq4QSmqMyAtOarUW63vYTf3UDmcFXTyik1qxDDzDDIFv09rXpSeKkhELqz1EZ8qfTXI5ucYEauw3R+Jqdmlh79rGk5DgoLz3c2TMSGr0yskkwC+ZrYUwUq1NGJZtXsU2XWK0LVYzbcJJ86VH9dmu+pmzNhmjEwt9ZCNYtoqbdI4jo4ASKFb/F6mPI6F92hn0twiMyA7pgWcwrU0rzSCVTl0hpZmWQDTo3WSADyhpRxbZlTT3BSEGlaeaoJlolMyOHvYPIlcE2eKSoFi0VXMFOv8pqK/xEk8AkLiNAR+RgqL4+IEHICQNjpKkwzMGSkmvo6rb+MMbaB32CcTQDQa1RDXKpmFEVWp3AiA8rTxL/pb7mBZFaOm8EN05rUbQPb7yhvjk8xA+Ab326ZpKeLADLqZubqJcUJW+6KRtOtKuWXDgKhO0tnrJdWTyejO1CgZDC0DY5husPG+xD5ecQUK7rDmGE8LGtFvYgbQJJcxbZGMkuKMJWXTxmqygkGDJuwBpm140i7aEswrggHdxW+xZtTThw+DubWfF2MAi4lzoDy77EuBIQkgATwX+BD1BFqkKuz2Y46XdfUzFtBB+JA0UKRH0WhcDjSFI3EvmqkiMpk0Z1u7YxiIkm7GgP9wKFa+dxaO60EZ1y9uA2rwH54T1y6fBxtATB5QFPhU3Sz1a9h+kh2ahyCOMGVQ94or00riUaUT6flJLEUEHBToY/+LoXOpo9jQSB0ZywwppLeIYCIEAzIBgwnD4K4TgxGK5Eb/gsm07y8ljSjPtYQ8VbIYfjLy+N0meQW9rXWmGAeTrc8R2yE9boYA89R0QvSgugKl7AEFxfhXik4+MW4XXgu+EoqPGIYJoxKVRiPBbEISsUYKSb6oVYaC1zQAMKSuSxRhGNLp/g1eMrwofKVYBW8ORaqFI3DIdAvObgUTUi4T10XAoI5HCIwVA7tAX/SYCgVt2stRC/kC6RtOdNNWFRMYFP/dE9M+SAQBILA2BHoLd1ZQVdNLvMJAjNDgOHnmhEl6nRrl4gH5BDxCoJ4kIPKFAsi8lRaNChXBS3EXSeGV0mZuIJl5DItJMZ7hEr4i7XUohL+DrwBKxLnoTBFh4rD/6J+X+tDDRImSXXnsrkqb8z/ebu0FulxlTFXvdzVAQfRFFchgnu5K0sFfUGAoCL6R2tEI/lpzLhZfmt8hQixn9h0KxFTbxogMJ+oyd053dNTfmkiYELbz3dmhe4szRtybL1Gd/j1KTH0DI/UWkLsSU2uEJPrUdsyx3bJ+aio6I4re/ojZPPRhFxzgSGAhXvu+wlYVk2SNOXlMbRMzKrsqfREeRsu2/UA5yBJTuWUCcvYtoc46gY2SZiwwIhMSx3FtFEfrRsgtY4omUNBoCHQW7rzn/2v8gkCy4kAOccctLiOqkxGF9nDkftMHIxZcu3Lt5xw5fTlR4DyIZCZP1EENMI9YYW8ioKjMW9BVwKzJiwzg0wMRsC7iBYypNteeBa6rx4ym8VrvIr8quQQQdncrPKtEkd3BIphMOLMKKBTuSg1kWjHl+pCUyk/WRlMxVL2KaqSugDYJuRov4WTos0mqzz5QWAhIeCnm08QmDECnsWW6Xoot6jYGVeVE6eLAPAtoiY4DZ/IstZC6OFDiyPn9a9/veesbQLQAnyidUqEO9dqfcXChUaJ96o
3LbQyy5lQLUJQS/SFebn/VYgKU27kW83nonUJUWK18L6+eoWFGK8RV3eiNfltpb0F6sTReic5kaZO5PlFqmyN46u/yFZtoOCrOH2hb3JoOb7igsLd+JrtPuBWqdP9pdYoI/DcX67kyhd55l4SpI8YuaNacJtlcXCuXSFaDUkEgREIePgUB6qNJEaUnONDV5vj6+VyQaDnCAisFjnUNQ+9bTD7ZNG4J4vtcbuNtEuKzNrMppu/aNL8RNbHYTbcpgiEjZhrQyBUwNJ68WHCyYV8Fe1g7/EDuwoNhLHbF0CUdwvxRjKoKYLZmXbL3UXQD5TvoudorVMT/95eLW7BHbGnu1rbQjwB++1EUtOEGxdpm+uKbxOgprUElWIz6A5nlk2hiIu4VC1FtHGDwUVKVItLUZiswJe2w7JK+BGcXvTOPewr7qJ822jKGgLLBm1/7JB1A+qh6Djdmj6XEF1nLwMcq20FFLrThi+JKSLQW7oz8wg4P6F8gsC4ELCdoGXS9vkQDDSuOieshxLACFlsVXGgw2VsNGdNloDo4UNzlmN/oHq9wOgrWr5uqRcfoo0ZuyWtNSMk1Ma73fzlTBMwCBvdSvhlmMlujrSHnRVtta1AHeJ1MrKWx++777728hkoP4OvRAsUQUyM8Fv7A6nfRz0Mv/gY9xLCR/OoHQ3EkImap5HQWigx2IySSIm3b9qCiL+GMClH8BkYUQT3ho2LMA9LAidrG67jLhJbY98BXatiqIybCg/jPpODitkUwFgAzUpDA2oxv8YM14l2GC+R5lYFWhtolwScQ0s4hf1VidbasAC79dUhcfoS6sGQ+Mjq7Ztm0riXwCDdL5era/lquWL3ingMz5olkw5hdXaBqjhosaVYI33I2xts3dQ9JekgsEgQmCJfS7EgMHsImGebLgsR8LfWnM/etdgnGr45scXhE16FyfTbbrPhCcuMMVN7arm+raJVa/JNojDFF/xBn2C62r58k13U1kF2RGxH8QxTdvP+7uJ2a8rspsN+8+w0hdlmRSyfTKEt2IMaXBRJskcAp4z1OHZD1h75Nv1jtplhDMPOwnUt8oB9rtlau0hLVyaySIowlCijllQmtsGWs8G0inplR+XP+C8bjyK0072EwXsVfIWYset2XKaWo1kSXDyO1suYdLN2PyJ+eFtF2w/TAjdcrdU8WQIlIupgTtCo8uB1rtHUGJV7fTc+hLujpBgGAmETZ+RssvtKA4xaeeVqu068hzfKibUtZFdlUS3eZshcvXnunAt2QUUih+yc2VpOvurezzZHMLLl7WplJDTb21LLTUajirrTBSfpaSEQdccvLp8gcDUKARRMfMvAFCJm5Cas7L0HvelmZYoe6Jbxe/O1JrX+SqvKR6I+NHmfBjE2w/YwBq5VmSbZ7ATvjzBqtbWSAwnL6eUw85747XQvEGDs8SSHbMqnkSUJDJzrK0No6Za3E5ipe4kY/0XJSBppbyG8xNJ6XpI60SElUROMgfYgUyPtB8iOmtPjBzY/9I6C4auMyAEOpwYKxaPRirkuFw8Dz4uhSZXvRZKYDZGD4dQ7QGEqICJC6Cnqg8R4lZLCmoSveNcSumNzRTkGS/Cv3QSwQ/aVsa86tZZnzVGigi5UpjdDwYTnCO8BQmWO8S9ZYsSA6lcFzvNzuWiVNLJ0JjRII41C3VeOkspasydroQqpO1iUau3bRHThAHUrWsSkJcbdfaiAkaWvEI3cBhL2yaQYOWvCao27DZn8dVRV/o5YN441YlH15paSoIiRkAe7iCIvIFOVGrTTXed3JK2DVTn+pO80OZ4+dzi2Z8QRRGOkhUaHjOTSMqsGhSX8HCS6vy+Z+QSBhYVAnFkLa7z62FpKOy8AGZxs0G0fY+lhSjCvTA9xFEFQs+kmTmCKyfixPU7EAMyVbRXIIJliKs/em6rSYEw3TXPlWGVDcqgNAPkvHGLCvfFAPfWx8aDXKdTEQnu23HJL74JAF5olVhVjY85tnj2hT6HbePa
e2bOkhQ2TTwDQnYqJsaCXSZhsObEWmojjAUgDYgSTqsEbK5lVtEloBXIjRFS14jBYKSBYbOwtGXLYS+qO5nmlg8k9aUeF3YYtM80rIfKDwMDUVWFt0BhCEUaFvng/tnxyiOEAOP7HfGI8gmCQEsJSaWwMqvdOMJkKi2Bltq0/ohKV70aQh/pVSxnCLEWoKKbN8Hch/WJf22IouHE24ZEs8TKRrzZP/S+bzROkC1M/RUkkAPkwNCgCVaaowBRr4P1xilHGKuDpfsA5DKu7EdT4BHVEVbxpblp3KT+aktrp3naTT3gV7cc+qTV+MggTJHEp8piGEeHaKaSpSguv8TtqzNW1jJ1FanXn1I9IHwUp83A5xV3k1jIKAnRoPxrmh8B97LdjWGl+AoOwYYNeA4ccO4uz0ut+jZrAIHqSjtTV8zcILEgEPGrzCQIzRkB0p8e91TFMoycpV4UnqdpM+ukBnr84Cmsnx+yQcVWMwODh7qhHqnxHnU7dIdoT5D1tZZIWKA3l1vG8ZlA9u9kSRMfPzBTWWQywHF9Fcngiqw25sdOg09m/Wjgjk2FgmIVxECG8CLOuyOqoU3r40xX/9YUhN6lVjMLBZkiYzQv2xAOGz5Wj48icBJNvAr3ppptSUMqNUk4W9E6bkTBldEHvqDuImvUyVaGGaV6BVjmj/w44s6owccXbISqtKiC010RUJmTYOWYS7wGjGCB81CFUrPkTjdcRRxwhEycjF7G+xhf1kVOqD5MpbcgAhcyhWQwntGX6S3IAnY/R1CmXEHfCojvaPgKECQnt6xQT1BRjQUZyXb4k1NZVnFtuIBIFHmCPbHRTPpqFubp/DATkiSJKaglOgJWS0PCeOh0ddA/TxlTofSOTNYYshDpY/V7b4VSosu5z7WEPBDBck7TD3aZ5sK2wYjwG2Z2sTvluTg1GKZCMKkYPM0CosK+GAD3FL+sQHmxMkc76qncGFAtXvpVxiJvYWYbYPdB+XHVK/gaBWUKgt86s/8xC8gkCM0aAaGGJLDPjcSxUk8TCzqmNOTEvZB3NR0VamOibAXv6M8MlFTAPNQlmbExkPZTZnmoGw8w+iRexJoWRIFfQORgSoSQe8YiCAIUqKVMCXTB7xplQEHUS3mn1GI/ZqmAOfAV7QDXYyIGwzapkxF/2XnRLTf0FNzAeFYjKho3wNYyo0CFsDwWsOjEMNttUHtcBCyWsgkxbDdVsBrjlTCXBFuos4lKFywyD17SeUcQ8YEKvYvjZS+MCNPgjXqgDhAkeKIgmGVMURz3EDP47VJU8UJGt6KnREdLLA4LyqgfPQ2X46chUlAbUh1DERYIMMfN21ibsyTTc3S5gBnik63Yzl5lGN9EC1eqp+w23qJ1v1Iam4M0uqtfycSwSnWbrOwEGkyvHFjJKCNEqMUYWb+u1izqkee5MmAy0s9skFeq+G8DwqaQAIeoIRmY/ZLrfHJWWCXbKmdNxKTndegbSmo2vG5SW370ZgF+bhlMZIY/TYF1exlKFtd+AthNbAsX3aV+TCAJLGoFZ4nepdukgwFozD/pr6w6/JWaPBeJsIqTbFkVQiGl020qkqzo0iPAk0/H2FStSTwW6MlTWv2AzdfQOd7gD69hK1iobjiEciE8N9/Kp9Tgsq2p5HFhiMoOIEySDXORc01z1i6hgFFtVLVHqDqMiR7QKa9GK4QFO1KNWeDjhdRZqZjJ5uxhXIoToXV+5CZgrTbVACSDiijjpEAIWTu94iDRPO6tCBph+oBg1C9vQo+ELyRF6QjkAtYgTCZeWieRhadqJCzL/JRXgo1xUTC+LrmEQVlJ3nEVF4+NjOykcihkgOciidewSOBBgWXe0z1ftqVBfp2NU+kJE4W1pMgOs8FQXNXwkB8V8QMo8y3RuaUiV768V4+3eaJlLMIGDgpcyhJaN7j5OSUIziKOL5WgQmC8Eou54/OazJBAwx+VeoV54KAuD1WfKgcgAQR5T7D/jbW7NA8JXQp/nRjFddm699tw
kWFV8YXJM5f0VqWOazmq6NB1IggaD2VAa6BbkfcabkSBsmO6zuBU3wyrjUsMb8pZsQ8wgUdAPSgaolgsnIlGYW9fXCf8y7drjEkQpjxuePlzHhbjA7IyHVWAnxCFsxulcRSWQ6CBFqsWx6j5lRQEYIk8TXkgmTxlpwStaXQhZ4T2RKfKjzkU6cRTSl0y8R8MU64pSulOvPm31W9hMDillwl/usDpUrqtWrBIkFrRsIJPaMYwPpuUzULK+ooMT5i+1TG5foWzQQ+hH9x2hN6wj7orRp+doEFi6CMwXAcx1Fw0CdBTWGplg5wQ91GQdv2GGGXW8hF0UFKm/NBi/NPae66Fia/AANp6wIZqHJkF7IHsoabJLRKFGYE4lWshEbmgPZsAidUr7EaXB0UN7oDHgJVxgXEUMNoZBSuGnED1KRGlxNoQKAgYaxAkyEMui/vrQJJAAC4YF1iAu3cgS1ARF6OZcddL//mt5Ld7+X1ZSQWDhIOAmNzv3C5r7JtNiPQE8MXju0PS5b0CuOBYEeqvu/MfBvHS5Xno+DgTE7mAVljJ5VCEirUoyjGAXVAMHwkK4OQTiVKAD4YEaz1kjE1siBYly8IT1V4iDmW6rpCcJcdY6Qu/RwRY51JO2pRmzjYCHJNXQben+nO1rTbH+apKfDD/yFE+ZSjGOaeogYU+0Fv+y+KepnDWuMsRXDwqh5dA2BTKNUbN1lGY+tXv4uC6UemYVAZEAtG2XIGMLuJzVa02r8oQqTwuuFJ4AAetHzMYsuC0q00oM3OgUIAvO29FKyBTgMpDZw68ir2lCXELhOj0cneVpkkcz/Y/3s8LeJ6yKo7AUSpHXExaY1UwL+7XNIoDuVeiUIq/FTg3vat0tNq209YPmLULURUATWRGOaZ0+rcIc035QQuY55pxoEwR6qmgwAfs8ekiPuZDYO1MpsWgEV5KwYuK4ebqndaEUDgINgRVbKokgMAMEeHYsc8VahFjO4PSFcoo1zMKWrTBfKA2e7XYS6jgQzbyHL2T5vRBywh7twVHePQFPnJKWbfN4dstzZYrUFottT5pu/lym7WJgIyLOzREXxeMtUJ9V8z/i6gLSSafi2UEq0FtJCHOYigarpfXtXBHoIrR4kyuHHmkPZaMg1F0sOcmklRxOWOlGuXSKsbC03t8SWa1tFF9lMmOwyhVg6PF+3moh/3jJaL4FW25l0fromo2v6rpIFcFGnaY6NfSuJVqfE4TX2y2hciXB7kenSdieLpMKpru70nA3k7OUEYi6s5RHfwx9F4UjwMUjqVZWj6HGVDFNBBgMNpsDQqjT3ET+ckGKxba+uvYTGmivpfX8IHYSEnsrVJxa4K+GidaiSXQLWxpmZZYVfF0nkZV9TDtnSrfk7KVtoGcnpBE+fZIDFw8LXevAtURhah+0kQ+x5xI0iWoh6i84jD6hy5YlEj5FkjHSvLcKW2Mo8kxJPxnTA+bcr8bpiGOdLhMvMZRkD1FiMmuBoemEcHsL95h8a84JIbbMdmkMsqkdiE5xRyvgHBJ1juVY7YVV4GqaoRL5AxJsXddfETNEHVe3rYAy5BZfBdPwbXFMiGZTwIAaLNH9VhRiMHxPWl7yTKunmzCUIoF0CpupfagdtUbPR6aoOwE6PhqMNrk37Edga1AV1nAgVTazALuLes7oCGS69ScdBKaHgBsrnyAQBBY0AuiFnz2LMpZemK/bMGmZVU24vSGjziSX72+gBqFduEXLRIN86D0m7i3TCjsR7uwcQ2vpnCVmdh5ieu1xgFIrL1ad9ZVup3QTBAwUig0W/y6M3SHMyfo1GxbbH4hDSvS6CHT5NA/G1SonZht02EC3nm5aUDy6oIy2VT4Kwu66BFJi6Zx4fLqLQ8gEq1xr+5XHTuw4JYerl4HH/yzT4zirkpb0a4lMp+ujTGsJcRT8AC8UkVOZuKwNdbANXbOZgmL1qSV1tdOPHMW0p3bRtIwOU4GVfMoQPkTjkbb3ld0Q6vQJ/yJqmo12tKP
aj474yp2EnaBTdcjyRoxnmSH5EHC6rRxwJoKf0dRsRBBLq3rK322s66sRqV7XV3/dHuhdbQLZMpPoOQK9DVWOM8sPPJ8gMIiAaS47N5g739895pg9xr7bEKbIon1WVqLlE2D4GQdKtqPdhP11WEEREpVJmyEksGecCzwLtdWQQx5hCIQZdm0uIIeGIfDFajuZyESdzpyXWTVlr5zJ/mJF5vSCZmo3gSrmcmwb4y1gxYeB5KBRIQIkQWhRwFmTzfKZf3yIGMC+0j84Ynh2UBw7F2AkouDFixAMXEvL2XUag+77CtjJ2kmPoabQnxqYNjWwDJD5t98x+61t9WIp+ChG8RJ0wu+J6ADHukLUyoaB9pBEF2pTKJ4ju2ISS8TfyKH9uDrk0QtCkW0IbMFQ0SpCdKkp+JavxbombCeWCXn3gKMifDFR73yQRjFpSyXA4D0AnPD0yTLhrEmO4hxoWUNAwq4H1cLJzpWPgfFGgYWIhR7ZMQvjBFd5OUecOOGh4Tt/wmLJDAKTIRC6MxkyyV9CCHAEcEMwqNVnptHEnRk2KTfnrn16HDLzttyM3SI8jDCQSirAmPl0ORN7ZqLPbIhjqNn5ZBDbz4axpHNwo9R12SqOCdaC8SAMiNuQz/azcCQKfMUM3rxZJq5gcs8ZhIXssMMOk11Cvshx1WI2KlRYTIbM8rb4K66Ct8IhmSpXWLwFo8VUV52MFpvtRJVw3NTmjSwcQYV97XKvKj/w16Y+LDQXCVPdDlGVpHmXyBIvfelL1UbwADiTT3vQWWoKKtPKDyQUsJURyaR2vFSzbRKtp+MiwQh91Fl7CIER03I6nG11M10DbOfoUia4aSw8rD2l8CedAprKuYRq96OBFvrK9uNJro5+oTXoDv2DLdfNGkE7MuBJbckVsjXFoDFkTv2YrvFCCqU5HLuOwuHGdHPq9G4OumPc3cNIoR9Ic5WqFv2tF1Y0DtQ9sdJeSaab9qeoHaGIW9xkTtQ1PMyd4yYBmmpxRMyPrGUsQNpCwv0MQWr03Ql77LFHvZ52+ELJCQJTQqDnsliat3AR8FhneoeFaM4Ljy1PutY1T0wrQUzEHZL2cSILpABjwGwId5XmLCDs4x/MPJHcrn1VA7JiQk8DwEXkeDQzzFz+eIAgWQGYyIF8FdItzMvNzmkPcuwiqJgdBT1q/VpUUheqp61Hc5kK1kth4ZMexCQNJT2s9U7mhB9PbWX22WcfJ+IoVYZ/R6aaGXKJ7t7QA5Xw+BADvOSIYbY+RYItrPcJsNYKYzPm3BLspapswyitC7ojwcXAStFOpEf4aByFj9NZWchYAiOgRCZvgky6iHT7qFMcBpvEetU7pxxqzixQMGCYWSuPlomubV8rMeDMqkyWTOBRK0mecXX8suVUgmkU0UwdQeMGDnW/aqTT8Q+iggS26qgONjcfbDE2mcalrksD4/yiJHXrGUi7G90JdXfVIbTMTgp1ixpKApJ8pNOCI4niDW5LaYMoaKaCiI1RZZLNuORoQvxZ7rrCih6DqPkhuL1xqdZmN7Y2o85uftG+6vTbERWkg67ovkI+ZNp40IXc7W4VVFWOnaXwJy0nYvnqt6AA5Ul6+EPuQq8VNr4YG7qmDI2zvYPCKrA6C56GG010m2Fm9ZMZrlCOwCzsU+90Vvv9rquYe4wc6FfJw1U5ukM1NGqu6xD3ZasQe6OQVXxVy0yizwhgtG5On3oK9aep/5kO5hMEZowAA0MLYfCQA89Wzz5zOCGZ3Pb4AdPuSY1bsNAu4bmsvKcwRcGDklvBE5PAztaaOyrpb8UZsIKkFFEXXCqemCboHnlskqo8Z031ah5cdTrKvnpYm8TzBbAZZTlYTZv5mlyKaTBJ9dj1zGWz2XKRp56q4jZQE01lk1gdFIod8tg1t65lL76ia4yEC3kEuwRTpP1Mu5ZPBhooyDNqY7y1gVHUWkoDo2jyylZZ+TKwpqZVhdtRWZTxYR5ci6rBu8E8WxlUPAzPqPJoELvLlrT
TJRAylA4tEOta5btHu2kIUJvQKY3xSOoum2K2gd8KQx5RoJQU/2v5s5QwoUdqCWCsoI6AGoMh2BBFRi+hYqcBgsa5IbXNLeRE4oSBQAJIVoYPyWDF3WnuT1E46IWRxYD1sRSRgU4Rq+h88HF1cpRVQiiyelB2j3Uyj9ATw4Ti0Lfcpdi2m4quhhOoSuMNvdds+YpXUcJk4ql+Ne5t94a7xb0nE79BZzFypyDWBr1aojbUmahmZCvTFd2l2qwvyK5bhXxCO1Q5guuGNxNwrvIu5I7SeF/la7MrVrUDf91USJIWorZoZd3zfragK5rSVkXx4vnN1p1QTR2oqn0l2/i4OmFGj1q+X7ffY/sqYXR8ujktrReWd7WvSQSBmSPgXswnCMwYAcTFzXfggQd6fuEcLE3FIZqlsdAMSS1bJci7hMmc5z7Sg0OgRB6aMk1PRQYw6kyUGWrFKnKOlACDP6nQbjfVQo9jl2MIW4OZahditEw60RFTbbZHGVoFccW5jIH6MSo8TLQBPkQnEBLhWioR5MvIVW1MlwdrpQdUB5lMnZI2XnO6zrapapXv/mWfTMSLHJgct0Om5ria5z5nFlhafjdRs3aWiTKBf1AUIKMAYBnFKsl+A1xalAZuV5l6x1JKgwK8WB1myblTprrKDPzF/you1SwcRAZCAegxzGpj5h3FBVlQpk6XHWXLdYHplcYe8EXW1MtTAY5KymTanUjIscO14a5Ve7WSyzBBQ211M7hzjKbraiRAUJM6HWVxCcMEauzQXSHfR/1YbKUn+4u+oCPchT7cXiyokoaD8TayhBxEGWtERuVjvaJhNAnIoMaBJqyWMKO/4pQ1QALFUczoYD+UEmE3zvXRU+NiOos6Q75V5RQ3m3ueTtkykwgCixiBqDueh/ksQgSEVrBY5tCe6e5ysrm0GSHzQHUobd8UkN6AvrCCZr1eocWIMgk1gzStRFNQEyq62SS9AUxUmZqgIwdmrk3PGECQFMSDYxtWS2/QGpZSNEmd2EoywKypdrJtaIS/crTN3LqVGZ3wYNIFbWah9cu1WD7Ta5nDJ9qehJdEN9VPPmH1q4wTec3IRayvdrJ/w+fKYaEpBIiI2b+gTsoWM0x4gBUwUS4Gm00FNe7lKqb4Yln0GsWxokom02v2j8yZx5sZs/oTXkgmm/2c5zxH3AllAqEpKimYBpeSWfGtBsVry/jOlNRyFMdw08bQVpVzzRgdPIZ+VmE3Wliewde97nXMPw3PLcFHCQqaAeTRL3QKK9VI41IuIUSkdrUhg+kyCoLRYjzdMB39qhZO1h350PAZKIAr+1SmQWxHyVdTiQXRBR6idlYl3OQtB7WtNI7YMluCTKX7BrEknJafRBAIAnOMQJxZcwz4Yr4cWR7/wFfMsE3WaQ+MmZm96TtrTadhJFg1i4D4AmqrGHBQxYlAqIPwXuEFnBdMr0yeKdIFYmEvOxyCujCMHf7EMUQpwSRYSqGRam7OHe1xImagAYJgVG6qzV8gTIFFxxtYUwKGyAwzb3qAixIkmGQWHeFA0fyl4ghxFbOCo9D5sQHyCYuu8uH2yNF98g9mgJ2o0KXpClYJVZddhYklIwlGmfB0mcqjblrIu1fxwjKxCvhgCWQz7ESzsUYkA79BdDC8cpPTRWBOIVMDlsbElhtxwmtRm6BBn9AXDWvW2hDghUZE3yl2zsV+tNlVAA4ElTsFg3E56PGhYJx1CQXIUc5VALMsn6PKy73SbcYIBGpdfRUmrvi4W8hCtKVuDQsijb9yw40YhQXRizQyCCwCBFbwTFwE3UgX5gsBfIJVEwlhnw+yBEuPu/DIiG5BQZAVhtM0neU2O6fi4EDMKlbEVCMQwm4svhAMwR5wBwhG5goxn+aL4SixA5v4HgEKSAOOgmfIQU1M1jmbCCe1FNbaY1IBBFRO1RAiQ9LAURhm8RnCOZUU4oAGIUYMp/bQD5RUM17CqKtWkIHYC5YJB2KzNUPgpF8H1UrkAeNNq9AG9lt
5EkLxgAlhd5ZIF1wNx5qwQDKnjgBOxodlgNCjEpCmfm5KBoEgMPcIeE7WnNMcbOBVQnPfmO4VQ3e6aCQ9bQRILwImOD6IIiJnqTike74PngIRIRQaISxkEjIAyaciOvEGAgYCQZbAWqg+EhXPS9eR45B2WKiFrziRRkIiKicUgoJw8H0oYI10lTT7pzfIcQk1ECFqeRGmolWkBdyFKlO+M7Em6izVYdq9zQlzjoB7w5ARBY3anF98Hi5ItyOPodQ4d4sWF+WGiNPMfJD7clCaHrjPu261CZsLQObH/T+ZR3jCs5IZBGaMQG/pztXMRPMJAkEgCPQWAewW352X5hEXCUvciHNzdcROkBlXKecpFyEHqOsi/WRC5L7ELYKlTDMH+iLqs8yGCQxnt1pI+zLLDxQQRCVEbG7irDEzsV+G2xSommF6U0NvDgMHkyL5Zi+EXgmOVJ5xcnIVNtFyOmdxfe1mkpM5bbv5Sc8eAglVnjFTzIlBIAgsAAQIbHQ+chobLAZ5LC1m+SypE2DEn2VHvrHUOa1KRHFZ8cflyic7rRNnVpjBthMPlmMTgTLz6mHU+XZJPkLXZRI7ZbJV/AVk1GVeSFC89YnDfkCxZQRUIedVA2e0ACwaKsANInetvltzwP8r4k17uJgNh6htstPARZESAemUJ6KUs9wJ1FNB7k5BPpyuPOW19hSQJkpZv6a8Ym6VikDHqJyC1Ylk5/XmjBb/h2vyettGS4y58DXgWKyH4vCScDrLxG/g45ZzIW3DC2m6QumxRhdCjATIS4vJI5W1dQMD7c/XpYLA7FG81BwEgsDSQYAyISLKc7N28RlLx83yrRJnhhm/4QrZsAnzh0tavsejWtrA8NHRORVnPVyG3mD1HIs7fGggB6uw/RI+QSnBJBx1Iuoggb7wSbHK0ngViw5ANhuzqY0SLOyqja0tvhNeVkvouYNtfiNODmVp1+JKttROJBynFVGq8q1/NCiIhXi1bkmEwHo6TjGqjxM1BhuotX5VHsVUnmcNvUCwbPyD5RCWJpR5QGGjBJyjlsUhprzMljGqQbC8Xaz4tdVv5WC1QTHrFi3jF2bnPVmV6XSR+NJC6NA4CYjhQzi0NOggg35VYeoXBLS8vmqA00V3oT5A4Lkj5zgEDW5QaYSscqp8/s4qAr1VdyZeXeLGyicILHEE7L9n1fQSB2F099ldC/KrjCVprDirU2+AGn1i96gnb/erdD2LJYRbmd+b7pMNfK2/VZjdZfPsrUf2QHpcuvKH/wrtMo7aKXbeejex6lXGxjliy3zwjOGzWo5oMLtlEga6JQXTMNiICDaAUrTCwwklWXoag22NeKkQCLbZPgs2F1CYp0klxRKwOiZcJq5jeV3FuZNbKuZGpq2qrNRTwHYAot+YcCJHuyIZhopDBlOJYrX7FJYDGfwDoWklLfUXhm9ZADcQKkZS0kd8Bb2gtShvj00kTHmUyJoDDUZ3kEsqC5Gm1dMSyI0eQVjNVgZgLSXzWLKg+6QaEFF0irI4C4OhltmaAUWu8DuZNBjijYWWTsGQ5CBDAraGbw+HNAktay/tQmQBq9luFSAYFBdVTJw7nyCiZhsFPZWTz5JGYFZZXioPAnOAAPHcxI61sDZ7XJfzaLaazDSxTSjHVfNAPWwtT4EudPPN6Sn8M1MjuvUMpIkl9ADr2Cufm8MUnKOEJZNg/Ep74Bfw1S58o8MdLHZDccy/GR5Wud7pYd011w96AT29YJLrWuiCIHQr7CoehW2GLfvkqMVxThfeKM1MskyUDHaarWXUZVI18B6WuDSJkjRs+cNI25aJfkAUGXjrRV20/oLR8hAkQE/1S8yvfHqDeT+La8EgG0lR6J7STeuIHYYYbN1xIX8dtUMBBUIEiWYDwcrE7indtHuSgSE5yASRXaElvAOrQnCkbXyMD9UpBkJhywzra/1lyGVidd1MaQKMt1ZVJlUDvNxGvmI2btruzeP9D1YMVEk0CPIEG++msOTQisgauFr
kX+oRatJEl1rzqGF1+mR/cUElBd+gOxqjwXDGe+wV6X4zZPQtqzidTg9DYa3idPtpVbuQX5wxJfm4fyx9qLdtGJp6OcmAuqPxXGOtMTpLmrJdAp1M342Rv5QzTBHBdT+burhbeL7aKUnMHgK9VXey744fZj7zjIAfnumaRzDVfQZN8VxjyUz1ymE/gxqGT2GYTVW1pybZrYB2Slf8RGVqvKctosAkV74y5rXSnrZskjk9s62wfARCYU1lQeX4atNCj3i7SJvmMvbsjYgH0Qnssa2GZKpfJhOlTmKGar0aQp11dWaebMByIxau0vYcqqPdv+QEO+WIscBIVOUQK8gMqBmxcDnUgZ1jFWwBwIproYX9pvWTbe5HFWCnSQIUheq+Op3OPvGSOJH6gveQDWyyR4Dh5NJO1t2+hZDB8Bgnp9hWwOVQB2mmEZNAhoCgvLm+TCXpGSrEbMgkdB2Z5ARqjXHHVHwd8YEYOgV8akErphKeDpsJwRxoXluh8na0mwCLUUC5BIVA2EaRusMvI0evSUes+whtCc2qxYnqN/S4kcqZ87qRpPVUhd0rTjGt5TYoquAbLaFeIKDiVwyBm9bRCesxBJQVJNte2IYP/4PkcMl2ulCedtTtit36tJxuQo/cM0gzRgtzyo22uRlARKxy67rPUSs3PMoFScVQH3cRLuUQYN1v2Il7GymsXS7dG9Q4QTyokmuR2XRZF1Arw2EnLZfwcTnsUwLCrqvNLme4AUsxci7Gow1u9W6Dk15qCKy41Dqc/vYQAfZPhES9DsmksLWQ8RvtKaiSnrCYgcdibZRHw1BhyRjm0Cx6t85WeUuwtV5ASNuviXvle4ERLUGrWjEJdZqYehkW6qC8pSIybSFIWBIDwf5VYVfHJCzRp6trm8o9bR1ihDzQOUFY2bJwpv7lNTB5rQ+7pSTbz94wopVZPAmrYCc8u9l+fEgx83hGFEehnaAOrlUNGP4rysH+hMJBUEN2UduUQX3wCdeCnr5wIjAk1A6+AI4eWwfpCGsxXFvleH+T8BGmxY7YJJnaNtD8Xm34mUx2t6ymGb9LkzSYK0yI1sKDQxNqJh96DFW7EKGCDwJLqKhYdAfmRUeULL2hFZ5ZQsPK+KEy1nJ3rz5QIQ9Oa2c7ZKwJNnQIAgbGQFpohwYSQne5Ztw54KKgYKUKoFAwwRSl3QCGACeQJuH4O/CpTIWVBG8dRU/pH7iC8UJ5+Zjc6uKc7HTFw8XthfdUSayCjoIEKOkOQVYMEEj5nowv6cgPpF2xfiluTlwfpZOv+zbkdD8ghW6Mkppa+ZYAkSHWWee6b93naCiU/ATcSAKnXEs9hhVi5Ew8BvXBpFFtIh8ugntpqkp4SB0tXoj7qkFnzSUEJPkduXvJgVx7lDaKUf0GNcPe1r6WJgRM8JqrOEVMND7HXUgJA3trcBJLEYHZU7RS89JBwNOTNbVtIDvq2coMe8qY3NOcWcSSowsN9pvd8nRDFyqHJWB4zM88ktgPn8onj3uieY6bhY+OSCVv+Ol6xNeJpoweux6sjJDnpkPkijo0/FfLPXDpBBwBzLCZepVhMzyLNbWdooOqMgdl2skzelriPBtAbvEU1t/m/WEeFC4XRtWAoHjs1uuiqFDt/dL0D7QAC2kXkpAp+AD1aZlsA1LFaQIfFkvsrUNaaxJMKWEwRGIO+zva6apyLpPAnDhdY+oQU4RStGISQODgQEfYBpN1bKZ7dCBtSt3Gqw6J3aGUSLuiLuiINLPH0VAFqr/SzCfGg8BJo6rl3HHnGEc95bqiQNQpbgMXqrRZOw5aaSat3EDF4Vi+yh/+S0tgNY0OUYfJVMBdp3fuRg4O/XW54bMqB4dmayvW2Fgbd/neiGJ8SREoiOYZgslOZ2WVdCP5IbgtXVFJvdZyNx5aACUjAhZEgePMVwzDfYsbKYkBGGKZsHLb2DdcJn5Dr3IToqcqp2rI9FGJrxrpx9UWzyNzrlsl2w9B+4XvWPQkjKbuPT9bpxt
uKCEc8ptrGAfCG5Qf4THkPPKT0Qb9Xabn67+NzZ9Fi0BvnVlXW7SQp2NziABGwpbXhJJNNZOrhyz/Oibh+VuMx7SP8WAeqND1UNZGE2uPV6+39Ng1za2HL+vCbe/hbqJmUljWfUSHWL5mckzc2VrmpJ7XwhXZACrRhKeba4oUqUPiCZjbVkzj0bX2tWyJvx7o2mY6W4fMNQk8/EQ6Xo2XbybNxpiLt9NL/GjzS2TOIeXxOZ92YpV3FRauS3f4gJAATILmZAluvT+1CpuFm69TRBizgXra1ZlVxpXMAG3mE8upQ8N0R8PU1k7kH+n2ouVXAgsxmtIm/SiLBI7Y6A4TW5EZtQCYOVcAdHQOidI2DDH02HsGnmkvhY98pZ3K1IcqRg9jTWlUYlexw3qfaI0Ikg0TQ0zYuOqMwf/MP7qjjA/CV4cJGL6iMhxkRUMHT/vvd8YeWzJ8WugOaWVQRoMis8vm29GWqGAU95Lda9w5LV/CV15FPaVHup3wDGqNv2g0PkSPUaZlGgU5CshUXsLf+rRBd+fLaV/rWr5eVfCvyH1l5m8QmD0EQndmD9vUPM8IcC6YaLKRZBUSQgVjmiyaztaTl5QtXEArBbKw4kyjtPIUaU9zaSaBS77bDQ4as3/T7poEm0N3jw6nWaNGdxz13MeoqphJP1m7yw/a6eb6QikJEpXDGJTYUF8H6I5MxIslxoFYyirD50UYYKdJ9xwuzdIM0x0ddC12F1ymzk6pGrh4oMdtBzdT5AKE1CET54OVE/1VAC1TLUXH04SSxPbjWBQsPjgdwWbQoKpz+K/YIOQDB+VHwMMwJzIJG69HqIaaUdVStvBLdeqUDx7gEsAcrrByeNCQPFCjIyU8AA2epcYZESJEySHEBu4tJbW2VDHtx1qwXkIXTEgaCBDHIjajg+p3V6jBq0bxJFzHO7NkYsYYGGSqAfQGHhyfOlqZE/7FpJ2o5qIRymAYauOKGtHBCatKZhAIAiMQ8Cv778ziaiX4jSg5x4ei7swx4IvzcsJTBLW4xck8NYVluiyuqdkkusPQ6rm9zkgLBYFMqkbN+Lt0h+1p5gdHIUjQD0oPGIHdAN1Rkr+srCYqoGHW4Ex4Oo+J0EiTbEcxAIa/nB2+0qVYa7ytnSiqtDSMlkPJIKvQSMzdERTUhC1nVrEHQSEMebkklCfVqI2WgN+oRwRPVYLJCYuupwOUinO4aL2Ti5tPSV1QWFgu9JQkKqAFcnjQgIyyYBiAYrmrzuG/uAUdy4JkTaK1+MjBujjmnChTRwR21IlYCGqicINiuMKWY7CQvK4QRa5oKoKjTaeR2ehgO30ggf1oZ2UKG+LB4RUdKJOvQSAI9BmB0J0+j07atlwIcCKYhaMmQl8ZYxP9qo5VFtKLyogtKOFBRAuhRfgLeqRki+fg7MdpKB8YBvJBdeCnwB6szVGnsB5WeUQTcSyKhVAM/KnqJCEQMwgPxBJxLWSk5sIYrkertNmltYq+Uv4Cl67oY5WjBc4SPaPNuqlHVBbOO4qLyFPEjrQgU5ikhCsKByHeUCAQCDa7rsjwYzZOqcyujKT7qIxPN1MlcirUo7WZfqaFfBOVg2QQY2g/U+ElrZI+JzA2dwJW5yOoOVynz4M1S21zhwt667qVZ+lCqXaWEOgt3VlBhz3E8wkCM0aA50XUCB6DGXArCOlFHdSGLtReapYiC4HkNJFJ3rTaAkUQ5ik2meeirosoUCwoImiEuE4F6C6IjvtTGU4QushkLcSTEAWMh2JBnhFsq6SfHHfYZKfMIN/CKLIHsuJci5hIMlwwPD4zqCqnjEAA4xFHrICQHX7DESWXwqGSx/RUgvcNp5cQiOaXZfnbokSgnhtCpsa4r8SiBKq3nfIkr2evp309jXvS1NCdngzEAm4GD4V4CHSHsCEU18IZrEXwja1dERHPaDpNozX6ad7mFErMwuqziFQ0jtMHrxKmylnG+dX2v1lYfUlrp4UAfxy9kKjG1zZizfm06pxiYa4
93k8/Ky5LYdrUQUqkcDRR//S/KVaysIrx5Ipys8yQKLuwWp7WFgK9pTsTbxiVYQsCU0eA7iL0daC82SdqL4TFwqWB3Ws8tX0Gyvf/qxXXgm/ISJrK7UXKwvD63+wRLcREmXDB4yPKLPMQKsD6FsddZuFuAdbaiip3AjYsnNmucd2jw2l3FH5pkzqGf/jorObg8byxIqjEPDW6Q3cU/sxniv7O0tX5VS27E+DPMcoravdFW0SaRZgwoN0zvqhBV3Pdvf4aO3/NWIij+mIgeEgFb9lewdZHxlc4mvmJGYuS5T/1kycyEVBJtgLyaE7VGCsoxewboNr6sjJ5gfm7MTbhfQLCZHJXab9iKjGgtcWOfDsfCtiXKb3Qf1zV9/ztFwKz5L1LtUscAUtmeKC4q4SbLHEo+tl9dsWac5pzrZOacSNr4+MZbLViVRcGiWwxlhWLPboNFpx7dGLPo4vhcMoscynf6EqGj1rD1cKw6igTrj3C0ocLjysHL/GeB7FZSE8t+OfzJXvgPQiHHRYsDBQ738Jc7LzAw8VZLNO5EzYDX6kNI3iKtZ8Qa5tBJYXLoB3ojrV1ZaIqur9C1qw6VJJ/WbX2iVbA9kj2uUaALMWvCyEr6JE1gIRbG0NUJq6mMLIoGMtCS4MuX/S9qHyh647aIKDC821FIS0YDhlyyohNpKrm/O0tAr2N3cnKrN7eM2lYHxGwlMzWwG3l0ew1kTeQ5R6of8LMgTJT/MrGkArYlRHruaZSle0ZyQBTKTlhGfvXacNoukNcIW/YWte2Nw15QdzGgnXk3/Epk0+KIEuokMwg0JuEIMd16U8uZBE+ytJ4ACLOvqIRaJ9YMbodkmSNm1B6UVlUHLEjItNlCrq38g5LIDxUL4SlW5FHsZDJxVmL8IlVIs8QDnvkCJO3uSJVf8JeTzez7W9UJ9aOl6Lvbf2nsxWe76KEFmE9ouZdvRGOgWsB04hb8kYlQimQtsZXBOBjnwLzzVWEqVF0OKOpLHiMu8VRIq6QGmAiNJiWmkVZYTzWFmBgWqK8TNRQJvyl4YknGSCkSn41BnR0MhSqonOMrHx16pcEeqeq0J3CaiH+7S3diTPLLyufIDBVBARS2IvWk3q2Q/BM621749NtGSNh0fuM3U8enSb3ZvBsGJ+FvQBqn1yXqHzmHD8gt2BaMgkAthUQpVSBVi5tdm6dHTPPlivAnLOLAtW7jZQmMDBa9kyi37RDdtDBMNi/2hWw8uu9H63MhAm0psI47JT9iEc8opxHmoG7iGXmM3UWjkLtsDKfcRU/rjvMtr2ISsPgQ+ELs1sg+2rVfQX5YjlCzu33g0wIhbEjok2VBKQjN5xr8LfHsYEGmrAtOWhu87CgNcgT/46VgxCrfEv67S/FzPPQaYN80E3Yo+XM1Dt7GXDtqccQ4GoS+muhgHGBj+5MdgmyEHmGA1HvoGq/AxILSmcfbSQJCBSa9nKxyhc/pDbgYIT8dzB084BFJsEGVUWGXBFBrDsBg5RZr8uw4yjobD6Jjxp6qyC13KCQcPi2dAT+hJ/h1rrxErszDEtylgeBFZfn5JwbBJYUAqQFPiAOoAGu81994ZwBKAgAPB21ksuh9hVjMPsxCxcqIZ9hUIwZwDxaDXbr8bhnZRVjzgkSDqELJArqhTk9m+GtETK5oqx6sz2PTCEXrYaBBE8QoyIC1FQeRWC0FLACDjngs8B7CAOuiFiwWFaAkysE59qLmY2vzYuRAwEZ1A4lnch6qUH8DcXFrL17OT4OJbWcfsAA1yE98pImqoleuGK3/DLTmo0nmfqLOGmF+XekqQh6zaDWGyf0kRTB5GMetjrEXZShE4AIy9FyPapNEORjdWgZx4o0gYQ5l1AGSaJSqF+mnKIyRZt8bZ9GfcSv1FGMAXrGlxXn3WPIW+E5SGiP+xMOOmvjhtbN0ZfWcjdS3TnIrgHt9hQZcofUrlQiiKHto0KZfggSbhj3HgqFUBodt40YIH4xOhBuqoDpgbvF/UA
NsjsDfilTgL9ldzbTwlmdVXcyBkk6cofYkVKZ8hVK5BMExobAQtTK0ua+IUDf5p73LJtxw1gI5r9cD60SmSy9T3NheCL7auLYMiXo806hvZtlVkl/0QK2lgDAwvE4lLRutsorIYcm0a4yYUJ5+z77sHlVv2K++uFRXLqn4DqsrDdPiVowOfbsdpTJ5y9QmE2td1ngNEQRH6pA/Xo5aJT0xFeGcGIuyzFUNfNEMCHWf3nBgkP6Jd/VnS6000dwBucFDYb8oIU0Bg4Cc2jGo2oY+GsDGxdFBeSbzXNjwbZsT2UiN2y/o4aAguJa0gw2QyXBeDudKZLGbxjUNiJoDZMmvz7onaiOci3Z+7G2LHIIJeKJc6Itmljiq4r/2+lqHu3MqsJGkyUuKORojBNbzJDbr4bJVag4rX4JvhslaT9ws0k0p0wdFVvGeG+33XbWOjHSlcnwI3/d0yuNbJG4uvlQciNVjkvDs9KqNZRokzuzlYdqu4ta5tQT3IW4lJvfx1n6gu/WWAuP45KTifPpZt1C0gDBHia7BEaIcIAU98VKq5i7Vw1a7lxCTmXiqTL9ajAY+ZVZ8g8Obd8HlVQmByLKRdAiL9GcKpP84/TahMKNUTgbbsqffG8U0Ze6FppIdVMDJ532tzd2VT35u4AQ6K0zK7E7C+gu6mlTuS0qnpHhZM88qsz8tJXNY+24fsgD3Uf/cDfYzpLBu5sBslKV6bEoPKLOInjwFLDxMj1t8R4TTfNyRwUQkCXq2c1zwaJ77GIh3rdAj6lHv+YRZjzQPaZbJMFweygxzAkdQj2e1PhElWFgTEmd3j2FU0BsLLNtlbKOl9ljgeCgGLtCY6jyJZOI7YWMJz6yUjRRQgG0RqcQMmkleayaBa3T9b37ri6ZbD+qRMlAVmgSTm/mv05pf13OUY2UU2KMc1kamRW74xAHh/Yo4E0g5t9m8/Bsr8uw4qbm67pm+t7a5sQu3aEr+NR19av4q/HFn4xmDSgZqQr463RtGCPdccupkA9F5eWqA6lFQEQyOSJ+YFVXZ7yB7IbBsyvHXyTSDVZaUcuUcA9wXZHQ2iFQA584hGu6WxDTKk/E0gAj2D0dPTV2vE7dzCmmrQFUoY8BwgkwntrBHLsVplPMoyAlsWBguDKcEf0RBIuvjXLjZ+V+5gGslpgh+OpyeAle2Jqn8X5i7oo26M61b5a7BTtsxRylGqL7dRdVvl+owfVxqEhw5cNZTrekfF1TsjHpVnMSCwuB3tKdxO7UkyR/Z46AObfnvvP9pUl4ZtVkTqCoJ6kpIx2btlFmYMLLCElhY2jdEpwOeIZiqA+d3C/HM7H9fsz/vIqLT4fFNb9nQswCMSrlTX+LWkl7XrPNdBQXRXfk+DCEDLzJKBpk50OGgUpfGkwVaH/Voy+MgZXnLsTM1CHWUQdRt1ZSQmc96Jl21TID0GDjNV5fxGAiQMSDKs8o+ogyYTh9ZDqFOKQkzwtixJRiGN3Kp5LWJFeHiakzQJZ5ikcntA2TDhojX+sU/ZIpDTE+MpxSNAb7J4eVpZpw8ZCXeBn0iK0SwKGn7BMWiDw5HVwUHcaYoTXXR2XEl/DTcUVhhMYXPr7qLPNJWVFtOTKwEKNsZRB8Jmw/1xJ9RfOwVcQOmShfEkeJXbMZSPIV0+sq1A71Y97aiYgjrMKDDDftDU91zxDn2u6F2LD7xIiAjgRIzUKIDT3uorNksCKRLgFeQgXOhAXaB08jdRapKo1NAQSiWm4sKG0DQVc4BHGo2jxhB0dkciMSnNxXRkprVYJJY3KIDoaNbJEJSyypH86IqloL9ZcKpYPdwkW4uzmV9hP26eZrg3ve3Wsdfsv3QyhXV8uRmOyGdAf6dEtKq7aixAby8zUIjAeBhUUb09p+ItB1dlQLuTw8+2pyz6a6WW2+PFnjGSqeFA9xBIVhqGIEfGd5IstnSiuTkiHTX06WNk334GYIFWCSPZer5ICzQ6bAAhYat7Dk1UTWWpIqOfwX5RKwOZw
vh+FkYruHWiRKOYAcYvIp/96fxSQIzyxKIZ/VF8paen7VwFhSoUzEfWXveVvMhqXZMN3EEdEI2klJBegam0dI42hj/rktwILrsIV4lck3Mzzw0om6ir9IiXOF35qp0zPQPpklhNS6Kq11RTa+TlHeWKBi9bXGl+1HHxkwMa01shXz60Qflk93lNeSIojIBKce869TpAhlUCg5xgs7kWkUUA1wgcjATfbKCKwItj5KOpeG4SqiTMgYQqGF5vCgYTmYSrXWX+zEWU0n0AYMzBWbwICIGwv9UsZR8SLl1AOpHB1RuPqotsr0VcmW2a41kMDwBDsPZPbqq9vMWOB8XVlrWi0sSanuommdmMJLAYE2Oy1JtT9dvlp/mpKWLFwEhulON9ah6M6AvN86a4osjMA8FW/AJGgDdci830yaDeOJQFBkssFy2GD2nn+K8FAlGWacgD3z60JHzOnlF90xuS8bLEcjneKomBWV+FqnD/9lD6gsbDkXGFLFfVNhSQIzXZeZlM8HVLbTUQ41YoZ4oIoQ0hg5yITgG66u4liuQr9hZsgM2sZay2FZzcsxPLactIAfWOEiX83EBod4JWSWA0VHhFBoGIUDJqKGlbTCxekCYmgPZAA5E34IMAJ+qWVqwzlQK8W0RD2kGmmeMjWU98dX/UX48DNpH3FRdnyx6hgD4wsj7VQ+SiHTWXQy4g02UPm8NvABV33112jCp/lNWv58JXSHwAZD0kjdTojg8jQGDbKXjEVJnJUjvEjLc4lxnesNa8YLw5sx3TG+Br2xyXE1LPUsDgR6S3fizGKA8hkPAh5/KqITsNxIAHWHSiHt6c/QIiUTXoYJRGJM+j1/WVmWmNLD2SG0Qhgp3sC3Ui4PeoxQCdIL2RxrYY+rQm4p+SyxfL80j3J2uoJz2XJsgP6hKpbeKfwpzhJnKlPOhE2iInB8CF9AxXTK6qQ6C51Cd8gk8pk3r83i3OH7oLV4TskkvXi3KLpD3ihjL+SF2lFb5JEikBiSDwechmFC3AGNYWiJzlZ7WOKKOVVtyyT+13IbmWT/Kokpcg91cybsERHI5QgPrtsKdN2LBAkfh/QLjIZADDUyVIX1seSZdm4lcC+fgUxfhUYNZIrm9hnInMevumO88FFjQYHD/GqMZtwkI4KsQ4/4ZFhnXM8cnMgN57M8Fxoe3+WpLecGgblBYAUPyrm5Uq6yuBFgO8kzPB0cK4In+Dv4aKgyrKyVF6a8JYAPg0A54GQheLCyWIKvYnvRERZIDTIZD0qJ4FkSBTOMbcjBpYRH1PbzGAmpBqlinrElhkeCW8e1cAVKiWJluVWOObnnRVcMhw4Mt22ZObid/dNQFmE9TB0246+1P8s8cY4LUIaYKBy0NqYbcXUjxe1obxVqB11qRMlFcMi9wQEqIGY46GQR9G5EFzhJaaX4+ogyORQEZoaAdXmmhc41PyRFz6yS2TgrdGc2UE2dSwUBRId6JBiWJwjf8iHAVBhy3yCw3Aztm0zQaq1lCDl36kUBLTOJQoAyh8620PXlhEVVYquFUhHPRlfFbIiGNgFocfejyy/zKG8s2ZVZmlCcW+bpKRAERiDQW7rzf+L5iKbnUBAIApMhINBYJARvGtGIRmU1UD+5jvaLjlom11EMJQrXmXC4sROC4hg3SsZ1BGjXxjYTXrFlYidurVo31zKxUgpN+zqthJ2BxImTYCvqblrnpnAQWKAIhO4s0IFLs/uCgKm5yCETGqvfOdr60qwptIP9FuQkplgIy4jiPIDWQInDtYHNiGKL/hAcKDET7lywzL6L3LeCbKCYmHcLDIWfV771XwLYLdzz1V9ERCS4ND6NVYtFE0/WahAsTOkR/c1pK9raOjL7ThlNoe4kqDq94uvbKQMJeyUoKd6IJDlwKF+DwKJEIHRnUQ5rOhUElo0AtYB/ZIstthCaPYLKiLnm4WJZ/V12pdMsIR5chNY0TxpzcdFmXgslwn10vdag2RWmQMAk7M2DLjjFid2IKEq
MLQ9sgSOMrCp0FE8S3CaS3SEF5Av5Ep6PaLa3hlEHLfXHPyxws9BPtJn3KriExYkiIUStdZsnPky0vmhr1fJzYTxWPoq5tpRPvJqdGpzubwtp757b0jbaAX7oTgMkiUWOgLDNfIJAHxCwOLxtXT977TEbtgjZbLhdwta0zAzTYlVXyyTY8OlYV2y9WMucLCEQ2OlTKTlZDfOVzyTXai/awIg2lMsDULwnRIhuSYuZhZnXPtrySSDYQ9VmkbxDbTU7kyzUSU7beUi1duuxlMkptnNsmwY5EZjoheB0dVKhLKMTBu4qFsEJo2bpXRfPqFMoH2LPEYJuw1pak/A5Q2mtGQWuNiC2GHDnnXe2Ug9dsD2BlYO1EB23s1DfvgB2s2yvm3BnyhTojRpa0qVm+x0wDLXy39ZQuAUaJF/LXUiovijvtlECpqgwZlMr+QVHK0mekS+ApraM0kfyj1PsKaDXlvrXSjolReLbWIgfTbp9dFydLtdyJFAoOpyEVY22PdTx7tHhtHX4+JB7YPhQcoLAjBHo7UL07Lsz4zHNieNEwGbHFmHZdGeZz+jlvGrtCmiFfKuH0a3dCwde08MusihiJlrJyRLmx6bj5TuYrMy48hm/tr1N1cleWmE0kGni7m2gNIZlXlcIs24O0B07yKEIIksYWjUU3bELMwrIQjPMVS1SIm7J6RaZ155DZYYFwJIuKowJdVC4xldYt0xbAtbpzC3ugsF4IwF9RUCufEv3LeIjhGAnrijHVgUohaAiK/IwDCW5DnECmz6TPRRAHYSKt9c7VOXtr00srUrDSLiKlKSm1GueECYLCe3jDNIqjEXZncie2r7iUhYDYiSislRem1KC2j6HjgLE3gTVEWvZIMAtxfdka++6tbC39v4NDVBgwpeWqK07cIWzRtpcgMzTutCu23ImpDsG0YV009iRfFrhyRKhO5Mhk/zlQSB0Z3nQy7mLCgFz9HpPU7dXzIkg0LI03fyxp03rzdpNprs141jMcNvqpg6RMWxXY8bcLTmcrqgdTgSkp45yEDBXbCqqxGjhE+0s5tOyZ+azZAnXZX3pASw92YA5Fz2qMPGJnGAFEPnBfkK1HRyrKYCGw4IZtla/VAF6mEuwmjK5Wlhcp7O41CahIcw8mapdfcLEMN1hp1lNaoeabQ8NhwoiQVaYUlsiITfVpMc97nEYqtbaxZi+oiTjrbDGoBe0B9IOQch1FbCxjYTttkFdpNbfYg+tYQy2+CduHTmApXAUi/LVWvEB/UbfqTKg49MZGLtWYSW8NNSGQ5VGkmwfIK21tk3SsFZ4wPwbF6wLTdGkEm+6tAM4NB7n2pYay5Ew3G4Y1bYKKzF1uuPdHaATwiygp+BqVdk4qsunJ6Q7CttayeiQiJqE1moYTgz0d7hAcoLADBDoLd1J7I4Hez5jQMAUmUkw52YIW3UyyfIy2YDK9Mhm0b1w0WyYRSfvyyfvs1g8SgMbgYhaEI9JTlCJYmiKcxm2ehXiQDRDuyhxnrBBGDDH1SRtoPDb/U8BlpiVst2fCXQrv5wJddIhRE6Y0FdV+stcyTRZRw4k6BMOWQvDhhEzWF8bC7FtwpwJMGQt6oW9p83ItVlJ3hlGS3QIhgElwaoydURJ5dVjOQ/mIRMmuCOCYm9Dp2M8oj0YaV9JL2qQqNMVnuIHo/I2CWqKFrKaqGGFgNBg8A8Vlu3XO9zF3tM0En3EzAy9wBHSDtKjO7ZARHo0yXWV0XiZxnTEumsEyLmUIaegvxQUXahme+xWVa0XuA5iJBqGvjJwqJUZTmhha4A6uzvu2Paat0vvnEVkEs+EgEKDY6hevGAIcDjgKACTit3RYOTD/t28XWogjMkHhXgdo6OkK6pHDiQFS8n3cwCsu5qbzDtS4Fw/EFSYRlX3qtGsxpsekNCokn4LopXLNWYgaEvkNz4+rit+xiosAFnD+OkMX+X4q8F8c1rYciQ0G7Z+Kd33XnULJB0EFhkC2VV5kQ3o/HTHo5Mhr4B
NT2TTXO3wcDf7Z5XNjwUlkElkKiM8gnVkpJvVqcIEHrNSNqP6oKTTTcrVY19js3CTbPMGb3VAd0RR+Gs9bXeb4DqRzeChwIo4YhgnhIOCUmuw5SMBvmoD81zll/Mv04LfMORsZ1XlilrFOLFtuJd+oTv2saVdsZo6RfNgrspC4z10FOSG9arTqUTkB0aXxdX31jzdVy3CV+Sj8tXGKkOSaMGh5hDDydmEiFBHmGfGdVr7xBAwkCd22juVXLFdvSW4e2yQjZdgMAhNt4WtzECCwWaVhUzBHN0pg60MAsRVRNEpfQvzUKd8VpzVF1TkzhG1M1Bb96sXXamWSFPbmnUPddMuBHNWX7OxseIT3FW+UjhAbZdI96Q7B7zoNa2OfoYKYJbqwY+92wspoWARwwyiMngzoADuJsRKwYW1a7Y3l1HUUBy3AZbjdP0ylBgwiuP2dqsYETeDiCUdx5KFNHH5ufkVdnMaU8pcA9+vSTupO6AuKqaYmv1ANA/p1J3uPEEHaXvKjP4QpWiQbpLRxXI0CCweBGYgVeWUIDCAgAe9Bbp24/XxWK+jNBVmQxgvQtAyGWB8SLzqQA2+YiQiTirf/Ngju8IzBaaonHFyiHdGbVUG3RmItql8f822sQ1KEuOBV9GBvMmhjqqZbRt4gRdzMmNnlmoZIcapIkLqKibTCEelmWT2TLp7XZbPRasAEkPvqbS/8pEhAhhDTv0SQlTuMBoGE2VdD33FA8iEXmHWDmNA7EgvMklN6A77iu3hoNxMCrRg4XaJlkCt2Hsnent5hdD66yvmZOBKxuD+K30I/0BexfbycBkUlTD8dk1VP8bGypY7j/6kBiYfy9FNxQwEQobxGBe3hKO6UG0QDMR9Y7AYbK8HkYk3c/a5echa5Ac5LgofQ4lSoALNvVU14AFoZaUn+1ucCSy20m4OUwxGjr4YF9SkzkUpUBaY85EZ1pbJpUhyc2n3Yb3XzCHI4yLoJqyMfnUf4GiE02V22+NE+YVbN39caZzbMAEWD1tmnUpa3N5+FMssnwJBYOoI9NaZFXXHszef5UWAFTE99QxlZc1rqzrMhlCPRpjQtwswBma6ZbBb5nBC+IKFPCV4qJANLn8QE25CXOXtm0eKGD5XDtvJRtYWKTw7PGUDSv6EZ80404PAufVXQiNxEdpAVaiRuux9n0w+AsHECsFhk4RZCNRlO0lQHHlVA7VAsUMOOQSr4wkiA4AUHxJMjTMRP0gjuIXC1AiHaAnq59Ap34c0qQODoXbgjpiKsBLUgT5Rjfl/7d15oHXl+P9xzYMmpXlQJA3mIlMlIukbmSNTKRRKQsoYDeZMISoiZEgqkoqilMwkaUBRaVBK8/x76fJd3/3b+5x99tnPfp6z9z6f/cc+a9/rXve67/e9zro+67qutVbbt+fBcKUoFDmyIc66Jx2Hw4Bt1gj/DQcAtcHBRn75mFa23LTaSuiN/RZZs19eN5krXCBEwJ577kk3VKyE20lMiruIXOAOMbMCRvZVPSFG6ULCgumtEnUAJB24ZDSokOvL6KAgVX2abas+j8v73ve+Wp7smwTEsLLUmzqiTj7Nz1ogWyHtLJzQe6RjPlUZtFqg2n3aWvCTvOgsHGAJBU8pcryRa1M2W27Ocj5NWTkVQmA8CETujMc8zvAo2EImlk1iDtnv6g0bw/KVOuGTqDQOtopBVYcdlUCjjhN047dvhsE8cxK4U4bZ86ptZl6gwVrShwOjqhE0jW+/2bBZYGK1z55xAvEKMOdWueAmGlhT/hJX2wSTll0WW1YokYKhcluQvAcuFj4D32pyAOjtZC+QshW1IaLhETKEHXVFH5Av3vNFBHBp2CP/B3+PFGag3v72t3PeCMbROrpEx+iDsZBHJIJUGGEvDjBSj14hVogJe1dTnoou1S0/QnI8DRwewj06CSNFRR/oqpo6w98jRxgumk+Oi8IJP5JtmWH8aTWVjdGyUfDHmC+tsaC+2UUpRxO24LVoreUidBM+Jpj
a8KmalHHrJhw2rT8t60PrG810rNQVIehTlaXCcKtQseaL8W5roe0nqaSmQ5TSals1Nj/l98gccgD3ksYEqc/YjD0DCYFeCMzXXJL2Ujt1QmBCAmJV5IicRxfQJA6r7NEjoh6CHS6L+QNcsjOiPBY2t0pchpFTwY05PEBusXGBLheB9JHeIfVS0EpAgX5yNU89iJHJlrCt/AbqgenlP7A75pm7aMITt9thjjzySJmbLnZVEC6xOe+F3QleaJaQsiPKQ4c5OaqQdFDIZmhW1EOhzvvQIoI1E449hTNCQOKLo8VhQJBJJ+reB6EoulAcrXENdq+ftSEQAn0TcMYubyg3cF4R2jfGbDikBGSbSoklnUUfKAkWyI1R+ur2K8ELl+YCN2xSxSa4IuRSSOwgd2RpuF637N5pF/0cLQQHvwKdYXPOG6rF5gIWpWkIIzLFMkXCzWPBqiGFkm7NTQIOM/FNGrTTNTg3d5u2QyAEpiAQuTMFoKwOgRAIgRAIgRAYdQJDK3cSvh31Qyv9D4EQCIEQCIEQmIJA5M4UgLI6BEIgBEIgBEJg1AlE7oz6DKb/IRACIRACIRACUxCI3JkCUFaHQAiEQAiEQAiMOoHInVGfwfQ/BEIgBEIgBEJgCgKRO1MAyuoQCIEQCIEQCIFRJxC5M+ozmP6HQAiEQAiEQAhMQSByZwpAWR0CIRACIRACITDqBCJ3Rn0G0/8QCIEQCIEQCIEpCETuTAEoq0MgBEIgBEIgBEadQOTOqM9g+h8CIRACIRACITAFgcidKQBldQiEQAiEQAiEwKgTiNwZ9RlM/0MgBEIgBEIgBKYgELkzBaCsDoEQCIEQCIEQGHUCkTujPoPpfwiEQAiEQAiEwBQEInemAJTVIRACIRACIRACo04gcmfUZzD9D4EQCIEQCIEQmIJA5M4UgLI6BEIgBEIgBEJg1AlE7oz6DKb/IRACIRACIRACUxCI3JkCUFaHQAiEQAiEQAiMOoHInVGfwfQ/BEIgBEIgBEJgCgKRO1MAyuoQCIEQCIEQCIFRJxC5M+ozmP6HQAiEQAiEQAhMQSByZwpAWR0CIRACIRACITDqBCJ3Rn0G0/8QCIEQCIEQCIEpCETuTAEoq0MgBEIgBEIgBEadQOTOqM9g+h8CIRACIRACITAFgcidKQBldQiEQAiEQAiEwKgTiNwZ9RlM/0MgBEIgBEIgBKYgELkzBaCsDoEQCIEQCIEQGHUCkTujPoPpfwiEQAiEQAiEwBQEInemAJTVIRACIRACIRACo04gcmfUZzD9D4EQCIEQCIEQmIJA5M4UgLI6BEIgBEIgBEJg1AlE7oz6DKb/IRACIRACIRACUxCI3JkCUFaHQAiEQAiEQAiMOoHInVGfwfQ/BEIgBEIgBEJgCgKRO1MAyuoQCIEQCIF77rnn7ns/FmaExu23337zzTfrQn97t/lb3vKWs846q7/Ns9UYEIjcGYNJzBBCIATGnMAZZ5xx4okn3nrrrW3j/NOf/kQEtBUO/OeNN9747Gc/e4kllrjvfe/7kpe85Lbbbhv4LqZs8Ec/+tELX/jCBzzgAVdfffWUlTsrfP/73//e9773nOc854Ybbuhcm5LZQCByZzbMcsYYAiHwHwIu8S+//PKrrrqqbyfBdDna0Tve8Y7/+Z//ufDCC6e7bdW/9tprX/e61z33uc/daqutrrnmmtZG/vKXv2y66ab77rtva+GEy3fcccecaJTTTz/9+OOP/9znPnfkkUduueWW0xVYP/nJT374wx9O2LHeC5/xjGe8613vuvTSS++6665mK/qv+cntdN111/373/+21rflxhF1/fXXP/nJTz7ooINMfVO/aSQLs4TAgrNknBlmCITACBG48847v/rVrz7lKU9ZbbXVBtVtxm+XXXZhBZlMxu9Vr3rVOuusM6jGW9shUC6++OJHP/rRCv/4xz+ec845/Apvf/vbH/zgB7dW63F5mWWWOfDAA7fbbjvKpnOTLbbY4v73v39
neVvJfvvtt8oqq7zmNa9pK+/xJ72iJs8K784jHvGIBRZYYLINL7nkEoNdaKGF0N54441VPvPMM7/whS8cddRRH/rQh+aff37lL3vZy5Zaaqm//vWv/FUm2hRzupiUY489VvtKqFKi5KlPfer6669vRxdddNHJJ59spxaa/ZKPRNi3vvWthz70ocb1oAc96J///OfKK6+stXe/+91vfetbKZ5//OMfyy677HnnnYfeE57whK985SuLLrpo00IWZh0BB18+IRACs4EA38AHP/hBwYhbbrllyMfrOt65WPBigP1kArV59tlnc/B8+9vfZpgH2HjTFCfKtttu++pXv5pHAXDeHR4F+5U1ctNNNzHqTU0Ll112GROuP62FEy7/6le/0kjr5ocffjgxYV/UVesm/CiPetSj1CdxBKGs+uxnP2tbYuWxj32sVV/+8pdb67ctP+95z3vkIx+p2g9+8INa9bWvfY3k0gIB9/CHP/wb3/hG2ybNzyuvvJImu9/97rfCCiuoz+1k1S9/+UvixjLJwkXk4/AjaDbYYINVV131hBNOIDrtgv+J+8pWD3zgA+36Ix/5yJprrokhROutt57lk046SaEKRIxmP/OZz1A2hklFfexjH1OizZ/+9KdrrbXWC17wArG/U089lWaibmmmL33pSyp86lOforf+9a9/Wc5n7hFA3jT5kLNzby99tBzvTs1LvkOgfwIuMX/96187+e666679tzL3t5R+wQzYzyabbPLa1752UDss++oyur8GXdMLSbzoRS9q3dwl+5ve9Kadd965tXCyZSc+q+abb77JKlT5+eefb4HzgA+AO8HCAQccsNhii+2xxx4UyUc/+lFm9RWveIU6HAOMqGo1od/5znfML9vJ28QnQZ3YZMkll+zcnTrUpPpWnXLKKcsvv/xOO+1EtLGyLC5PzwUXXEB18UbY7+9+97unP/3pdu3IIQXWXXfdzga7lFAAlIF9tUapuGHe8IY3PP/5z+cQ0gHJuVpQc8EFF+T/IGLsV38ma5akOPfccx0nxisAx1wZMteIwSJ22mmnSd/pEge0u4UXXpj80j7twqFiYcMNN/zkJz+5yCKLcEQ1+yVNNMjNQ+vsvvvuNJ8efv3rXyeV6BhYuKz23HNPMwu1w4OLyHQsvfTSTQs02Te/+U0Ont///vePecxjlHMp2aMe8qI98YlPrJoSfbRmR36a3zpUmkayMLsI9CGRskkITIvA3/72tzl0J7h6636h8J73vMdJTRLitDrWpfLPf/5zV8Ms0IorruhKnWmcrDJX+eMe97jllluO0ZqsTmu5K+Df/OY3rSXzbFmqB4vLBn/iE5/ostMrrriCKwIBF9w2YWmaDptKP12vN0BcnbNkfAy//e1vbeVyvFoWzVHTxTcFoIRbwloG3q7ZXZGIqsYEil8IMTDJK620kkaU23CNNdbgSLBJVfPNqFMSOs9m64aS9773vUwgySKrg+Nh7bXXbppttmoWGEVT6eS+9dZb77DDDkSG0bGOHA/6wzY/5CEPoRXUL7cB0+gAkHajRLfNLwHEIcFIcx68+MUvblpuXTBYwoVEsDtbAQWgIIv9SnzRvSOOOIIs4/Wx1W677carYVlCDHfIH/7wh9am2pY7vTsq8FRpubwdfmrKEauT1IMU5r///e+kSbWD7ac//em2Njt/Uhj6rxxt7VBOVafiU+Ur6tyqKXFsP+lJT/KPYF4+8IEPVPaMpsTRyLKmmgXd819TbjDHz/vf/36FVKaIVXmVarxakLTkIKlta7z2opwme9vb3kaWbbTRRq3Hs4OExGn2hT+5VgchXQ5+vDsNnLm0MLTenfvMpQGn2Rkk4BznVN69A8yVOvXpUtPtDC6MXIzK8utSrcsqvmjXoKLpXep0X+Uk64TIB9Cl2o9//GPXds985jMbM9ylci+rnDEf9rCHMR6slGYR6L6Vq3bpIM7srdVqIupUW+WVMskeGJTlpr4FZ3AZoK2bT7YM5stf/nJ
XzE0F1shZXjIKy9oU1ty99KUvbbNzIggf//jHm2qdC1pmRF1tMzw0hGWbqEYGkSB++nz+85+vDV1k31vw3y9nOuUMLXqKTFzFEUiTqqGF973vfeIL6qgpo0U5ycK5wkCW5bZKHeW0VO3FtwGKsxjLXnvtpRGmnTqhkxgw/RTZUZ/9a+p3LjCQ6tBSraue9rSnaa2JB5kXbfI3qGOChGb+/Oc/Oxgk/SgRHiJiyBRGvbWR1mU9pC1aSyqYJc6i0IGqD3ZXhf65zKahKTRTrVu1Lfcid2xCZjm6tOafDsP6d3CpoOSQQw5pa7PzJ/cM706VkyOvf/3ra7lHuSNQReE5fhwehOk73/nO2tzQeG4sc+qUIDZkiraOAXLTYWCtfwFuMMet5RqvQ06bOr/99ts7WZW3T0BWTYdQdRVDrqzakZFSWkcffXTFyxRqgYTiqeLOrGCZ/+iqnO+5RCByZy6BTbMTEDj00EN5dFvD/G2VXB65ruV25qNmyfxsq1A/nfqdlVw8Od10Ob832zrKOy+vXWiyKHvvvXdTbVoL+vCsZz1LXKNtK6f1No8R4+cisi7Qm8qGVpGmpqT3BVfw3Abqf/e730Wg8gO4H7jQXZT7rhN30+COO+4oOaP56Vz8xje+0YbaYQOqnGosmDxGbECJA74T+kMJn3yXxAgtsOuyKNghBpUUYzIV6p5lHgVz6jq4Lm0lptB/ckU333zzxz/+8eVOqD5MKXdoL11ijRxFhlm22ba6V2bJjPCmlC/NYWan3DD2K8pD2/HrsLWkCaFDx1irJjun0NhdbYtN6HDNVJn/1u5VJ827/rOy9dO3YJmMVwuGph3i0rJ0ZodxYXeQMIEKJ/tMKHe0Y+KaTdr2y/zz3vEH8AkZmuFMKXdYet1jkps2W+WO4Iu1IlyECH8SW+4QNXw2vrv/sovcKXeI3eGsweOOO07j4C+++OKlp/Wf+CDZ1fTvoKmmb20LPFuOGV4Q/7D6edhhh6lAyu+zzz5Eg4PzmGOO4SBs26r5iaTThSCaHXEOldK11lGkNX4aR6lwIciycMyvA8xBpZA4dtaiKVUTirIJt41lvjTXDDbku6Xm/T/KX+ZyI33MiP8sJN0p5mDTc4UcOSqoTwM1pwIHidjWZpttZkf8SY7ApsNZmBsEInfmBtV51yZr5F/FxcE222wjA2De7fjePTkh1nmnx/26jvRf3aWfzixOha6B+JBZiyZT0lmSe6DNinM7O+80csdJ39nZ1ZXOsGGWy2HOhjl9yxMkRMQatKMmW+gky643V8+2clZywnWSYgidGZXYxInMZbRL/DohNn1wfW/v9tKMncqhDOzIGJ1ea++11iWgM11T04JcBJs7UbYW9rjcyB31ncTLQUX/OXVy+FuQA2GM1Rq74lTbqsCcjg2QeTP88kCoaSD6w8a39oELQWtKVHZpXkxaKzTLbLPKsNNSzH/lnDZX4TpDDNVFPIAHH3ywmfVtj60qakq5U7tj6VtzWisfhS2kKmgdbTKKanaaYZrAWpaMmOYiwqGEEf3kCrsaN9GmxjKbpPL+++9f5c13m+xgpFUr81lyR0hL5Vavg+zmSs5oGmlb6EXucLDJGvGfrs8gk2iSdRQy9v6tGGkOBr4fOb9tjTc/uTMdnD7MKp8WD4cYjc5TpepU7hTzbNlMcSzRcDKEBLMcJ00jbQv+VY1XIw4A+lU3VDCn5VojZagBRp2zxJEmBkdnw67zJXdUlqVrLnwkx7T5t9r25ZxgR7wstRdr3/zmNytxvPHVWWh8Nm0b+kmjmCMBO4Nq9cw5D+iew6nRr041jigl5KOTCU1pc6LZhQTFZplEs0nbf0rnHlMyhASGVu4s6PDNpzsBAWMnuLJtzjt13dN9kwGudXJ3+pAk6LB22uXAcK7UvuUvfvGLzkGsbLM7F4hUjkIW0YmvKW9bIBEkTFAGyp0TnQSrAlFFnay++uqOVx7mtq3qpxtKqQonX6c2GTNsm6tJKhAlZ1tb0VK
kj2RY7Ut3UOjE5+Tb9JOHQ7Ymg0FySVB17Wt0XAK2ZbmlUhqvjArxeHtkXFma1p6QHQb+s5/9zBnTtR3xRLFVBT7tsrVNfY4lZ96yc01hHwusBRXCeJgObeozbcHekEFkgQadtSvdsho3Ec7sjDrlZ6bMnUtkNqlz14J0TD7r4kRvIgyHfe2sViWG7KLZZTE3Bm+KrdpqCsRIv6AC2QyQaRElbXV6/2kem8rVjqCAa2tjb9s1odbMLzFkLf3qUDF3bsNmz6gfByQ52DRYC3WU1qU8hpxnlSzcVs1+HU4GRVLjbK2Ldd/MIWKEiElxBCLTtmHzkwoxBX6We0PIw/+RI83xaZhEISVnNh1+Js7uKhPZUaqmqTdkBpg9duARVRKbRGfa0qtrX/5DxYD8K9lWGJQiMXZXC9Q5hecgB6o6T7U77P2vweIeaeK46W3bgiPc/6mcX8eS3vrHUUGh1nhH/LPjTKIRuyr47/btWDKW5l+D44TotxXgpVradtH8dDw73VX7VejfqmKL9bM5XTSbNAtOCz7Nz2aBwnbx0Py0oMOt/5WVg+zf36eqGUvbJq2bZzkE+iEwhNpw2LrkZIqsk6yOOdc7qzrxOb1W8h23v3/R5srMdSpHvf95Z0/ntRqLmzCr0FnPglObchkANIEzJgtd1RgzzxNz+a41NoNlVc5w2rszl9Ou685KYaEPnEZFNFwxa7PcM5466iTyyle+0gWWM5ozaTXb+c3XQn+4J4JwEdt2KlfHcNzX6mwu9O56scIWtS2p51Rbyxwb+lNRAzEal5s0R61ikKrP9bO+NW5fBlU/7Y494PDn53DiFmdxUW4VsBwbwCp0yWsUVV8h+1fL9c0JpOe8327ZcJ3K2jVrjYspqivFprDvhVbvDlvlqpeRM3YzAgJDS0wYTtM+kdEEs6xlkySU6KGt/KxE1PLu8O3bipEzFm4YbB0tqjmWzB3j2uqyatq34KijUxndMpCVoekQYhtoJoeog6QuyoE1NfyCDkh9Jrxszi5yOcjYdYHOzPOgtDbeuky+cAM4jHk4yK9aJYYl0KC3jj2HLrefcvNr3plAhay4xBqFbvM2fHPBpLGU3BsKKRU9+fCHP2xZ9xwSdDAFKfAhAOEo4jarcJWWDcq+bC5LqdwJhmwWHHgqkwhQo0Sg8ItwPxB/FvRE4xN+zBSwnCs0B0eL41w1JZryj0NImYtmQ5Ol3KciwnSYOv7LlJhKbi3/gBVJbDZpW/Df10QwrSK2ao9t1fIzBMaPgH8T/+k+TpVDNbr7DFVvhrMzLJaZk7PJLUGR6KTznZKKGshMtFzmnwlhql0bkTIK69TvxMc2cy+7zUEhI8qqVQvMnvOpwnpABTeJZVefTD4JJcRTQFz2Oem3whGZZo2UOC/bpCwcn0ejulyiMVStm7Qts7u6xw/kao9RtJYpdfa34DxOk7lYbDbRGmNWPyWT2mONVwld0mStWp7QTvMY0WS1OXNr8+YmFD4AQKySHIBS1Wkdr4taV6h6W6vqW2yRY4n1Zd1Lq1W5vrlibq05J8s6JuKmBXpXN3wDRRGaXxPNmOlzY8PMqUvtUsDVJaaXLCOSfExf5XoTZ7pNCphxuHhByJ1KsxXoIQU4FbgWWjVc6xA4mdAjAR0exK7MWTXpgMYxoELVp3XU9BH7oJUrWFZ+kSr3Xb1tbb9ZJnaJD7EYSoKTpiknFAzfx81ETaG8k3IJiD40hWqK0VCETYlZI1wqCgl5rAAAoTtJREFUTEmctapSxBzz9c+lvp8VhTE6C1Ra04hVPs3PLIRACAwbgcidYZuRafTH2VaEyDUlj6tAT90ryw3e5M86ibuNVou/+MUvXGe7NmULmZM617OICuXwcgspdKWoJoFimYFktCw0V6UMCWPG/0EiNK6CVvNvW+aN1iEC3E/LjHEM1I74z0mlGlh3ucMsOSLVdGFt74wxS2OnTYYQJ7zISzXlu1Xu+Ekn1XiRcZXvErxqknqCApZZssaYMU7
sd+PdsZarn58fB5Xd31vOA/YV20rxsWtDLuMnZKCHjdzBti7izQi3ELnWavxA88SR6kx9G6ZZ637DTmv9ZtmsmR1yh9dBZ4CqVXrLo2C+rBJ+qjuhCD4/KUKSkcOsKtMWfBhUsr0L6DTZSwINwig1QYZjmD6GYxI5eCr003SjbYH5V5NYdBDWzdhVAR/qFvNGJ1krpVShVXpS/hXSQR079W1VpdS07aJ+OsZqaBOuTWEIhEAITEYgcmcyMiNQzjyUOBBIYno5e3SaT9uy4AsT4jq47iZQztrRK5wBFEMTLOCZF3WSvcjwl4gRiLG51BZX9p55WjERHgJXvS6mXbVbW0/Z0iZzq1kL7KI0CCbNdTMJ5YKYN4Ul04i1FBhniQWmTuCAARb18LPzwy8l20bP+ULsqK7U5dxYJpiqsJSTPTKWohIElvJy4It2cRhIBahkFEOrcAPnRD3QXbjEkO2Xz1+XNKsOz1PtiNBpHnvfPNijYi6gMeTo2aQ4s9Y8GRUB0SALbVwwiq/xqwn9NP4kJrzxrzRD5u/RWjmQmsJeFvjkStf2Ujl1QmBICPiHbf4jZrBLTk1kvROg/sxgN7LrGSEwtHJnQXYln+4ExOwpCSLDJW9Ts1Ishfz9PwtzcNKQBS7lJUuKT1UKjut1Xg0WWuSeBGHa+UIU+hbIoFG4EGgaCxwAUgo4AGQn8BmUGqBsanfklNgH3wmhQGBJhiA1XLtTPGQTqcThoSZjL1IgUCK9g8uHDCKtmg63Luink5FOUgNu9SQyrOUr4jZwr4RVomyVEG1oVAVfjk3IPsm5tuJEcc+zgB0/BwXD/2F3WjAWyUNcR24Kla+jRAVkNEX9cB1VhMVAJI3iqUKlKFoQ7NMOmPI6JY7AaHTK+W90xiqhLr4Ta+WFKBcGakvCpavsSwzO2uajRBpsTVZT2MuC21t8eqmZOiEwJAT843iojAwq/y+9dEnCE8vkn84/JuduL5v0WIefW+aiixwxTf/CPW6VaiEwdwnMiPobrZ1W7o5pYGKZW//J+u/CRRIGO8pnIzpjgatGuX9ygoOp9s0ei2s4BxEBFAnTziniQySpyZGjhCuIPpAIqYRwEbshIJR4vkUTa+CG4ViS4EwJNR4j2osOaG7stLlPKRWJtPVzbL4lMkvTaSI1E44L1c5E6QlrpnC2EeDwoOzlIVHMPKNtw/cf5/bytsJR/MnZ6TRV6d699F/itpi4Rz+0Bi75mF0COZO0tiA6L4DLZ9Na2H3Z9Ynktu51snYsCcS7M3dF21xtnYeAN4VkqRfd0St2x4PipnSnUUKHDJKL48Th1l/eFy6WuldTRMm5g89G8EgL1Uk+GGkTwjF8PGJDNm/unhUDEqPpHIuWSau2clpKnkpbob51FrbVGcWfnFviZV3ugDUoSEdxaEPeZ9KZP1I0cG77uuRNuzdKGpm7AgfORIKXRDFOVsPhZ21t3zWGMKvU+9bCEV2WK6bngr899l8emGx02fdS6JpNXFO51pJX7oynKacpq6rQkzbd8gkm57RNBHxdcTmDySBsnEPuRXBidCLiFNdC02wWQmDGCSSYNfUU8N9UXktb1VbrW//Ykm1dWr373e/mmHG7Cpki38W5w4Mr6iky/MZSSdyxXE2p0NZmfk5IgKAcOVZO98KRjAH73eQqTTi64Sx0xe9AFTPlUOR9FB+ZUIsPqvMQeeKAO/jmhtzxryoE7DqEP4O1bu0zN4ZM87YJ8n8qNu1ixqVOE4txO54AsescsV0tmF8NOjJdoHN7MPCuZOqcoKacM5tzBiusk4P6TgIUCbcH7ViF+mPUMvfdvufsobXWvjXLzieSBeXMadC8uK/TlZLoLaevs5NVrqY8WYdr2SZmTQodGWdo7rV0naCaPHqb6BLHsz5U7qD4Mte1aa0Eu9qdE5dRSNdzQWX4VWiPuqfn/g01wnXtLkUJhZSisdcDCBwhxihszTlEJKmDTDOELITAzBMYS2faTA1KMEX+Cm+Ns5J
bb6SwVE+kvHCkc+26p4lTfaa6l/3OSwLO+/Xv3fcLNOZlbzv3VY/TrMgI8996B1xn5c4Sm7CLPhZa14rMdjZViXEiTSK/rZXblvkSJMA1j2B2Y6BLC4KM/0kmGd9qpdK3bdX8VJ9dpw+aEu4N+SWmST5ZUyiDTcCLTlXuLoEK6/DIelaeEkFVHhFD4PGq+VVT4j/5Uv/aEuzk9YsQ6Sr5Ul0SAfd8SJc9NIGaxJDduVWTV8ljtzhLyAiO4aYPbQs25FaRO89BJU+OzJKWJ39OXpqavGIcw3LvLNMZnpZkQea++yc4WqhVm6jj6RWy9MgUFZqzkNedcou27k78XQ/rVr6mnITynIvmpwXPTSCGuF09EoKEqoOccw6xqubJYXRVE5Fv3TbL400gwaw6M4z5t1Ond+XUudJZtRmtc4pP8zMLs4EAz790LqnTLn/7Hq9gKMPDXLXmZTNyCllZxnJCXwiTJq+c0SrBIW+sCdYI5cgTd7aVNOYKXsfcvsd2uhaXDSaFhfeCwVaBe8NaN8a7shfv4P9g+JlthzcLKiWWMRPeZdeJBrZQiaeKecivi3uuFNbdvwA/AfUgvZ0ngNrg7ERDiWoivGy8ZU8k0iBlINJUN+hNiEs/NeJRRgbOzyFPhSmlxnheWVYOFUPT23rryIQtuA4hawiXZi2DTUJxbxBhTaF7EmXzGL57DmhWMLlz3C8pKu2WBbrBExBEedxhgKrOe3yACtSPdHvV9BNJlze6WtJKy5woQBFbFnTYtRDPCteO2wwdIa6OAHQthEnTjdYFk2VbekgfKDwHA3TUISeNmx85jCkYeG3CB1MD9OgsPiR+HTk3HqLhSEDYh2xyVaadxnPTuiPLDo+Seq3lNqzjoSlUoj9cQdDh4NOsahb0s9xdTUkWQmAGCUzsO53BDo36rp0FnH1atc6oj2g299+pnI2nLcqWWGCtuSJcHDOEJEUDhz7w+AA31jGHVcicsD1iHFYpYW8YOeEGwQuGmYaw4KJZm9YKOljFH6CQh6NaoB5crLtrph5MoFB0TMzC/f8eHqjc3XxVs+2bEbLWtqIMavp4ZKU6PA3umHNjHTkibERqKHR7jleRu/uP3dU4H4lj2DBZRGtLOdUtfp6NpFnxCy5M1pe1VsGTongRdEnevWc3u+i3U/LFjiTjS6WXTKZNNWkd/ScIuDlZdy4ZhYi5GdCzGGSHuIOvizTUuF0TNDakb2gLhpwD1aBsSxe6Y0A2iTa7fHhoTERTgVwTSOIZavZrpJ56QPxRMG6rJBcEsxQSr0QA3wxnDKkBDlvOp0JTVrjK8yGx0jIyMuv5UYhU/VFZoW0Zfj23dzqJF6QKCSNZwHQYCMRc07G2BQMnjwgR/XHhBKMKjitjr5q8TaVRhLpMXxXaV+UFcrA11UyNY2xCdVJ1HAy6Wse5g7O25a+i3euQcDuFSBnZhADpRnxzbjmeqR/Hs2Pb1PjH0Q1X+UJp1Wy+Q2DGCUTuzPgUpANDSsAZ3+33dWFNDeili2nnfX4OSehcC6IJlZJlmZeFX4FtkLZZT45uG5Wzv4tswQhuCa4I1/quy+s1T+wEg6E+WaORUiHu4LU7V+qcDXwebtZTgd2lDKxi6uiJVp9E6+4ER/RNCV8C88a2cUv4SU+INRBk9kLGMZ8KZcEzzBYkmRFkQh6W+XjoAAvq881UnrKtxCx8qCJqidBRwUMmeYlUZvk4h3hKeFD4kygGAqW1hx67QMApJ4lsWObZiKT5o6qEy6QxzH62feQPEXAKmXCOpXqQQVudgf80LpKIbnAwGK/h4ElkkDhd9oWDTchWIkaMSU3zBakDgCbjKmt8HoQpBw/PkEEREJO1iaRnOpBQjhPi0iGhpgQdezFr1CTHWPlsiE761cMa9JaQknHMK8PlzGOnA3bnBkaPyqROlLjZ01acWBQMP1mpGQ/mNthS6gSfuJt90WdaNq36Sa3aL1VkFvSBnhOs564ziZSu49OR4E0j3EvCZDXLk40
r5SEwLwkkVXle0s6+RokAjw5TxCeh0+IXvokbmSJMMgvNEhA95ZthSJziK3xAW4iPCEx0DtV1vw9j7+K4zIAHc3P/uMR3ocxXZENPOZIQalt3EjE/KpdzqHQDX4LcEc4VptG9MDRT516qpBwMTLW+udpmrvRZz2kszgwig3Gqt2Cqb6RSWetRSZM1qFxPiBULbB5/A2dPVVZePozWbQ3QrltLDIE2EgtjLPknqikOEuiIJAtUI+Oqk40aaN2cUedi4UYiONjmyqRRgWPDLNiXPRqInyi1btgsW6umvRME6nDb0Hy8EVKV7ZFfhISC19zxfhXbitYBaLD8OhSb+UK+YtO20pT+8/qYQWyl1/hQOeQv0WAqSUMdUJ9wpDYoSxq0YnZGbSC+9YSq5nVrutq2YLqJEqrI4ccFiGG1CQLZ4SilRO0OVS4igtX80qPUBiy8jNQPjW4reots0n8QEDBxtIumHIF2UcckZc+HpNBEOKTNi2WyhlTlrLIJdVv3YdHQxqKEymkSurkSaSnHg/iaDfMJgeEhELkzPHORngwXAfaYbXDuJh2c7nWO8XDVyzbQOn460VePeW5cXpMU6jfxi8kGwwy4j4aNFA6rxxMwFcwVGyl1wxU5K8LMMEjkiPwPNph/pTQQ0+XmF4mrjDQfEmNTdwl17ku4rSlkCAkFP7XJQkv+cBcxH1IT9Whqdl8Q7eKx0B+6gWaq+4Bsok2OIj6DiuEK2dBzZAcmbKoPd4hCWode5Aawd+NlgIWiuB8EyKhAVp9zSw+pCg6kzp5QDPiUyuFA4lFQxy5swu0hsmZ2OK60I15GkHW2YFUZcl4KasBYTBb9xOWmMoFChTD/plvurYwcLYvEEQ3WgkYgkjucK/QQ0aOQtCX7yjXFzMsgNlhzJ6EHEPE1Sc1K1KSixBNx8yEsaCOFxqIOFWhHJdcUTvihmYgh20LdOjQ+PN4dMqWUim1F0ttakBgkm6oKJVRZqOxsC3VUt9U3L52PvVDHcevTWhlGscjWklouZ2FneUpCYIYJOGXnEwIh0EmA9XXNLYrEBFbOqTqMLvtXlVk+bgDLnDSsphCVh1DTKGIHTWsEEN9A89MCo+hy3L+9e2eq3FZSHPgJuAe4E1hi5QIibKrsB+W8PuyKQs4eYRSJPrwgtILgDvFRjbR910OxJbUw26InLs1V0D65JuVF0IpN4mxQyOSLUzDALKUISLUjDaXSO1zcu9znT1LOKovFsJ2EGn8GFaXQqIk/2xJD/Ap0mJzligBqgYGkRey03pDKbDP82pTkUT3n0SE79JCek+TrQ3xUHzq/qZmqYy/VH/rDACUJoUpj+Un3VMc6NydBbK6+b26SqiDPuhoRWJxsw86mUhICITAZgaG9M2s+PZ5hwZXdh8BQEmCbXU+z3CwoGykWQADxx1Ab3C2iG3wMrrYteEQeR4u8Ct4L/1DCFtJrjImjgu1n7IkhoQFuA5faymXkaFwab4WT5K6Wm8cqYQJ3ArvsFo8Qz+KV4aepi3uawE+uCyXsPf+KK/4SJZ38+FpIEEqLlCHLWtNg3dZELbl1qDojiGNHBiJIJKRFylRrzlnlyyFE+EWaXRA9lls9Q4bMu1DnvnsdDfOpUIWtC00LWQiBEBhjAi6rODsN0FVWXdoNyWATzBqSiUg3ho4AuSCtmNDh0SFZiANxBELEzUH8MborVVOMRjU3rYjs8NZI/PQtTiRgQVKIU9icG0OIikApecFb47lwPCJN6ow7ZWSKlFzg0eH/0LjohmwVsRIuB9EHckQhmaWyEgG1pnBCcLqhXLOVBN1aR/JH60/BC5Ga1pJadsKqc1bbqlahU6sqktIIndZCy7W2rZH8DIFOAgKgLi18WyVJvP5fOqulJAT6IxDvTn/cslUITJsA9SOTQxBKLEnohLdm2k30vIGkVLcyiViJ1PS8USrOOgKCiSKV/IUyq2jrSiqaKQo6w5la6f9uHOM9nameZL9zQiDenTmhl21DYBwICDC
Jc/GO1N3Ic3VIcnJ5lconNFd3NK6Nc4/xe034FMdxGrKYpsww4piHksIQ6JzB0XHnyMT3byJ+ytk5gz3JrseSQIJZYzmtGdQwEnDPjvtoRMSkyMzt/g0kECBqJrIgrDa3e9vZPmsn9ViekAWpza23I3VW7r3Ejd+8a9KwPJposq3EHz0OgO13W5wH0kxWbRjK5VmTzm5Sq7vD+uiShK3jjjtOzrs08KYRzMkOak9MttqsuZBYJtNLINUdgpPNiGNGHRFMjcgbq4itTRRKR1NSsVrNcnbqv+x7dxdKX+NYspWDTba+tR5G1cdwskkIdCGQxwx2gZNVITBgAm7DngdaZ847zTi53JdA7d6uOW+tjxbkcUtOEtqQNVXPXeyjkbZNJJ7zZEi3qmfJtK1tfnpkM1XkBjQ1m8JhWBCdrCcBVmckhHn8oMR5T1SSFC8ttMpFprxw1NMKCLvKK5+w85LraT7zKyHdDYZu0Ktn50gy85Qp2D1xqiKhduHORI/ctCOPgjQv9YDKCZsVqLVtTVw9/kA1D8zUgrR9t+m5R89tdAo9DUg8V7zV0488j6CtNXc7tpXkZwjMKQEaPJ8QCIG5R8BN3R6077Erk900Pvd23XfL7JknKbNbPBwuyvtup78N4eJXcD8ak+whPV7U1bTD3jPzfA9NiVvesCVNOgvroX/8DVXZ84UpBoqn2daCSeHIkUqlMn9Ds4pikITe/JxwwQs4Rbs8XMeNeFgx3nqupnCMZxO7Ha/1kQR2LfXbg3w0+7a3vU232X4PBVCfk8MDC2Sm117cpe9OOlpEmrlEFoUOIeLAXEivcW+/CJRCaoYjhLPEnXfcLdqszb1LS7lCHj6vnqjCzm/PLfR8Rc8D1CWcwVEHQ54eSTPepMHZ41FSCj0agEZhaVT2NjG0eWs6G1TCReRhPJo1IrfkCI0pdPxwC8n6t0zFascjoyx79pLXi5JZxKUnHimpj3x/dURj/7cgf0eMwNDeiP6f+0XzCYEQmHsEXHk7fUujcdk93b0IplRAZ7obDqS+QIncamGFKVtjvOkGN6w1z7OZcpMuFSgDxDxuUR0OBo/2scA0suj16GcOGP6naoFFV9mH58zdcFXIkVCFvr21uwqZVT+FFOtnffNYKCQ7fJcZrvJe5I579CCyoQcGEj08JbYVC+MIISD4MPSTh0OhUVA5bjvSQ2vpGIUkBU1pwaeeZ2iBqXDX3r1l93ghSW2uTVKDv0053SP8VBWAqsL6Wd86U0854u+ZTJc09cX16J7mp+Hw39TPkh1UoJ/lOqLAmpoTLpBHstPaVpkpjqIqLEcdOeioJnE4gdzeaKd8Uc1WrfttCrMwQgSGVu4kmOVklU8IzEUC3PuMmUvztiQYjgqBg2bHLoI994/rokqc3Tw1x10zPAeMjYc7kxRN5bYFFsJ1s8f/eGaPc03dyuu6uQo1a19Eg+tprXliIbtLOpQO8wImF9+8FNIsmGF3xzSNCzToRvPTgk34DDgPPHhXDEKJCBEj58UCkjA8H1mSR2v9/pZ1hmvBLffekCDwxChqx7gqasNksqCeDqBQb6VD8SVw4XiVQT0OmAhgX6kc2ovzo56BpDJrTTPh3NorHaZs+Gnq3Rqtq6Zc9lQCjz7ihbIjD4qsx0zzM3nWAK3DY+TpzOBrh9zxRGMPbBQwIstIQ4Wtt+g3y56z7KmJ8ma4tegn8khNm3iDhClzCBksFVV940yqhdZvk6gFc+rg4W5pXTW3lx0GXmRRt1bZF4+R3CB9dox5/rWSkjukP0oeWMX7RRc6bIhFFwOt3Uswq5VGlgdCIKnKA8GYRsaQgGcGkgUuQ5lSYsVlNFeHQpLCad1zQXgL5Fdy3TNOTGxdUrNqzCfrRQ14wg17zOIyAJz83oolSYI18sRkjciQUI0V96wdgsa7FTnw5XKyDXWJzABohO1k5yy4Cmc+O0GzrNrUSY8xZNFVYMLV92BlxlJnPOyYp8QLB+TieHejngtYGAhLzO5
SQnZEWBAZOiyMwnlQe2EyW+UO54d22CHZG/qvz5wiHpno9RfecE6dtOm5zq72WEJGEGp8ABwAsoZJPR4Rz7YW/ZECQpYBKDiiNRNEzdBD3vjBcFb7TKxYmNiQ2aHw6h0RtYqKaktn4Qu58MILKUtNeaxRjz1squFjyiq7tgrNKWHKJWZHnj1NtVR5a50q4bCxoAWbmIiSs44xs+xYclTwzYhMqWOMKoNslVkWfaucKtE3ecrmFxMHAC0lVEcgem6kqBNtJ7mn8X7VTpvveiyC/cLlaLEvmzt+HJDEotQl0tbLPRz5NtG4b3UcyXprd007rQsOVIcT75T5kssMrG5wpznwvBhE9I388l7S+l/QiN7S9x5cDoIpIIC0Zr8OJOFUUtItWuU/a91LlkOgTwKOs3xCIATaCPCXOGX7p+Lb50hgPisPgyFxClYuN4Ke4CnhaWAtmIqq6SxfKQsEgWr8Da5367qfCnHBzaLYF/tR73mgSJhGWzFpDB5zblmhOpwZFexgkJgZ5rOtk/WznP8sDf+Ni2bmqsrZEg4GLh/mkI1RKEuaE4Kx1DEN8sdombfAT5ZJBQaVEWICqwU6g21rglmcPWry7jDwLtCb1JZ6biFzLoTU5RUQ1WYv34SdXqnJdtJnUkks40kK6LZlqhF/C1SRntipWaAsq+dUo/lim1Vo+0iJpW+aQspSt9FGnlllj5tVZGIzwKawc4HYpUgcAM0qshUiLeNPalQcR7avDtPHZIo0neY9JISdJ1Y3j3kkZbRTr+Ii1KgNRw4alTojp4cSNQtETO1OkMuMwOJb+yCIClWJeXQg6V7TsbaF0hbmtzbXkwq20h/kiEb0UyzVVhx4JLLImkOXcHFEtTXV+tPmRudDrJTmq7XGSMQL/9VPB7P58jI12ULlELJ3U+ZjwSwrtMoB0Np4lkeCgFOQ48fHFdFQdfg/1xb5hEAIdBIQVvAfW3kSDJWrz6pDOjCxrUkqLIcXXVVKL5NZiQie2yYvwenbVmSHppzcGWZmtdqpRzM7oUvFJX3KVPMxiETUciN3OvvWVuI6nokVzSnfjLWshZt7vceKP4Pngw9AIY+Oml3kjjdUaGcyuePpLEYhd4SVpTNKQgmciVM4wZ100kkeKdQ8HtruyC+fyVRa2xBaf1J+BA0V4uE3XjsvBmQtuaN9nacaCSzOKoXkmgVeGaYR9sr2ZSwNXHCKUFDI9eW5ZypTA0Qh5wctxe5SnPxbjD0BQbLweHkfmaZ0WC4wk08zMfk1m63da5a9e4s85dszfWQN5laZ5XpkNvJcO56gXfWpBBKknD3qV6FIHOAGKNpFida+pMt4lQedhyodpqa4JMtx/PHHC7qhjXltzrEHiw9tJE6kkAfOoJSob9RNzarf+s19YiuzVptzPTZrwTEFpbkV6kM13lTIQgh0ITC0cifBLCfwfEJgAgKc8Aye/2rrJE/ssMMOLCjzpoSznRluthHQYfOoCiUuzSuqwtIIsrQ+ptaGLrjlW7jRhtFlvdSnnJg6gRumiy1k+cTFiCSmsWnfQnXDtXhrYS27StasPAlt0iJMqTiFMJN27MuNPLw7LtBVJlCMiAizTK+w9JqtlllHXiuhB+JAl1RwfzKRZy23lht/jN21vruN9FYwy1ofV/z0B63g2l3LhAI+1Ss2UoqJ/VInrRBqbfdveNl+H9V4pGgCC7SO9t2aZLAocZ8oVFPqDNkhpRc0zjOF0PGsEHZ8OVAQeZxkyplwdWQaeR29ctKQ+4RHynypUJ4VGghko+DSE07y07zbdsIPMcQrBqNq+qa+amKCJE5nffKXDlMOGrdHVeAdNH21XM4zy+7wojursL51z6e1pJa5psxIa7me+7SWTLbMG+Qz4VqeHp9mlQHWq9SbkiyEwEgS+M/ZLp8QCIEOAkwjAyZwwBnAfDY3+LgEd0XOJePDltvOhTJ7wJvC9gtp1S1FkhXcnCyaoJBacnYQH+E4oRXYZg4StyXTN3w8XBTiNbw
UBAS3BAdJXVizeTInhFSYOoWHHHJIRx//U8BjJO7GPAs0uHGp4heyWyQYyW8VgKBjpGLII+HAsEw9+LY7twpTQkSAvlFFeq4+HaBNAQWV7Z1k0QG+q/ITcKKIa5B0PAdcL2r6NnY1dZJrpPEMWSWnR5ukgOVpfQDhWuPkINea6Bg/hH76xryL02JaO5r3lalSMGFxO/e833v2GALzgMDQenfmM/iRlGnpdAjMZQLstDwPwSb7YdHrphIRKz4bYYvaOc+HG68su8oXwiBldt99d5fvtVbAS64P34+LeM4JcoQ3QjuSS7gQRGR4SrRfLyEnDiRyuuDmdajNfYu/cD+w7lqgUZryAS6IYhBG+l+5SgNsebBNSX/hRtJJ0RzP4R1s42ltZgnwjfkHmdk+ZO+DIuBayLWc1viMh+qN6JE7g5ritDNuBMgd6ReyTYU8OC2EiozQ5QHvgrNzLbtThqenRu4WKnEWJfWzvm3IS9R5V05rnZldLrnDvSTnw61YM9uZLnvn6ZEjxdnGjVRxwy6Vs2pKAuZdTK0im1NWVkF0kuLkp6z0o1426bEOoe+xyyKtQ2Uae+x8qnUSGFq5E0HdOVkpCYH/EBA8onici911VVpHIa+MG8t5fXy4eRqtY5WkjTato5AAGmato4cG4nYhidhNEonCIfwIqxFkHFE9ah1RucqVmdtjcZBw0dld7YjA5eEjkT1ooNm1HCbOPIUyiprCzgVBTDfbS5TuXDXYEtLcs6DknkuN77FlXkaR1vJxNptwdrorrflZC24VdItfW2GXn9AZuIcudqmTVSEw5wT+k5OYTwiEQCcBCbZuEqZ1xJJka3ZWGI8Sib11n/x4DMco3Mck4CVfWHJV3WbfOjSBSDnXQieebjdZrm5r/SmXxSsJFE8fqCfE2KlUdMFBGsjT82pzqVoUhhwvCeldFA8xLV1MC1PudA4rkOZ77bWXrG39lEvUS2vSvzg73f3XWlng1VMJJKs1hbK+ZE9jK3HNeFUouU8VuXKQQW/XTWX61d1zyDQlky3Q4kQ5D5MOiCyTmDLbuFqV0JdmU/aY/LbJNk95CPyHwDxIXMouQiAExpsAI+dGqtab82dwvASNLB9W1u1Fnd1gg91o5tRXOd2dFbqUuNVLJlZbBfJFgyROazkZ0dxt3pSLH/GpND8nW5D7JaPZLeV0W5OUzQUitasKxVIn21Y5KWAiJMiTerwmNpROblu+LmvdA+hW+dqcz5ILSlflTcsSU2geVfaTh8YDEdx6Jr1MuVvTCRd6Qgp5bUtnEHk0jTqy4Ckn21rl1jmijRNOPjslLf6oUHRDoMq9/R69421iTQsez2OaqCjT0fn2iarmWwqIVHo6ybWH29ZAoH7cC2lHImtUlG3d18aF1myShRkkMLSpypE7M3hUZNchMAUBEQQGqW4In6LqPFnNoriTq21XTLKIGItVj8dtWzuDP92Vxth3dsAdcPwBngfYuaopcXc6/wf4NShBHK+XMkZPz3OPntvulKhsrVvV2OBmw2ZB6jqnTvPTgs7UQyZbCzuXm3dE6GQ9MEkdD5C0dx+ORo8b6NyqKZE7z/zvt99+MrE8DYguqQSdkneSn7Rf4sas8cR4iID7+NRxQ5zHE1AqWqAk6Bu37zWSUcqavXvyZLMjC9qn84yLxioRRk1qwRMNqhrXiwXuQ/c2WqhR0F6W3UtYTyykzLQ8mdxxdNGI9YBvW3nWIteRBY/bpjIt+LgJgPqx6/qZ75klMLRyJ7k7/zmD5BMCM06A+eTYb+sG8yMgUvdgt63q+6eMDWmh/W3uRnSX8p5kY3N+gkrZ9ixBxpWFY8CaZr0OjKljgdi2ptBZWKTGvfe+uQ3cZN6salvQMivISUNkMHi1Vn2v8pAgJQ/GJb5CD3V0wxpza9nbLQRc2tpp/cnWeiaNpGyba79LgFLfDJOwk1DiIT0akR9dLzgzF9wJxqVEOfcGf089qah1X9rneOjvDjIqx9szUCWtPBrAc4r
FxYgVzh7agopqeypP634tWyuhmFtFV8V9DJOSaOpoUNJ3TQrxSknwTlE8nl3EGyS7yDONPONRH5AXeyq2NvdcJRqoaceC+BTZJDvNCy6ateUZao4EZCghTzIkjPTc46HrQUpEGK1jW+2YEQ9EaHuXWbMjTimP+qyaTaEFmXNN0rQjTbMDCU227iLLY0YgcmfMJjTDmRkCnpvMGcDIsc2+m04w8M7svpuSWlBHeVOTt99TdCVHW+vc7WauqsaRYIHxq5/1zRAyxixfUygHQiFPgMto1rEpb1sQxXAdzJYLKLigl56sD1WHu8JLr/gehC2qxFMQVWbzGCqBqhoCj4irfFf/njooPMFiqSzE41l8Mlca0+j+fG4DnZTcqlq9i1RNL7rSoJo8ItwJnlRU+2r7FsrRGe+4kKhB0Mj8VYG+EbAgqigehrneBsrQCnDYr4/n9JQGamut+cmxwax6sDIFwww3Kqqp0CwYoMcOEaDkppEql5MuR8eCpxaJ/njIYWWpS3zxbMPOdF0zK6mlNU+laXzKBQ4V08Gce5GZykQJgUWhcvLJEKIyaYsujajDfWVGLEirV7OLsGttR7OEhfmldVrLJ1vm6eFnquc2GW8dIbZFox4CqdDH5r4dD1SUgfB4wQK+uSvZ7Ugmf9u0VLNT6lZ0j9b0+AaF9lLdExoTLPPPolBTDvt6DHqzYRZCoJ2AoyefEAiBTgLMp7O2ZxO7FPbwfmfbek0EH4yfzB6fbbMV6eBfyxN0PIuZaHA2t4ovhIJhRbhDyBElHBskgkRaddRnVhW6/HXh66ezNiMqRdpzCJXTQFz0yllxP+ujHVkRnAqsqVCLwjL8NmQmXSi75GU8/rf6//dXz1nxCmFY8CwfcseHrKE/6BJhCA+2UWIzL2RggQRrPBOPaakRyf/1JEMZIUy+9I6KiZQ91nnRkNqf+31YKXIHMa/CMIR645gdMeTqeFSjzFMb/n/9+98f5QwnkhSQRDSKBR4C7RiCZfrPMJlAy1wglYzCDSD+oqT5gEwG1U9WWY4I1PVTHEQ0pKnZtqBlySImTnZLM8v1bjJHRVtls4BbWyEzbO9thTrTWdhWx0916AMLRkSpOGY4RcyvBR3DXGoOUdK5oRIbyhAyTDeZO7oQczCYOwdSjZcyVlhRNrJDtNRWJVnIBZnUDgNq0kFI3tFbUoBrR/WoaJKxdb+yeQg+q8hTj2ywSsu4ccsp5ONxYCssD5nZJ14dVB6ErdArLBzDUrPrVrsux61BmWXdVlkkruadu0uJfvovK0nKt9fatyzPFIH6/zU7kq5mqg8T7vc/YjmfEAiBTgK8C/5jmRkRAc4A5lw0QTUO/8qvJALqjVoKncHV5HtgxV2zusBly3kjPHiQnRb4EO5RjV2pq1iGmQ5gGFzaKqce2B7naypKlkNjpMv/0cgd1khnhEic8UkiV7csPQMjK9aO5Ip6Moo+d7GpnWZYaqpNNGIVE1JXzLokxiGCwCbpm/uKywBzzFA/1rZ9eEFUbuSOtfSfDS2UWSr/EAllQUxK6EH0pK2R5ie3gS65YacpsQAj21/5tvwuaJfp5abi8FCBtCph1GyFQ0PSTJk1ppHtV2juBKqamq0L1tqLcJWTNe+U4E6tLbkjgdcwqY0arNaoCprA3gmR2p1pEn4yRnJZUm1TaCv3+qlmwRHSutPWZYcNaSLh2nstHHi0o257trX3ljgSCBSxKhPdukmzbJgOD04+qpTmIIV1wFqCifIgC8hZz7pULtQly5heUciTRPuaYlqcBFGTW4gipIl9mw4RVWPEHDfZM6VENWuV+qbef4eDpLqhZajNoMHqvEL9J3SIFd5KTJrjhBfKP4sb5XyArc0n/LYJ56X/ryZBx6T7X/NBngbVjuNqwm1TOI8JRO7MY+DZXQjMKYHyPXgOimCNtgSYfNMrjA1vjUK2360lZcwEGkRVWnfp8poZELuRS1HhGNKEDSN3XOZWTW9dKFNtc+aheU9F0w67rry
RO5XcU84bOokF4k9SmZbiCrLAQqvfRe4wP0bUtG+h5A5PjEe9GRHvSzlyrNpnn33YNgKI36gKDYQhZPhbW7DcRe4QUlSdplTj1sKE/4Ddamuh9WedLmWZKBRco0tkhIAgOYOXiOiUcUL21SY0okxhDRoX28yRQEmYFAuUmYBOczuVyQKH3ip/GMcAN1Xrfmu5MmNoBbqTGqieW0XE8IFpwYfXR2qRQmmz9xb894stV8jh0VpIQyisfGHCl3NOP/lgFE74kUnDWWIuCG6yqYSmAJ+Yjmky0Qx8FXZuThbwstAW5IsoZCOq1HfsUZAq8P0YeFFqColdrWnZx0itrQXflnWDbLKhZY3YqnatWf8X9Wm61BRWm601m0Ors+cpGRsCQyt3Fmz9t8xyCIRAQ4Bhtsy6Uw8WWEffLjFlvLIHzIZzN6eO61rlE35cmqtvlUQKjXDXsxbuMfEazqrPXeEcZ5kh8V2On1rV5Zt7wFoKwJUuG181q7ddtqpVghqskW7Yl5iF0VVihxI2kn+IM0YFkoIm4FvyknA/622mWiB3VHAdz48iZ4We4LBhOxljA9EllPiZ9FCWj2t6V+3WEh/Emc2lejCBXpbJZUJmsfp1E3JbtzWrfcqM+0HwTp+pOh4IXSI+ODBoRL2trcgXQROy0r3QdiRF109ah0OCG0PYi8EWWSMy6B4JUsQTuUmEkbOUXNuu/SRxnK+NWmv8K+5Oqjo8FvrPZeWnqax3evCxsf0Yqu94qEKpUcSu2cdEZFCE0SYcNkQDnWSmtNz6itlqv/nmV2uWmwUTUZkuTcmEC4ZMoVpVvsmmjunmhml+EqC13Hb0TvYIR46ZZtvWZc3W/0Wz1kLvha1bZTkE5jaB+epsO7d3k/ZDYEQJuJ4WNCnnvCHwIsg7YVw5ZoQk2DBPH7FWEq4rcmaSI4Srn1nldWDLXWS7TKdLKAm3DrEuSsgC0Sibu2VGUoIbhaRNsLJMsoxOykOCAgPPiyN/wjPc7I5h1iadZO+8BRohF9hXWUSkEtFDuHDMuPKudgQU6ubwTuzCTISOPgvZWOZ1MEDpIH4KozDGbDx1opAzSbCAoBFo03lmTGticBSMZf3Uf0ETAS8uB2ZPf2gIASCuF9X8ZHTpQo1Us9KEWXqt4WBo7pPSzwk1hx1xn9BbDDO3k9HVQGSk2hHIneNqLSEyqI3WkiyHQAjMGwLOBsP5zqz7kDv5hEAITEiAJ8BtIPIVSI1KkOS3dyM3C81JIzDhwp0Pw7asuOALK67QJb4btRSy1uJNFIw0Dq9DV8IZwFRTFaI/9Ap9Y7myX20iKCNe486jymuWJuwnOaJcykXlNWvE2YRvgFCgJPwkdygD2ohGIUE06yf/k1UTfgzBrch0Q2V1VB0b6kZr9EE5v0iJqgnbmW6hPXIM6KQN7drYSa4a6XSbSv0QCIGhJTC0wax4d+aN3s1eRo8AD4GYCBeFD6eF/M0mdYNY4Q7hPmlefm54xAd9w3VB94zeaOd+j4VyOKWk3ELkZngaTnJJBXrm/s6zhxAIgXlEYGi9O5E78+gIyG5CIAQ4daT1uCoV2xKfSrwph0QIjB+BoZU7CW+P38GWEYXAkBIQBPQZ0s6lWyEQAmNNoKenZ441gQwuBEKgVwKCUO456rX2JPXczO/hjZOsTHEIhEAIzBUCkTtzBWsaDYFhICCLWY5RPa1nzvvj4XUesSgJek6acgub+8vcoD4njWTbEAiBEJgugcid6RJL/RAYGQKe0eee8LZHsPTde7e4u1NMBnffLUj6dvu9G909J7DvRrJhCIRACPRBIHKnD2jZJASGjoCnAHtHaVu33K/utnnPAWorn/CnhwN5J4D7pyZcq9CNVB6n4cH/Xt7kXlOP12tqemqfJyZLUWx8P/KRPUzInWsezecW9KrpIdHurhfJIpuabS24P98zijy7zwOXW8uzHAIhEAKDIpBU5UGRTDvjSYDZ9kwdT9VrHkQrQuTGIo+oMWD3FnW/vci
2bkfysFqbe/auZR4OPz2hxyqNK9e4Qj89/EZrHk5jbT3Tz9tJPcbQY3u8h1KJOl4T4eUG6vCyeI5wPc6L+JBV4z1KHjNoWy8xeOlLX0q7eNmFb0/oqfu97U63vXHCcwI9U8cTDptnDHp2H7Fy7rnn8rt4i0I9annCGa2H9lpFynjool7VIxbt0Qu/vL3LcxGt9eRlNXXeE5n1wVOLFNqpJxi5Xd+7J5vHInvYsXY8sNHo3OE/4U5TGAIhEAJzSCDenTkEmM3HmUC92tDT9z0t0HsSDJWw8DYDjxj2NgCag8PD2wMmQ+CdhZ477LUG3tpYL22mVDx10EOTbeL9i16PUG+EIBG8UcHLkrxuQvvlOPEySEKHRqFjFJJZNIT3anGQeAiQmt6OSaNoimrxngSSyHMRPbnYc40VelGDtyVYqDoWvHtBNe/E9moFusRjoHllPLKZuDGuM8880y48BtBTjFWe7MO1w4VDmQFiX6rZyhsbPNPZS6l0tTZUx3vCvQ7Ceyfstwq9kMHTqCk88q5p3+ae5eiNY7w7NFNTnoUQCIEQGCCByJ0BwkxT40aAy4QvhEPF+6G8CcHweF8Y8i9/+ctcGt7RTZeUDOocOSvuBQt0iUiN7BneC2krHB5eAeHN0nwbPD3eUOGZe7aVTUyseK+CNzY0TanAAUPc0B+El5iUn14u4d0LXimlZY9v5hpRn2+GU4fTiGggreiGKmzevVBtUmYeqUx+ET3yhT0A2h5tS8/Z3DOOywXFB9P0oXPBtmqSejQK0aMCP5O4lYcrGpp2ahOvpzBqr6+iqyiqqmkVqUfnceQ0LROCEHnphLfE009G3azKQgiEQAgMikDkzqBIpp0xJPDc5z7XK7Xd3MQ/UdEl1po+IAvIBX4a2kKoaMKRkzteJ0mjeGk2ZwaHkNc3quntE2I3FADdwMwrEYHi16lXUfLT+FkvzuR0oXI4TqTLcIpQORPuaLqF2q9NqBbKY1qbUzY1XkC8otyIZPzom1dheHOFhyZ7NRiNqE2pPKuvvjqBRcFwU3mHRu3IWyO8mKKJAHpLvHjcj3/8Yx4pKczqnHXWWVXTi73e9773Na6pKsx3CIRACPRHILk7/XHLVuNPgAeCe4b/RgYMu175trwXJA4jXeOXCuM91V4COiEOLXiDt3ddUTaCVqSAh+zxncgp5mKR6UJLeT+oNrlhyAJpLrSUF4iqL5mXVqAqZLpo3N5pAgtUlxYk2dhK35qXZdJGl1xyiVxgjiLvDOf14SbRlE1U430hobx4i/fIKtKKjKA86mXvRJXKXoIhJ8ljdYyOy0rlzkHZCwly2GGHqSP6xjFDvblB3WvSxeMqRMWD5QVeYl7uNucYsxe5RHV3GJ2nGrFoXOJWNkTYy7NwUAgXpxcRWfslpEok0YudPUlJCIRACEyLQOTOtHCl8iwiIGnmjDPOIEpoFDcTCWDRBGI00n4FmCTV1m1KknA5NrwBqg0N34k4zkEHHUQSseWerSfRRw6QaBQHifgUUUIucB3J55U6Q4LwGK2xxhqUBH8JuePGb3rFLVcSgLhVKt1YCMkLvDRF3BBSu+yyS+2XhCJ03AblJ2XDhaNjF198MfeSgXhxOgcMoSZH2FurvvGNb2hQPI6XhQ5zAxdxozKxxUNjF3wzE8qdZz3rWeJTek51SQyidezOgpaNl2LTpkLq8D3veQ+XlV2r6VOdFK7y8VN5s6r0XFVo/Uaje1ittXKWQyAEQqA7gbwzqzufrJ29BKgKeoXlFpaSdEyyCCeRL9wS7mxitkkfrg7ZxzREZQe3wZIo4zXj4lYHHnggoUMTsPEcJ/SEdBmCQ8t8KqUV+H64YThI/JTcU02RSiJiVAiNUiX2xZWiWdqrLTWnbe/5GQIhEALzngDPcd0xyrk7VI+WiNyZ9wdD9hgC/ROQ0ssbxD3Dy9J/K9kyBEIgBOYOgaGVO0lVnjsTnlZDYO4QcHe3fJdjjz12zt9dNXc6mFZDIARCYBgJJHd
nGGclfQqByQg8897PZGtTHgIhEAIhMCGBeHcmxJLCEAiBEAiBEAiB8SEQuTM+c5mRhEAIhEAIhEAITEggcmdCLCkMgRAIgRAIgRAYHwKRO+MzlxlJCIRACIRACITAhAQidybEksIQCIEQCIEQCIHxIRC5Mz5zmZGEQAiEQAiEQAhMSCByZ0IsKQyBEAiBEAiBEBgfApE74zOXGUkIhEAIhEAIhMCEBCJ3JsSSwhAIgRAIgRAIgfEhELkzPnOZkYRACIRACIRACExIIHJnQiwpDIEQCIEQCIEQGB8CkTvjM5cZSQiEQAiEQAiEwIQEIncmxJLCEAiBEAiBEAiB8SEQuTM+c5mRhEAIhEAIhEAITEggcmdCLCkMgRAIgRAIgRAYHwKRO+MzlxlJCIRACIRACITAhAQidybEksIQCIEQCIEQCIHxIRC5Mz5zmZGEQAiEQAiEQAhMSCByZ0IsKQyBEAiBEAiBEBgfApE74zOXGUkIhEAIhEAIhMCEBCJ3JsSSwhAIgRAIgRAIgfEhELkzPnOZkYRACIRACIRACExIIHJnQiwpDIEQCIEQCIEQGB8CkTvjM5cZSQiEQAiEQAiEwIQEIncmxJLCEAiBEAiBEAiB8SEQuTM+c5mRhEAIhEB3Ah/+8IePOOKI7nWyNgTGksCCYzmqDCoEQiAEQqCTwGtf+9r5589VbieYlIw/gcid8Z/jjDAEQiAEisASSywRFCEwOwlE5s/Oec+oQyAEQiAEQmAWEYjcmUWTnaGGQAiEQAiEwOwkELkzO+c9ow6BEAiBEAiBWUQgcmcWTXaGGgIhEAIhEAKzk0Dkzuyc94w6BEIgBEIgBGYRgcidWTTZGWoIhEAIhEAIzE4CkTuzc94z6hAIgRAIgRCYRQQid2bRZGeoIRACIRACITA7CUTuzM55z6hDIARCIARCYBYRiNyZRZOdoYZACIRACITA7CQQuTM75z2jDoEQCIEQCIFZRCByZxZNdoYaAiEQAiEQArOTQOTO7Jz3jDoEQiAEQiAEZhGByJ1ZNNkZagiEQAiEQAjMTgKRO7Nz3jPqEAiBEAiBEJhFBCJ3ZtFkZ6ghEAIhEAIhMDsJRO7MznnPqEMgBEIgBEJgFhGI3JlFk52hhkAIhEAIhMDsJBC5MzvnPaMOgRAIgRAIgVlEIHJnFk12hhoCIRACIRACs5NA5M7snPeMOgRCIARCIARmEYHInVk02RlqCIRACIRACMxOApE7s3PeM+oQCIEQCIEQmEUEIndm0WRnqCEQAiEw7wn89Kc/ffrTn/7Zz362dv2a17xmm222+dCHPjTve5I9zmYCkTuzefYz9hAIgRCY6wQWWWSRX/7yl7vsssuvfvUrO/vXv/713e9+d67vNTsIgf+fQOTO/88jv0IgBIaGwF133fWXv/zlsssuG5oe9d+RAw888GlPe9omm2zy85///Nxzz33KU56y1VZbXXLJJf23ODpbbrTRRo9//OP196CDDvL92te+dr755tt5551rBH//+99///vfX3TRRTfeeOPojCk9HT0CkTujN2fpcQi0Evje9753++23t5bMyPJtt932wQ9+8OMf//gA93744Yc/6EEPevjDH843MMBmZ6Spvffem5lfbLHFHvvYx26wwQZ33333iiuu+IAHPKA6c8+9n7aOKVPNd1v5KP68+eabxbBuueWWX/ziFwsvvLAhzD///Ao/8IEPPOtZz/rqV7/60Ic+VIVRHFr6PCoEIndGZabSzxCYmMCnPvUp1sI6rgLKwIdllS0xce25VsqSveMd7/j6178+qD0ce+yxb3/7288777zPfe5zW2655ZVXXjmolmeqnec973m0y0c/+tHvf//7N9xwwwEHHFA9OfLII6W2POMZzyDvmr4dccQRW2yxBReINBderqZ8dBfomz322IO4caAueO/nd7/73dve9rb3vve973//+2X2fOMb34BldAeYng85gQWHvH/p3uwk4KLWx/nRZwYJ/OY3v6E
kmKWFFlqIoXr3u99900037bfffq7RZ7BXrbs+++yzr7322le+8pWXX345u/g///M/q6222oknnrj55pu3VrOMpwiCj4H4WGBEGZ2m2r3I7/azKbzzzjuPPvpoZlj9ffbZ581vfvNKK60EhW3VgaIWbOJ6/bDDDrvjjjv233//pkEL//73v9/61rcqX3TRRd/1rnfxZyiUurHXXnuRRy9+8Yuf+cxn2ovGr776aobwwgsvxPxRj3qUrdZZZx2GcN111+W7Mkbeo9aWR3TZ4bT11ls/4hGP+Pa3v73KKqsYxTHHHLPvvvty0QFL9AD78pe/HHYkjzvuOMM/9dRT8R/R8bZ220Q/6UlPIsc/9rGPGabx+ldSYdVVV/VN0T7ucY8bBj9la5+zPE4EZtKWjBPHjGWwBFzmOhsONjLSRw+dmk8//XS22bYuQN/3vvex93NV6/zwhz88/vjjm5P+T37yE5aP/19yg4WTTjqp1fIRHLQIFaJ722677TnnnPO6171u1113lRO6/PLL//nPf7YJRwJZ8+tf//oNb3jDIYcccvHFFxNtLql9Ex8qXHfddUVmzz33xFx4xb6qRHBhu+2223HHHZ/znOd85jOfkXfyxje+cfXVV3/IQx5CwZgj5nmttdYiuWTYMNi2IoBqW99kDUHzxz/+8U9/+tOnP/1pPxWSYjYha84///znP//5+sPySexwZf+iF72I7ZfJ8Y9//EPN9dZbT/Tnb3/720tf+lJaao011mhaHt0F8/LsZz/7mmuuacJYUnmkrSBA5BlsuTee+tSnCng5/KT4LL744hCN7pCbnpeP6jvf+c7SSy9N+pDXBmjtb3/7W98O1z/84Q/jMdJmyFkYLgLOnvlMScDFpavPf/7znwyMf1pnK8u33nrrlBuOTQXmzaidofobkTsyXvjCF/ZOjAFYeeWVBUcm3B01gL+PXvEfCHPom3lRmQ/gqquu4upwPvVTTRMnBdLy9ddfr2b1wfKXvvQlIoANZtG/+MUvTrgjhfQEiXPyySezvv51v/CFLyi0Ly3svvvuvCmkg5If/ehHT3jCE1hlooF5Fk5SWB+FSjhgXMT/b9k9QhUKhSoIAptUbzXOti2wwAKSYKrmE5/4RDtdYYUVHvjAB1ogFGqYtfYjH/kI79dpp53mp4gPFwtNQ77omxI6o/wH4PzgBz+wOZlCZOiwZbEDHaNOXvKSl6hMkXA5CEUJtbDElJNC7VQHTAQrpRGFZB/BR5FYfuc732krCz5IslUVr6kSuo1TB39zBLKwlHKGXPvGa+64owiaqsznoVdmpH7Wtzbl7nzyk59sLRz1Zcftq1/96mYU5TPj7HGz0je/+U24HMOO20svvVQ474QTTnjYwx528MEHN/VHdMH/0TLLLEOL6z8Rv9xyyzkqLJPUlsl03h0oRnR06XYrAf5I/8s+rm1ay2d8+T4z3oOR6MBXvvIVF77mb7fddnP1ee9U3ufzn//8SHR+IJ3k2EDgwx/+MNvD4dG0yRvB6LoqbUrcR6OOj4UqdP3q/M6K25zD5mc/+5lyyunLX/6ynz5HHXVU1WTL5TFwPDg58iKwpk2zrQvNv9OPf/xjjd///vdntiUEqOPsSTHIBn3LW96i5JRTTjFZVIVVJAtBINPFMrXhepoC8/2KV7zCViWJWvdSy+RIXYM+5jGPqZCNcqZIs+SOnlv41re+5QqVq8My+60pMRouFjVdrOvMmWeeSWYttdRS7JlCvg01ScArrrjiZS97GQ2t0OGksDRB5TEoZPmsJSn4dRBr1YtU3ZOf/GRHo2r1ER5iMOgD7SCj8IwzzrBc0gRnuk1hpfXY3LK5E/yyQJ+Rbl/72tfcRGMTJlahzwte8AI3ENVy8y299FWvepXOI9NMPWmCDynTVLNAwJVg4nySl0riLLnkkmxb1TFHj3zkI2vZWLiRWre1zCIaex0wbatG9Ke5cIsWZeyUUsqV0KFZHYE8bbAL4Tl+KGDeLP8gPnJ4S2S
P6JCr25/4xCccSyVoDNyxWuWEryOcyHP8tEr5kR7sLO98c36O3BnVI8FFrYvXska87q62uXymOxib8N532Yo9cJPqBRdcoKazgGV7FFywiQs+zl7Jfby+zgv1s2w8q+BynPnp0jLzJkNi/fXXdzdsY6KIFSXyA7g67I7t5MNgdZyVNt54Y6cndl2bruw54VlBbgY2Wzu1IwaSKHFxJlPECUshR4Lw/H3v/fDDVzXyhdZZc801uQfY/ve85z3KiSGtuWwF1pm9YOrA/e53P1kL4iPM8GTeHV4HV70kjm9N6X9ZSn6LzTbbDCVGhQairqxlv81X9YQxZuBrWXRmiSWWaFBUYec3f4lOGrsMHs4VThR1nKNdeTPDnCKAlHPIXKhG96hQqRiQWqY8qKvSTCV31LRrnac8dthhh9qpxAWbi++QhtSeU0aVkzhNnSqpb1PGAVa7UOK4YjAsyH0RpaLzLMOozZI7wkmdcocGkjmhJizUm6gZ+aiF5jzlGDDLKrR+7Gjttdd25NArTXmn3KEFCUEqhyHXDaxUNlnGzv3GC0UzGWm1QFQ5ahzqBGLTpgUHGwXcWjLSyybX/4sD3hyZwRoLTeP/i/PDfdqlXwUH/W8CyCtGFs/4kB1m/IJkevePE5eZnfHepgMzS2Bo5c44hISdSefBx/ndxWvFFJyt/PMzzPbrlERqML0suiQPLgSX8oQRp70rV0l5pAmbYVmkgGtBiIEFdQ0niDBht53fmVJJmu5iEBpwGWSBuGGeCR0dkDPh/CgUwtXEWjsBMf9iAex62ZXOZtkw/SE1dEOypKRR14su/V1nyw/VWxf3YiIsIqtjCBwVb3rTm5gZW/HlyDZgiqgH/naVm/ZduGuK20NWB03jZK0+HcPkEwQkCPXGJ0TfuGzVSc361OYGiJhC/5ksnEes0hPuxNlwww0RUAde/oxmX60LFIZ+Pve5z3VlbEHIRnanCgyqrSQ9WCZ3BFNcJVs7YTv2y+6Sa60tdy6z1uJQWnCRXbpHHYkm5IV5V84SmP3ODZV4uhoZYTYpBoGbkq3KHT/wbr/99maWH6h1WxAe/OAHa1kWy6abbjpZpjY5wt/G5cOVVZvTDYQLCWWKSVX8lVfHzBplSXkQow5OvbLKjrAy6Q5OP21FjDKxjjQ/6TDBBWJXdEkj5sVcN12VeyHRxMEm8qWyjyPTse1QJ2JUJivZdfqJijWbEKkDuG/S2XFC1dF8qhmIQsew/wvVZCjrgBRdx5tyx5J+1vHg5xh8HEIyqAwEjeaw9L/s0zo6wtpHCaXeWj5Ty04CDrBe9u5cYYp7qZk6ITCPCfyf6ZrHOx653TnpOxEzD9wYbJggSA3BJRrd4CzGsrq4oRgYide//vXMJPeGqzRea3ZLVEJNQQpXdcIc9O+EcoeYEFBgyJ36GWN7IReYDVpHpgWNwvRSXSyN8yOJw0WsGxwhLBxhNBlVCsa1I7PH1DEkrvLVVGhQAhOWCQUqSrcNjS+BslHIN0AxSKNx6cl82l2r1uFMohXEg0RP2H6G1sAZLd4d0tDmDHZTX3mbsKB1WF96BRniz0W/Tab1AYqniocDYU6dtm21zzdmFjTOZps73KiHv/71r201p/xp7MioZqE2N4+YQMelQagZnbX1DZf4i2X78iEyDJx0IOmYeVfqZgoWOpg4Vm6izzrrLBMt/Mei0IjUISFLoJCSdACdoY65EwEhQB0VLCWTqQMUWNN5bjnACQhrOYSoSasoUULWkWPZvGBFctWhSyrVnJpxa/nnyLLK5qEgDz30UIXcQuSU/BtK3fHTyB2HjdAbIVITrabeCld5Og4lZEEnFfq/oHd1HgeOnLpUkD/kn6W8pJSo2JaaHB6ONAdnHcPNYeOfRU1+QXVm/MPRwmeGRveeGDLBWmHEzprN0DB0iHZWGM4SB4//JvPYvXsEnEuR7nWyNgRmikDkTq/knZ6YB7Ekp3LODHa0tmTvXac60TtlVyyGy8TlqUscJscJ3cm
dVaN1XMjy/5MvPEPKJ9uxPFB2kbtCZIGdJiPUdA517U7rWHb35nSvd5lAzfrYXDimLiuNyM/61DLL9L8F//krNuTy3bZVaPgWfNucdGAmxWhUcKHvYp25IuAoOSGeqq/NnXbaSW9ZWaE0F4j8GfI0RcqIQi0wbzQWxw9QZSBpOC2owPpyGBBDlBlnWDXY9o0k1PizxLWKUNPn8io5NdOUrDU5ghslwW9k1jghKANagVoiC+gYco2Xpa1xPzGXXMwxRsgaBZcVPhIv6CcC1JCJTsLU7jjw1a9vFfjkLBudFrg6CCCJzCogpgUKmDigOfRBnEsPzYhuaMddVLI4SSJ9Lm3hoDLXWMFOsaEHlGCTo8sR1dpnjXCtmRo7NSMkV60lSlqrWebg8U2460CzymQJbPk0JRaoJfKotaRZJmt8mp9UC8VpipsSC40Yai20zMy3CjUly977aavmJ3HQhw7ubGcgJRwt/KONXpmsTVNgNJOtHdFy10jOXSPa+XQ7BP5LgDHLpxcCLtNZJq4IlZsUS54eQQpqprJYXANVUy5J8WVEKYxK1uGYYZVZfZKI0XIV232ndfMzC10ptC79nXHoJFu5xCQLLFAYPN6sKStegSqhhAmbZR2Zf7dFcMlINGGJeW5czTP8VvEcyCxhv20r8cUoFDpr1+08HEjKGWYtiNxZa2i8QZwQFhh+F99lA2hB4QxAGGlCR2Fzh5FVoksECksgKZVAZPiNguFn+QCsZEymmmbisyFiIFXOtE84oiqkD/hOinCV6LkeMr2VQ1OFJVzQM3w3Z5kFH3FAy0RVk1JdlZtvBLRsLYFCanDk8HZowbZW+bY58g6Jmk1jV1JJSAqrWtNa64LJarJehPMYkgmzc1o3aV1+9KMfTdW1lkxrueQOKaOH09owlUMgBEJgSgJDm7vznxByPr0QoBJol0rdlWHARNmKk58KqUt2DhvZM8whW+hT4Q/36bCCaipn5rmFXLsLZEiY4Dbosl/WmiihcqqO/VIk1A9/Rt2nU+XaZLc8FoUPWU+4TyZrkwJQwTfBwTFQmcVCEi616RgjEn+xLecEmUKZMeSUB78C46qcFRczshfOBpKCkaZ4SEA+Eum3wiiWCQI1mXyqSIYQv5SfzYezSmUCpSmxwB9AeLWWTGtZei/Xy7Q2GYbK+HBaSGPic+KSEbJxVPTeMTeLgd97/baaYoj8UiZoyjTttg3zMwRCIASmJDC0cmc+XXeNnk93Ai7u5ceQCKpx73OrCFII5ZAvMipIGY59IQzXzTJ/uT0II4pHBTGpJkJPB1AV3P4CE5w38lWbVZ17d7+SduppLrXWVnwGekIG8TM1hSIy9lUpBdbSNK1RqqZlHg7VfNucl6XKzb4SyyQO0WOBOPOxUAEUm2itqa/9pma1MCPfxB+BJeVFf4itCcc7Ix3rfafidKWV3YbGxdXlSOhs05FgFjrLUxICIRACM05AoFw2gm7I3+Chn/H+NB2I3GlQdFsgC2TnlGLg7eDGEEJyge663HW2oAC3h3tbREbqqSfd2uptnRQWmc5110xvW8yiWjxP8kZNCq0zZeroLOKSoYZACITATBOI3JnpGRiR/TPk7oqS2uImL8Ggum9lRPqeboZACIRACMx2AkMrd+ISH65DU6TJgyv4LYS6onWGa27SmxAIgRAIgZElELkzXFMnh6O5hXi4epbehEAIhEAIhMDIEvhPdmo+IRACIRACIRACITDGBCJ3xnhyM7QQCIEQCIEQCIH/EIjcyXEQAiEQAiEQAiEw5gQid8Z8gjO8EAiBEAiBEAiByJ0cAyEQAiEQAiEQAmNOIHJnzCc4wwuBEAiBEAiBEIjcyTEQAiEQAiEQAiEw5gQid8Z8gjO8EAiBEAiBEAiByJ0cAyEQAiEQAiEQAmNOIHJnzCc4wwuBEAiBEAiBEIjcyTEQAiEQAiEQAiEw5gQid8Z8gjO8EAiBEAiBEAiByJ0cAyEQAiEQAiEQAmNOIHJnzCc4wwu
BEAiBEAiBEIjcyTEQAiEQAiEQAiEw5gQid8Z8gjO8EAiBEAiBEAiByJ0cAyEQAiEQAiEQAmNOIHJnzCc4wwuBEAiBEAiBEIjcyTEQAiEQAiEQAiEw5gQid8Z8gjO8EAiBEAiBEAiBBYMgBEIgBEJgXAnccMMNCy200KKLLjojA7z66qsXXHDBhRde+L73ve+MdCA7DYGGQLw7DYoshEAIhMBoEPje9773uc997rrrruvS3VtvvfXLX/7yuuuu+5znPOezn/3seeed16XyXFr1jne8Y9lll33Na14zl9pPsyHQO4F4d3pnlZohEAIhMBQEDjnkkOOPP/6JT3ziMsssM1mH6JtvfvObX//612+66aZbbrllHvhX+HLuuuuulVZaqenSpz/96Ysuuuhf//pXU5KFEJgpApE7M0U++w2BEBhJAj/84Q8f9ahHcVrMYO8/9rGPkTvdO/CWt7xl1VVXfdKTntRW7e6771Yy//z/de3//ve//9rXvrbllls+7GEP++AHP7jBBhu87GUv+/CHPywCtfvuu19xxRUHH3zwfPPN98IXvvChD32oDX/yk5/Y9fOf//yNN96YijrooINuvPHGrbfe+oILLrD5WmutdcIJJ6h2xBFH/PnPf15iiSWuvPLKBz7wgW19yM8QmPcEEsya98yzxxCYmMBHP/pRV8YPetCD9thjj4lrDHGpy/pf/OIX//73v/vu4xe+8IWf//znfW8+hxteeumlP/3pT//5z3+2tvP3v//9Z/d+/vKXv1T5d77zHab9la985VlnnXX66ac3fotLLrnklFNO0f977rmnam6++ebrrbfexRdfrHz11Vf/yle+0tpy6zK5QEOIOj3ykY+0N6ve//73L7fcch/4wAce8YhHOCT23HPP0ijiUy996Utf/OIX77XXXq0ttC3feeedb3rTm/7whz8Y0c477/yKV7ziqquuUufcc8/dbrvtqJltt932bW97mxKzZl8OvG222Ua1b33rWwalcJ999ll88cWNbrHFFjOn73vf+3hufvOb32y00Ub777//yiuvLD5luq39n//5n1//+tdPfvKTv/3tb3MmcTtp9qtf/eq73vWu5z73uauttpqdSt9p62F+hsAMEPDPmU8IjBwBJ3Sf6jZLcMcddzgp33777SM3kNYOM5avfvWrnQWe97zntZYP5zLs7GLTt3/84x967rr/tttuk1Piu1apJlv25ptvbmpaqDqthddee+3jH/94MoLDwObdp1Lmyvbbb7/jjjvutNNOv/rVrzR4wAEHHH300RYEbojFKvRTTfrgta99rcqsshIfNVnr3XbbDe29995bCZVDaug/s33kkUfeW+seEuHhD3+4Qh+7q8IDDzzQT9Lkve9972Me8xjOD+V/+9vf6BK64XGPe9yhhx5aNembTTfd1KCkCdvq8MMPr/LO7x/96Efa5CbRggXuFv3fYYcdOGCU0A0KSQ0bcrEQKxb++te/KiRoOltrSggRCqb5WfGs173udUrMyGabbUaI1FqeGP0kbuqnf6UFFliAVPrIRz5C/ZgL9GqV+TVrxkuUP/OZz6xC3aDtiLb66dt4dbt+EmfPeMYzmlVZGHsCp556qoPT58QTTxyqwUZ017zke8QIuGZ1wX3ccccttdRSchQ22WQTJ1zeew72ERtJS3eZT59rrrmGRWkpvg9jfNpppwkTbLjhhq3l7Ld4AQu9/PLLVzkLzUoJPcDylKc8pQpFE2y+yCKLsL5dQjB0CT/E9ddfL4pxv/vdT2WbEzQ/+MEPFLpA52bAmcEjNF3WC15o9q1vfSsbr6Yre99yNcoistbVAW4GNQ3qjW98I9dCtUkT6DwTbvMXvehFCiXe8pc4OQqgGP4uu+zyhCc8QXnnx6RLgF1llVVoXEJhq6220uH3vOc92udOOOaYY+gGxvvRj340z4qYCyHC9cKW00Za05m3v/3t3Bj0CocEb41CqohOYsW5T97whjfw38iJsYry+NOf/rT
00ktrtnry+te/HqUPfehD4lm6irO9PPWpT7U7mTQ8IhwbRMmSSy75kpe85KSTTjJB/CLyZkxKtTDZt4gPztYal9ae/exnr7/++m9+85uV6BV9dvbZZ5sLoSUlQE3WTlOuG2eeeWbz04KJe/rTn25BjMkuqL0zzjjDT4P1j9OkATlUQCN0LrzwQuJPeKsOsApsHXvssQ5I4ypxZvNf/vKXjoEmN0hv4XrVq15lVT4hMEQEhkp8jVZnnB9br26HqvP65mzr1Oak6UQ5j/vmNhChgbm6U85//0VMrL2QPpaZtGaP7CWb0fx0/nWF6uqW4fHdlDv7O3GzMU2JBT8VMs++XQTXKsaSVbO5ppRYeNaznuXqmdUUfWBcZS20NtJ9GRzbCigw/4IIlEHTW6vWWGMNjbe2wG4ZIANvpESJVfrjgltmhvLGFWRoWmNW68ZjJkdNR8LTnva0l7/85QQKYdHabNvyJz/5Sa3ZBWvn2h1DFcRTKDBNcW9omdRg7EvfiHcY/oMf/GD2T01dUkFeS2uzXD52ChpKNEStoksoD8vMJDlSheUcYtFbN59weYstthBMIXZN38knn3zZZZepJsiiWQtIsuVHHXWUZdbaiASk7M5W1ZqdlkfHiBp0XDvVAgUAVF2VfvGLX9SUDfGnI2uO+F202Xp4cywpIbl4cThFDLM5bGBv9lt7n/AbBC2QrTxAFso1hbnAUNUXS1pnnXUsE17lfKrgWndcdq0DzR4dHnpIjVUJ3XbYYYfVsvyetrOEvegJPg95yEMs1Hj9X/Oc2eTHP/6xQsdVbS6nh0Y0y/5r6vjUsgpm4Xe/+52oHIywV+V8jz2BofXuJHfHf2VPH9aUPXDpXLVd+bFJrpt72njeVmLhmFLRAZdrTFFlDs6zLpACLq9lG/SyR0EN5/TLL7+8l8qtdVybOoe6vFZYFqhZyx67ytcB1+UKOQCEM2Qq0H8uUp3WiVTlrvK5BO5///s7WbtUrc3dRaJlhWr6FltRrh2OCsmhMjCcwZUQBKzsd7/7XYYKagkKzaVttdP9m9WxR1fqpSGEEprjir1sS3Rw/V0ai7Zj3hhdjXOrMMmcE06dTGB5g9h7k87e0HacJSII7G6ZW7v4/ve/LwKi/mR9427Zd999bVjuGRf3aop3MPbGyP6tvfbarBdPBrVBV8nhYA7B7HKHs76x4m6H5kqp9BGCCckaI2eV+4Ym689k5d/4xjf861EnVN0f//hH452sZpXbu7nGoX6aKaEWUSqsPv7xj1eh49C1gRQWh66gTLlARKyEdSghUmbFFVfEvCq3fTtglNC79AQFpvHu94e3be4n9UBJMBIO6WYtRKXptcnvZbBWge8/WtJMeWVIlqZ+24L/KdN9/vnnUx6f+MQniHhHHQi09ac+9SlONYec6bOVM5uPgJSpd4pzzCikchzktKyb2JuWleBD1AqEKaQ1HYTSnG1rQ+07MFwkWEWLc7VS8zrs4DFftmrayUIIzAiBBWdkr6O409/+9reC/f6BuazlVPp/dsrw7z2QsThROuE2zmRtOhc7cTB+a665pm8lZIQzl2vlFVZYwRmf29xy595ZPmcoHgLxAmtZFJkKVY294exxpctIs0N0BinAVDsVOs8yh1z6ToJO+s5NdXplOF0O8mZrwYWda3Tn/c985jMG7gSnBakALuh1hkX50pe+5BoUFlfV2rcjg3LtSyWw0+hxzjuZOpPut99+dfpjWrjr6RWn2s6xdCmhAFhcFXganLWZ6qpMghA0TALDzGidc845UOgzP4RcCgrAWgEdZ21jYXrZSw5/16nqC0A4ffNSMCcMjBE5ZeuwrVz0My0speEw/2rCxRC6n4WrgFXo0tXOVa53zRFVwcdDSHEOcSSUCXEMYFuCzIbCK6QbVjrgMMDZ1FNvOixIpEsOBhf6LI3KJJ1qJJqxkCZm2RAMBH/lhM4DHvCAyayOwxuNEi4aBwc382jh3e9+t8YdopjX5pqqY7JzaK0lZt/
hYSym3rjKYVC6sFFdBihw5mipDUsS2ZdBURsT7sXhhzkjqn1TT5USsgJtRIZoixbskRilVOrmI8cYGy8M5zKgUaUmkW/M5HLa+RcTnHJ4S0aBFzHT4VhF2JANgdCheGpbOkkmECvO9uu5w09Mx9HF12UrhOmwUuH6RhAYl6boWhG0Vjity/ZOkwntORT1CigSygUVQez/wgEgpuY/1H+rf0ZHuPa5oPRKKK21ndZl/1MOAC4i/6ekWP0766GmcOO9448p7Iam3MzW/FZNo/A/q31gYalglgWarKj6xtnNXNSYnvhpTjXSnMRIZ/UhEhdr7ViWQ2DGCDhG8+mFgLNPTZLzhetay5///OebDaUsODk6ayhhbJhwIsCpiu33sUnVdDpjQX2c0aqEcdUaA8Zwui5kTSkMq1wN1+6aDMeKIHBa1NWk01ATAamm6psht2FnbEU7wvPO1AwhwUFLuaa08M53vpNuIIMYOV11OiNZnJr55Jlkp3V71LLhMBj0kMtQ7hDddiovK0v8lXumnPAqS7xgb3TPibWc2zZ0GiXF2CRWuRI81fTT6bt6Pq1vVo0d4jx3hjU0365QtcBmgA+jkeJQTnjpDrSFtbYyZDKo9kU+qkkAOVlbpZBtIA4sGIKLYAuCLK6zywZokNUxduU+VI4oQy1P95t1ZCpqq7agQ2tqp+tmO+UgqZpgsi7mjtIiRwhTNolaqmPG3Okb5gbFzEhwIeZsXlEk2wrryCaZsKtW6Q9VxL+lTZPOhqnJZtfm7kzWVB1XjhafakfGiQPYsl0znJSHHFVyBBkSSpsOKhExmUB0CRPoqKBX9J+KcpgBSzTbnK31L8C+qkPHOyaNtHbR9l1+GplMLjwIWb43FWgayVs1WP9NFlh6WsqCY8PHQkXQyufXFDb/R5ygRk30UO21R4eNnw5XbrMKm1Y5kvQ6udAEhpQ7PzhmyO6q4xsr908ZI7VXadTNqs4FWBAgIKyq/+vWb4U19bWhalWzs52UhMCME3Ai8u/mU0HhGe9P04H7NEtZ6E7A2dP8sYXO5i59LFMzNmEmXR2645QfxQWfi34uaCdTV04uBF3ZuDCqEyhNoJABYJ/UdHq1ecXgXYdVZl+ZQNrCtqLd3NE2LxNlRwyz/ZJWtnLl7QzY2efJ5A6VQ5DVHl2DEiuWeQJ8C9C4PKXVnPq1WYUEmTM4Scf9o47RMQbO5gIoLuDqng51GC1rfTRedsulOZ0kzFfl9W0sWuCKoCFolNZV/S2jQdDYlkCBF5aSO36yxNJIeaTMQpvcUZ9brtQDI81w2oo33uYld1A1Zewi1KUhKqeEEiUyCFntU1TV5wnlDrvrAKAzqs5k36ygw4bzqeIp2leT1HBzFt1J2vIwVeyJHtUl2tSHS4OeEK3jXsKTGKp0Ud4LmzPA0nu1QF6YbsJUIdFD4XEw6DlnUmm4CXtFJLnop29UIyZKzTvSRCJAc6XOzWByyQWOKE4pql07/BYwOuos051cMo5hZPRBCVVnmT7W28c+9rGUOrelcp1Rbo4c6lwsSnyge8ELXqC3zpKWq7Dzm4iB19jJKSqtqWCC8CSgeXEEcZTLYi7lbZZdcugnDWH2ecV0w/+jqSTo1W8ayUIIhMAcEhhauZNgFkvX08fllEtPasZ1qss1Z1VXn7bk4pbdSSIwM8y8cyjbwH/AEc3tUU5pplRNjgFGzgnaZRyHuRaaHbt0ds8nYaF9hYwEjwKBxR0i0qS+QrpHB1wWO4M3G3YuuES2o+qbtXbkylL7dAy1oYQlc8FdbXZuPlmJIZAO7L0uGRpbpSafTYVgLOteSTfGXiypchVBsyMhGCkaLBxDxVi6KOc5cF1e+6IqXOLX9fdke+8sr9iKbZkuY1SB1cTK5TjpRvGYI5pMN8hTRo7xZu2IMKa0YmoSd0SpXIKXD4CvyLKOsaPkLFxVYlzcLaSAieOy4sBgpEGgQsxgBb/EXDgzSFjdYGgdIWbBqs5uNyUqC6NQBlqjGu3
CKvFKXarbgCUIGxpo/HxoUz8qECKOLqwoYD4Y39rhhABZZygPQlZWB2eVgVQis45R0hQtGaHBpgOdC1wyPm3ljhaz2VqoHY03JTrfLFNdPs1PCzSlT2tJLetJZ2f8K4kldVZuK3HJ0VZSP4UIfWqZh9ICTSNHykRTihKG6C3/Vo4Nssx/onO6ucbZ8TlhgykMgRAYJwKRO9OYTRqCcfLcCyfTRk84k4qpUwMCNNrqYrZV41NxXd4pNZgQl8WuzstEseIu03nRGXWeBtesTs1dWm4dAz+TE7oL39qEneO9IHec6F3gqkmOiP2TLJbZV/qMyWRNORvIDtZawge7zhKIVZVCUpPMIrYYP33j1tIZAoLZk0PAYFB1lJBQHctK6gl/uLBmUaTLCGHwKDAw5Aj3gAYt0B/VZwaJ9RK/EIGqkh6/kbQt1wWNYix8DxQJ/xMFZl++eWVoDr3FwcDt0ailXlEYxuVyn+/KtuQXsaK3hILwEPeVsZTPgLIhQEUxxCyUc1+RpBwtSJJxrDXV6DAQPNJCE+2yuSmDpftAHANUl45xLQBblTl1SEaN+6nbjRl2yLW1pqZPW6FgpU9boZ9m1rHUWT72Jdx+VK+IEh1s0gsjB5KLCge5Y0N81mEz9hwywBAIgf8QcFbNpxcCfCRkQROIkU/KWNpQxoDTKGcAu8UTIJ1CoWt0bKkBy5IQpSJaEMPiB7IgymCtC3fL5RFxVV2OE/pDoWtcwQjGW2sEFkGgfaaR9aWWOFToA9Um+zClomOuZflRmHyCTE1ahNqQFaFlisS5nm+AFtGaEIkFTn5qRif5BjgMGGMfaRZCV8yD0bm7h2GmSygAUkObhsCuUx7EhD7TVRXkEroiEQzcN0ujJkVirRIWqDXQQysAK0l2srHMy/JywvHE1k6JG26eZsZ77AmlSHPwGHWvLwhIJIkT8cp0r5m1IRACITBCBIY2mDUfiP9RPflMRcDVPBFAQ8ic4AIR8hd0kGbLlyCdpdmaQ17OJgEhNYcngJeF24Dx4/bgV5DzqCZJIYmHP4CZF9EgMggpfgKX9UI8SlyJEgGCWRwJknj4FWzFQ8PjoiYB4TkZnbGApg8WNEUtSYDgwxfbqlUKeVY4LTTC1paTyX4dAwpJFl3y07e1BJatLFAkfDm65CeRpEv0UDWojvq2rZ+j/g0IdUhWir+4R4lTyh28XG7TGhcgiNVdPF02dHgI8IFpgkjYLjWzKgRCIARGiIDr/LpvQEDfGXV4eh6509NcsOu8IyURXPFLihTvIALc0szYywhpFIA6JBE9xE/DF0LxuF9JqgdvkJiCmBFzKBZGcJAOIjsyQ8UsXOuztbSIRBOpJ3WDaE89S6WBEjCV5s5cmEcyUYItRTLQPaSxEAiBEBhnApE74zy7czI2t+RIInHbLReLHB1yWEilXClz0my2HScCEowEQIUm5Z2M07gylhAIgfEjMLRyJ1euM3ywuaVL0IpHh6+I48edRNE6vU8JD5kblNx23gTset92ujXdFi7OyI3ndn2p5dPdvMf6HIEeG+MOI5ngtYlQqYQnwS9J5ZE7PWJMtRAIgRBoIxC50wZkXv8U5HIH+7ze6yD2R5xJ9xHLE/fpuz0t1I1IYkbTDRt5aIp8JtnQnvzbekeSUKObsGREeRZc0zHOMzua7i6azd3J5e4tcS4l0r3rVRLN2gEuyPsha5r7vLQswdzHU4DdpjfAHaWpEAiBEJhVBP6bczqrxpzBDoSA58HIOurlQSmT7U5Ot/vC3Nvlw6VB+kxWc8Jyt9y7EV0CuGcBt1agaeoJja2FHg/Tqn5aV0257MZ76VbunqOZ3EvFATPlJj1W0E93q9E3TX1PzXEfu8T2piQLIRACIRACc04g3p05ZzjvWmBu3Q7WdyKzbblkpFf312O3WAu6cZzU5h7v61b8vm2/vGwPzXOTmjRtgTxv+JIMTv2UD4MAshd3lsnpblwd9uW2bcOv+55E/dz1ZrlV7rhhXtSJw6l
uZ6uuulXe0/kEoYgJjXhMgDa9VsLTfkkZmTGeziIuRg+55a0NDmgeGez+uFe84hVW6W25o9R3c5zUZg0KpbmTzlMDPO9HTQ9K5rHTpuf+2S/tQtjJW0ceQw9NxtBWHtvjGUjutnPDv6bqrRcUpC65Qc/DAtp6kp8hEAIhEAJ9E4h3p290M7Chl1SX0e1v3541JyzS37YiRB4eKGHW7dkeIySbxHMRKQ9W39snvDGgXtFcjXtOjxTsJvtkwj3qDEtPDbhTkWn34GAvJaASiAO30NuEgOBK8VaH2lzyiocEegwjDSHiU4XEAe1S73moEo+PIzvcRt6IJOWkBv2kt5455D0JxIR26CoyyFoSygLR07zdsJqqb7rEKs8gqJ+eIuixipaVGCBPDMVmILrnWUdy9OpF4vYl8mU41JiheXiBcXnJhnu+KnaJj6dTuOldfjoN5Pa9at8DDnSSKq2f+Q6BEAiBEBgIgXh3po2R84BnwrNSZIfUxp65x2JxUbjKd7e55+J40E5za7qb2BWK1Mj/aH1EDQ+EEIlqlIT3M6jmWX8e7OvOrMre0CDrKDRjR/VoFm+l8ER/D/ezSjl/AMPvRUu77rorr4aH/Xh9gc+EQxJ78lxBPfeoQ1txddRjEj1zlt0lAjwjkVui6XZbIzrPqOu/h+/ZnF/k5S9/OQ3hznlignTQmhyaet2Bxwvpnqa83ku0S2Fba34aGungIYq1ijrxoEVW371ptSEp4+0Q2lHBQ43106OTeUHII3wa+Hw83CFeu1HtSFsWHiIyyJQq8S2zR3Ixz029UKzK+ZZE0Op5ifwu9mtQzSZtC028iRCxid4arwfnuKXOh2TxDEayj2Ayj7a1O2+Vkphl2XD8rCdPwuVxi9V4OaUQIIAMtm2P+RkCIRACITBAAvHuTA8mu+uxOp6m4xH+HiFoY7afv0EIQxKJC33v6PGyCD/ZzvpY9pYrr5jwbof/LfvPX5W5RlhlMqXe8cQq01ICH7JiVWAI5abYkZc32RGlwmHAM2HBQ33oG4V2p6YNWW6JIBwVCif8MLQ8B5QHnwTDXK+W5GLhqiGA5N7yozRPw+xswRNoZMsKNukh2085qUPDuTtaH4ghFTyAWCEfBtHmeULkBUT8QJ2tdZZoiiYAipITs0MGBAGpUi2EkWW6oe39TZ3tVEk9h3qytVVO4ngWFvKkqhCVt2R0qe+FGLUWOs+V1kkTyg3TbEIG0Vj8Uo3MEoaTXdRUqAXCtwQcVVSvFVNO5jqu2mpO9lMorRSVCqSkuBi9O1nllIdACIRACBSByJ1pHAm8Fy7ZGXgBHcasohI//elPGXXvkeCh4SPhafCiULZf1EloQ+vyOagB9ptDQkTDK6AVMm8MrULZG1rgDqk3+0jsYPXr7eskhRAJj4tbkdXk4CEgOAx4MrxEoiI+CldZZRUuDa+A8BJvC13Gw0j7qEA3+FhgLzVVbgaRoyYvp0sjratoFBEuJbwdyHByWOYmccO28I07pxQaZusmzTJ1iE+9ErUptGD4zHn1k7SqYJb4FG8KgUJytVbusiwC1bqWL80sVNxKCKlWmTVagQuq8mZa6zfL3GAEnJHqrewiN4KZAl4cW5GJps+Tlz05qdRS+ZxMsTcxqVyOHHBqv8YlymZEyJQSkntkHpt7vuyUGLUVxcxT2LywU/hMaM/x4AZ1arsJ3nkqt72j3fQ2CyEQAiEQAhMSSDBrQiwTF4oHuTqvmAWXjCRW9bxxWpClrtRV8NptGa8u9GVveMWVCuyl8BbVIhLETFYhO8ee0RysuG8KhkEVl2HmXb6LK9mw3vzMxvN2TNyhe+swrlJqJA6rQ77UVpPVb8o5pSg2faDGuCsEUygq/WwqdC7oW+WUcCzps0iWEltVTf2XwUMB0HmehUP0cGNIbRbx8RzqztaEpdxGLjTGY8ShpRtKVINCeEjmsiwchp8eIum0LJ+Xc8v7v9QRvINds77pD/nCcpJIPW8plzojYkVSEAFe8aF
Oxbm8C0LHJAl5sLUwljxl7ZgayUOWW4OMnV0lB01BPRadMNIZdTiiOLF4ffi6+MYqKYqfD0kvBRPgQ4A+rr6ZULqKJMXN6FTgjdMZIgZSSc2Oh9qvI4fiIc7EBxuHE0kka7sKObrqPbLq6xgF3AQEO3uekhAIgRAIgSIQuTONI6HyS7gcWFYmp6wOo84MC2FMln7BKLJznCisbNvOmDp+C8aM/6aSW4WTWLUu+qZakBvkWp/lIyx4EeT0kE0sMbvb5Ul0fE4MsBd80iicE5Jt6ST2kn01tD322IOFbuth60816Qz6gMKQp8ynZcg0igCZngiWcdioz5DrjwYJNeGzyl9pbaeWcfN6VHqF5DJeuqGcTCgp5BwCx47IFzXFsAAUPwKHsCA66RgeMg42PREuJJgIOC0Ti2SoLnGf8KlUofIXv/jFVKmRykrWYPWBRJOQ5OXz9bPLt5Rkj9vRT3KkqcazxX1lj4bQFPLtNcv6Zln8kVarQt47n1r2ChGaqalcCyVb2wrlaLeV1M97M7UmTtWasH4KQyAEQmD2EnAGz6dHAiIXrJ2gj2wPMRrCgr3kb+B+YHF5KaTRvOc972H/5OVIyKAnWP16qbgsGcaJhXNB72jjBHJBz67LwnHpL0JBGbjcZ8h1pjJFWHR7rPANl5JyDg974UFhMjUleMR3Yu/MIZHBbcDSdx+LF1LaO8PPm8LHwGcgXqZL3DCcHJW3O1kLxkWFCLSpqWOqGTshQnPgoEFRudpW0rFyEZnJmmrKCS/6gy8HQHnK8pebVXN1gUtGSMgouLjkHs3VfaXxEAiBEJg9BJocUNfPQzXqvCJ0ekqXd4QocbEu0EDu0Ci2ZzXdmGNBOVcNtwf7LdDDi+BbxgZHhQURE9ZdvMMj79SsAAq3gQ25LkgoUTAixk9pQG5UFpYikgRfKsOmvEeaEubQckW+VNa+EkeVCmIlSrp86uDTGZ8u1eb9KnFA+cKkYZc0mgH2SjqOHYnH8avVPVMDbDxNhUAIhMCsJSDNMW9EH5PZl20q8CRBeEzGMxzD4GsRoZPrzSs2D3pENfKNUZxJfJkHtLOLEAiB2UNgaOVOcnemfRB23l087SayQQcBKT6TZfl01B1AAfdY97vYBrCPNBECIRACITA0BKaIfQxNP9OREAiBEAiBEAiBEOiTQOROn+CyWQiEQAiEQAiEwKgQiNwZlZlKP0MgBEIgBEIgBPokELnTJ7hsFgIhEAIhEAIhMCoEIndGZabSzxAIgRAIgRAIgT4JRO70CS6bhUAIhEAIhEAIjAqByJ1Rman0MwRCIARCIARCoE8CkTt9gstmIRACIRACIRACo0IgcmdUZir9DIEQCIEQCIEQ6JNA5E6f4LJZCIRACIRACITAqBCI3BmVmUo/QyAEQiAEQiAE+iQQudMnuGwWAiEQAiEQAiEwKgQid0ZlptLPEAiBEAiBEAiBPglE7vQJLpuFQAiEQAiEQAiMCoHInVGZqfQzBEIgBEIgBEKgTwKRO32Cy2YhEAIhEAIhEAKjQiByZ1RmKv0MgRAIgRAIgRDok0DkTp/gslkIhEAIhEAIhMCoEFhwVDqafoZACIRACIwugbvv/ej//PPPb3HCgVj1r3/966Mf/eiEa/sunG+++bQ84eZ77rnn0ksvPeGqFI4ZgcidMZvQDCcEQmDMCVx11VXnn3/+AgssUOO86667br311mbMiyyyyG233db8nBsLiy666KGHHnrCCSfYV4/t6+Saa665wQYb3HHHHeedd95DH/rQe+65p3NbukTnzznnnM5VfZfceeedD3jAAw444AB96GxkscUW6yxMyVgSiNwZy2nNoEJgbAkwk0xm38OjDCZzLbS2ueCCC/7tb3/bf//9LbSWd1/WN1JAnblnRBdaaKHf//73J598ctOT9ddff9ddd7399tuVGNrXv/717bffvpcxNi30sfDoRz96ww037H1DU7bRRhttttlmNMepp566xRZb9L5taobAQAh
M4z95IPtLIyEQAmNJ4Oyzz7744ounJQ5aORAKvfgkuDSuvvrqt73tba3b9r7M1m611VYrrrjihBf6re2Um+HKK6+00FrefdkQnv70p+vkC17wgsb70n2T6a4FiuIhpyzY1jcXS2s45tWvfvWSSy453WbnWX1YonXmGe3sqJVA5E4rjSyHwDQISDJwzTotc1itlyllGvvYtpf+LbzwwgceeOAf/vAHdrHqS1xYffXVWRp9VuLnUkstVfaylwanrEPl/PjHP/7zn//cn9zhilhmmWU++MEP6mH3XpEpyy233Je+9KUpuzRhBZuztcsuu+yEa8ejcJi1zngQzihGlEDkzohOXLo9BQE+gCOPPHJOrrDZ4C6ml11nni+99NIp+jHJ6sc+9rGbbLLJnARlJmn4P8VU1C233LLGGmuUnDKQJZZY4r3vfS9DePzxxyvkD2D1e/GmdNlL2yptzgltvWrEWVvL+RkCIRACc04gcmfOGY5tCzIiGTD+fM4Al8Us6KCGyrbJH9xtt92e//znH3PMMTfccEMZ5h7bZ7/ZclrBt+XOrXT4+uuvP/PMMy10ru2lRIbHLrvs8rSnPa2LJvja177WX/tgStvkbumlJ4OtI8jSNDj38kuaXWQhBEIgBIaEQOTOvJgIpr1209hm1p09PuOMMzbddFNxh2kZ+8l6zPTOyeV1W7O8FyeeeOLiiy9+2mmnWUVY7LTTToP1RnAw/Pvf/958883bdj3lT93Yeuutb7zxRt8N27atIB0gjbbG8zMEQiAEQmC0CETuTGO+dthhB66I6V4Tu5R3S4JvAmLllVe2ULtks7lPjj76aDkW0+jEJFU5IQRH3vjGN3bxRkyy6aTF2223XSVCUmlGvdpqq01adeZW9JcsMnP9zZ5DIARCIARmgEDkzjSgv+hFL3L75XTtK1mzzTbbcELYcK211prG/lI1BEIgBEIgBEJgEAQid6ZB8Rn3fqaxQaqGQAiEQAiEQAgMAYE+EzmHoOfpQgiEQAiEQAiEQAj0RCBypydMqRQCIRACIRACITC6BCJ3Rnfu0vMQCIEQCIEQCIGeCETu9IQplUIgBEIgBEIgBEaXQOTO6M5deh4CIRACIRACIdATgcidnjClUgiEQAiEQAiEwOgSiNwZ3blLz0MgBEIgBEIgBHoiELnTE6ZUCoEQCIEQCIEQGF0CkTujO3fpeQiEQAiEQAiEQE8EInd6wpRKIRACIRACIRACo0sgcmd05y49D4EQCIEQCIEQ6IlA5E5PmFIpBEIgBEIgBEJgdAlE7ozu3KXnIRACIRACIRACPRGI3OkJUyqFQAiEQAiEQAiMLoHIndGdu/Q8BEIgBEIgBEKgJwIL9lQrlUIgBEIgBEJgzgjcc889c9bA1FvPN998p5122o9+9KOFF15YbT/t1PdkW77hDW9YaqmlJlub8nEiELkzTrOZsYRACIw5gbvuuuucc865++67BzVOUuDmm2+ef/75F1100dtuu81P322NW3vjjTfuvvvuL3vZy4488sg77rijrcKUP3V79dVXf8hDHmJhyspzUmGBBRb461//evHFF995552bbrrpDjvssPPOO++77752PWG3F1lkkTnZXbYdIQKROyM0WelqCMx2AuUeqIt1y/WzRyis+FyytUzmUUcddeqppy600ELlVJhLRtTAb7311kMOOWRCy90jh85qr3nNa6666qpjjjlmiy22IH223XbbCUHtuOOOgBMQnS1MWaLDG2+88dOe9rQpaw68wp///OeBt5kGR5FA5M4ozlr6HALDQuD000+//vrrXf3PYYfYUZfjnBaT+S1Y+uuuu+6EE05Q4QUveIEAxBe/+MXvfve7PQoLjW+yySYcDBMa8jnsvOFfccUV1157LaO+5ZZb+vmiF72Im2EOm51wcxz22muvEnwTVphuIfIrrLCCnl9zzTX3u9/94F166aWn20jqh8DwE4jcGf45Sg9HmIBr5cUXX/yGG25gZVncGokL9IG
YK5b+lFNOYfg32mgj5ord2nvvvRndORcf1U8NLrfccrr6r3/9a+WVV+6cBjv69re/ffXVV8+haWdiV1xxxSc+8YkPfOADN9hgA2Pp3JcSPXnWs55l4ZZ7P9tssw0/xIQ1OwvBf+pTn7raaqt1rkoJApxS973vfYMiBMaYQOTOGE/u7Bral7/85Yc97GEspW/L/Q1+MkPbX2tEwBFHHLH99tsfcMABl19++YMf/GB2XeFWW201kB0x/+y+bz4P0kQnNTtAoyX685KXvESH8Xzve99rR50cPvCBDwxEXWlkwQUX9D2HyqmzhykJgRAIAQT+k7UeECFQBG6//fbyRlx66aUT2rZeQN10002MuotFh1bjz2jdkD3bZ599LrnkEuattbz3ZVmHHBtth+5JJ53EN2CPvi333lrVLB/+Rz7yEekXbS1Pt6nW+jigKglUIQeJvbDoHBitdbIcAiEQAmNDwJ1xm2++ueGceOKJYrvDM64+7c3wDGD4e8J2zrn5ZCMPPvjgdddd94ILLrjsssuaK2Dl5TCYcw7a1PL5559vL5/+9Kf7bnDXXXddfvnlzz77bG6G9dZbb0LFQ5GstdZa/e0CTLkLdtGWhEFYKKHSfFvur/FsFQIhEAIhMJYEInemMa2vfvWrjz322MUWW6z3bWgR2Q8bbrhhm23uvYWqSdb87ne/k0goi4L7pFwvDP9OO+102GGHffCDH1xppZXmcBd2ZC88LgQKyTLdHjb111lnHQ4S/dTUkksu2ZQPfKHTOVQljRYc+B7TYAiEQAiEwIgSiNyZxsRtvfXWD3rQg9jy3rep2Mrzn//83jeZbs0999xzupvMg/qU2TzYS3YRAiEQAiEQAr0QiNzphdJ/6zz73s80NkjVEAiBEAiBEAiBISAwp0/LGIIhpAshEAIhEAIhEAIh0I1A5E43OlkXAiEQAiEQAiEwBgQid8ZgEjOEEAiBEAiBEAiBbgQid7rRyboQCIEQCIEQCIExIBC5MwaTmCGEQAiEQAiEQAh0IxC5041O1oVACIRACIRACIwBgcidMZjEDCEEQiAEQiAEQqAbgcidbnSyLgRCIARCIARCYAwIRO6MwSRmCCEQAiEQAiEQAt0IRO50o5N1IRACIRACIRACY0AgcmcMJjFDCIEQCIEQCIEQ6EYgcqcbnawLgRAIgRAIgRAYAwKRO2MwiRlCCIRACIRACIRANwKRO93oZF0IhEAIhEAIhMAYEFhwDMaQIYRACIRACMxmArfccsv8889/9913X3fddfPNN1+PKO65555FFllk2WWX7bF+qo00gcidkZ6+dD4EQiAEhpfA4Ycffu211xIic9hFusRnskYWXnjhs88+2/ett9561FFHTVZtwvItttji5JNPnnBVCseMQOTOmE1ohhMCITADBK644gpOAhbXvi+77LJVV131oosuYn179zQ0nb755pt5Ke573/vefu9n0UUXve2225q1c76w0EILvfOd77zgggsIiFe96lXf/e53r7nmmlIk9rvccsstsMACd91113rrred7TnZn7GeddZbh9AGhdb8w6PDDH/5wC63lrcuPetSjDMeO3vSmN3URRq2bWDbeZZZZpq0wP8eVQOTOuM5sxhUCs44AO9fFIsLB0n/84x//y1/+suCCfZ76tGAvbZvTByeccMLGG29cYZHjjz/+mc985qGHHkpG9DEHr33ta1dbbbX9999/yy23fMITnvC1r31t6623tt8+mppsk8c85jEbbbSRtXBxbzTV7rjjjoc97GH2ZeHFL34xNdCs6m9hzv06/e03W4VAJ4H5ehfCnRunJARCYPwIXHrppeeddx4TPtnQnDT4Lebw1CFngiFndNddd92f/OQnBx10kJLJ9thLOdtMbWy66aZdjDTre+GFF9500019+xu23357Po9tttmm8bjw6JxzzjmXX345iSODRFcNxNrFF1+8D2OPKq2jzUsuuYSjZYkllvjnP/9poe8O94IudUJggAR
OO+20zTffXIMnnngiyT7AluewqcidOQSYzceWAIt155139j08dquMX1sLZITAx7ve9a4+bGFbU/XTjgQ+fDONE1aYVqEr+1//+tenn376ZPbVjhZbbLH99tvPt+VpNd5WGQrSRCNQdFFXbVtN9pMKWWmllbbddtvJKsylcvs10QOBP5d6mGZDYF4SGFq506dHd16yy75CYDICp5566t///veylAwnD/xkNadb7gJdMML/bX9BBCZcC9ttt53N2zQBGaGfjW9guh3rrK+pTTbZRJt217l2uiV6y7Wg823dbm3HEJZaaqnJ9FBrzdmwfMABB9CvBx988GwYbMYYAqNLIHJndOfuPjfeeCNryjJdeeWVrN2cmx+KgT/DpzMcYC+f+9znTjnlFAt9I2NBV1999XJF9N1Is6H8CXJEEIHckYLwyEc+cu+99+6eutFs28vCHnvs8Za3vKWXmp11SjQ85SlPmXOnRWfjKRkqAlTm8ssvP1RdSmdCIAQ6CUTudDKZoxKyw3Vef3dkdNlx56W2S3DiQ5j/IQ95iOtLd3t22bzHVSussMLaa6/9wAc+cJVVVmm7KYOWMii3P8yJqAJnt912k6thoccuda9GezV6QsfmpG/dd5S1ITAZgQc/+MGTrUp5CITA8BCI3JnGXNxwww3ucuzuR6ESPAGie51p7PLeWyWpEJlfZM25557Lh7HWWmvZCwH0+Mc/nhvG8g9+8IM5TwTRoORKt2UuvfTSFqbVyelWnhMX0XT3lfohEAIhEAIhELkzjWNgySWXFNCZcoP4GKZElAohEAIhEAIhMC8JRO5Mj3akzPR4pXYIhEAIhEAIDAGBOX229xAMIV0IgRAIgRAIgRAIgW4EIne60cm6EAiBEAiBEAiBMSAQuTMGk5ghhEAIhEAIhEAIdCMQudONTtaFQAiEQAiEQAiMAYHInTGYxAwhBEIgBEIgBEKgG4HInW50si4EQiAEQiAEQmAMCETujMEkZgghEAIhEAIhEALdCETudKOTdSEQAiEQAiEQAmNAIHJnDCYxQwiBEAiBEAiBEOhGIHKnG52sC4EQCIEQCIEQGAMCkTtjMIkZQgiEQAiEQAiEQDcCkTvd6GRdCIRACIRACITAGBCI3BmDScwQQiAEQiAEQiAEuhGI3OlGJ+tCIARCIARCIATGgEDkzhhMYoYQAiEQAiEwbQL33HPPTTfdNO3NssFoElhwNLudXodACIRACIwVgR/+8IfnnnvuQgstNCejuvvuu+eff3465q677pqynTvuuOOrX/3qUUcdtfbaa09ZORVGnUDkzqjPYPofAiEwiwj84x//YM4Z9RtvvHG++eZrHfnNN9+84IILLrLIIrfeeuvtt9/etra15pwsL7DAAnvvvfcll1xiX9NqR5/XX3/96667bplllllsscUoktbNNfub3/zm73//u4XW8mkt28XOO+/8uc99bt11133Pe94DwpSbH3jggYsvvviU1VJhDAhM73gdgwFnCCEQAiFQPoA777xzQhQM+emnn37cccctvPDCE1bopbBEyZwY78696M+XvvQlWuGWW2658MIL2yq86lWv+utf//qjH/3oGc94xuMe97hejH1bCz3+3GCDDQiXHis31dDec889zznnHJuvssoqnd4X2EFr6ve9sN9++/W9bTYcYwKRO2M8uRlaCMxFArfddtuvfvUrUYCLLrpoLu2GA4DHYrLG2X4dcHXOnzFZnQnL2VRX/+edd95mm22mkTY3g01UuOKKKzgw+ra+NnzlK1/5xS9+8aCDDlp66aWpqwl7Mt1CXd12221rqzbnilVrrbUWXJdffvlqq63GgzLdxudN/VVXXbV2NFghOG86n72MNIHInZGevnR+JgkwMGL/A+mBC1/qgel1yTuHplEjZ5xxxuGHH14iQESDM6BEAxu81FJLDaTDmrr++uu/8IUvbLHFFqeccspA2uxshFB417ve1SUooxvvfOc7OzfsXmKrpzzlKbwgO+2
006KLLtq98pysfeMb3zgnm/exreEsu+yyfWyYTUJg7AlE7oz9FM/qAV555ZWnnXZa23Vw70SIj8kEjTYvvvhixrjvxptu0DrrrbfeJptscvbZZ6+xxhrLLbfcnCge4uCGG264+uqrGXWCjNZ54QtfeNhhh/m58sorv+lNb5qTxps+W7CjD3zgA1TadJ0rrY10X9ZnNLrX6XttH+GYvveVDUMgBGacQOTOjE/BXO+Aq3A2qcslcu89cNMmC9p7/R5rspdHH330N7/5zTKcvNy88XPeYe1ceumlJ598cn+KROqDBIjdd98dvc6B0Cirr7662zo6V023RFPrrLPOox71KLelcPXPjTDEy172sun2KvVDIARCYJwIRO7M69l02X3kkUcOKm5dV+pdJIjQxic+8YkLLrhgzsdJNOy4446dt1TMecuUjdTLNddc04IRCbjsu+++7kftMq4ed4rznPgeeBf6k0o9dq+tmhTOtpL8DIEQCIEQGAiByJ1pYOTb2H777d0Q0YtYYap5y+973/u22mzmk6/lzDPPtDCNHU9SVcuu2vkenvOc50zogajtDj300F46PMlO/q9YnzfaaKN5af7/b99ZCoEQCIEQCIE5IBC5Mw14tMtee+317W9/m8tkys1EKF7/+teLd1horcyBMRDx0dpmlkMgBEIgBEIgBLoQiNzpAmeCVY+/9zPBismL4g6ZnE3WhEAIhEAIhMC8IDCAkMq86Gb2EQIhEAIhEAIhEAL9Eojc6ZdctguBEAiBEAiBEBgRApE7IzJR6WYIhEAIhEAIhEC/BCJ3+iWX7UIgBEIgBEIgBEaEQOTOiExUuhkCIRACIRACIdAvgcidfslluxAIgRAIgRAIgREhELkzIhOVboZACIRACIRACPRLIHKnX3LZLgRCIARCIARCYEQIRO6MyESlmyEQAiEQAiEQAv0SiNzpl1y2C4EQCIEQCIEQGBECkTsjMlHpZgiEQAiEQAiEQL8EInf6JZftQiAEQiAEQiAERoRA5M6ITFS6GQIhEAIhEAIh0C+ByJ1+yWW7EAiBEAiBEAiBESEQuTMiE5VuhkAIhEAIhEAI9EsgcqdfctkuBEIgBEIgBEJgRAhE7ozIRKWbIRACIRACIRAC/RKI3OmXXLYLgRAIgRAIgRAYEQKROyMyUelmCIRACIRACIRAvwQid/oll+1CIARCIARCIARGhEDkzohMVLoZAiEQAiEQAiHQL4HInX7JZbsQCIEQCIEQCIERIRC5MyITlW6GQAiEQAiEQAj0SyByp19y2S4EQiAEQiAEQmBECETujMhEpZshEAIhEAIhEAL9Eojc6ZdctguBEAiBEAiBEBgRApE7IzJR6WYIhEAIhEAIhEC/BCJ3+iWX7UIgBEIgBEIgBEaEQOTOiExUuhkCIRACIRACIdAvgcidfslluxAIgRAIgRAIgREhELkzIhOVboZACIRACIRACPRLIHKnX3LZLgRCIARCIARCYEQIRO6MyESlmyEQAiEQAiEQAv0SiNzpl1y2C4EQCIEQCIEQGBECkTsjMlHpZgiEQAiEQAiEQL8EInf6JZftQiAEQiAEQiAERoRA5M6ITFS6GQIhEAIhEAIh0C+ByJ1+yWW7EAiBEAiBEAiBESEQuTMiE5VuhkAIhEAIhEAI9EsgcqdfcpNs96c//emPf/zjJCtTHAIhEAIhEAIhMAMEFpyBfY7vLi+88MLNNttsoYUWOu2009Zee+3xHWhGFgIhEAIhEAKjRCDenUHO1l133TXffPPtvPPOD3rQgwbZbtoKgRAIgRAIgRCYAwLx7vQJ79///vfBBx/83Oc+99hjj33lK1+5wgorXHvttfvuu+8999yz6KKLEj3V7ic/+cnjjjtO4e23377ffvttuummE+5v7733/tWvfqXaxz72sZtuummfffaZf/75l1lmGXvR1K233nrnnXdyF336059ebLHFJmwhhSEQAiEQAiEQApMRiHdnMjLdyn/xi1+svvr
qxxxzzIYbbrjXXnu9733vU/uss85afPHFF1xwQWtr49///ve77bbbKaec4ucCCyyw5ZZb/vSnP52w3a222urkk09+6lOfusEGGzz2sY999KMffcUVV+y+++6iYxpZZ5111ltvvW9+85sf+tCHJtw8hSEQAiEQAiEQAl0IxLvTBc6kqz7zmc8sv/zyBMpll1321re+dZdddlF163s/L3zhC2+55Zba8uEPf/jznve8884777vf/e7dd99tlQ2f+MQndrb7pCc9ydptttnmzW9+89JLL73ttttKAFKTovrIRz5yyCGH2ORnP/vZbbfd1rltSkIgBEIgBEIgBLoTiHenO58J1p5//vlf+MIXVl55Zbpk/fXXX2mllVZbbbUJ6t1bpNpSSy0lvMXx8/SnP/0rX/kKb01nZaGrZZdd9pnPfCZhdPTRRz/nOc8R2FJtjTXWWHjhhTvrpyQEQiAEQiAEQqB3ApE7vbP6b80lllji/ve/v6zkXrYUnJJ5c91116l81FFHvfSlL+XyqQ3POeecc889t2lk3XXX/dvf/kbl/OEPf7jqqqsEsKwig/75z3+efvrpomDXX3+9n039LIRACIRACIRACPRIIOazR1D/V23VVVf9wQ9+cOONNwo/+UgxrnVSkv2UwaNECvMRRxyhfMUVV/zd734npLXHHnvQPa973euahuQ1S9a5+OKLq8QqjqL9999/1113pXWkPyvnFiJ0JDiLdr343k9VzncIhEAIhEAIhEDvBCJ3emf1fzWlEh9++OF8Nm6Y+stf/kK4WLf55pu7bWqjjTaSc+OOqjvuuEMhx4wsHOnMbrb64Q9/+LjHPa5pZeONN37DG96w5pprVokcZ48ofPazn/3hD3+4SXY+44wzSCJ5P277cmOX2FmzeRZCIARCIARCIAR6JJBU5R5BtVcja0466SSl/C7CTBaWXHLJ8ui0ViV6RK/U4adpC0W95S1vaa1pWTJQlTT5OuJfPttttx0Z1FY5P0MgBEIgBEIgBHokEO9Oj6AmruaJgoJNso+vueaazhpk0G9+85tf/vKXolSeptNZYcoSYSzun1//+tc9pgpN2WAqhEAIhEAIhMAsJBDvTv+TToK8/vWvF7dy7xXXTmdDnrXjSTmLLLKIBXdmdVaYskS6DsXj/qwpa6ZCCIRACIRACITAZAQidyYjM3U5EfOIRzzi61//+mRV3cP1+Mc/frK1vZTbRbROL6BSJwRCIARCIAS6EEgwqwucrAqBEAiBEAiBEBgHApE74zCLGUMIhEAIhEAIhEAXApE7XeBkVQiEQAiEQAiEwDgQiNwZh1nMGEIgBEIgBEIgBLoQiNzpAierQiAEQiAEQiAExoFA5M44zGLGEAIhEAIhEAIh0IVA5E4XOFkVAiEwKwjcdttts2KcGWQIzGICkTuzePIz9BAIgXsJfOUrX7nnnnsCIwRCYIwJ5DGDYzy5GVoIhEBPBHbcccee6qVSCITAyBKId2dkpy4dD4EQCIEQCIEQ6I1A5E5vnFIrBEIgBEIgBEJgZAlE7ozs1KXjIRACIRACIRACvRGI3OmNU2qFQAiEQAiEQAiMLIHInZGdunQ8BEIgBEIgBEKgNwKRO71xSq0QCIEQCIEQCIGRJRC5M7JTl46HQAiEQAiEQAj0RiBypzdOqRUCIRACIRACITCyBCJ3Rnbq0vEQCIEQCIEQCIHeCETu9MYptUIgBEIgBEIgBEaWQOTOyE5dOh4CIRACIRACIdAbgcid3jilVgiEQAiEQAiEwMgSiNwZ2alLx0MgBEIgBEIgBHojELnTG6fUCoEQCIEQCIEQGFkCkTsjO3XpeAiEQAiEQAiEQG8EInd645RaIRACIRACIRACI0sgcmdkpy4dD4EQCIEQCIEQ6I1A5E5vnFIrBEIgBEIgBEJgZAlE7ozs1KXjIRACIRACIRACvRGI3OmNU2qFQAiEQAiEQAiMLIHInZGdunQ8BEIgBEIgBEKgNwKRO71xSq0QCIE
QCIEQCIGRJRC5M7JTl46HQAiEQAiEQAj0RiBypzdOqRUCIRACIRACITCyBCJ3Rnbq0vEQCIEQCIEQCIHeCETu9MYptUIgBEIgBEIgBEaWQOTOyE5dOh4CIRACIRACIdAbgcid3jilVgiEQAiEQAiEwMgSiNwZ2alLx0MgBEIgBEIgBHojELnTG6fUCoEQCIEQCIEQGFkCkTsjO3XpeAiEQAiEQAiEQG8EInd645RaIRACIRACIRACI0sgcmdkpy4dD4EQCIEQCIEQ6I1A5E5vnFIrBEIgBEIgBEJgZAlE7ozs1KXjIRACIRACIRACvRGI3OmNU2qFQAiEQAiEQAiMLIHInZGdunQ8BEIgBEIgBEKgNwKRO71xSq0QCIEQCIEQCIGRJRC5M7JTl46HQAiEQAiEQAj0RiBypzdOqRUCIRACIRACITCyBCJ3Rnbq0vEQCIEQCIEQCIHeCETu9MYptUIgBEIgBEIgBEaWQOTOyE5dOh4CIRACIRACIdAbgcid3jilVgiEQAiEQAiEwMgSiNwZ2alLx0MgBEIgBEIgBHojELnTG6fUCoEQCIEQCIEQGFkCkTsjO3XpeAiEQAiEQAiEQG8EInd645RaIRACIRACIRACI0sgcmdkpy4dD4EQCIEQCIEQ6I1A5E5vnFIrBEIgBEIgBEJgZAlE7ozs1KXjIRACIRACIRACvRGI3OmNU2qFQAiEQAiEQAiMLIHInZGdunQ8BEIgBEIgBEKgNwKRO71xSq0QCIEQCIEQCIGRJRC5M7JTl46HQAiEQAiEQAj0RiBypzdOqRUCIRACIRACITCyBCJ3Rnbq0vEQCIEQCIEQCIHeCETu9MYptUIgBEIgBEIgBEaWQOTOyE5dOh4CIRACIRACIdAbgcid3jilVgiEQAiEQAiEwMgSiNwZ2alLx0MgBEIgBEIgBHojELnTG6fUCoEQCIEQCIEQGFkCC45sz9PxEAiBEAiBeUrg9ttvv+CCCx784AcvssgirTu++uqrL7/88tVXX33ZZZdV/u9///vMM8/8+Mc/vsoqq/h52WWXbbfddk94whNUWGyxxZT8/e9/v+eee9ZYYw3Lt95660UXXVQ/l156aSVtn+uuu+7SSy+94447brvttvnmm2/xxRdfaKGFHvKQh1hurXnnnXfq21133XXzzTers+CCC66zzjoLLLBAU+emm27685//rIerrbZaU2hB49dee+2iiy6qvq6uuuqqCrWjtbvvvlvHmvEa44orrlht2osNf/7zn3/oQx/Sk/e+973rrrvummuuufDCC7c2fuWVV1511VXzz/9fz4LWEHjQgx7UWifL84gA+vmEQAiEQAiEwJQE/vKXv7BMO+200zve8Y699trrAx/4AFmz9957b7rppso/85nPaIEo2Xrrrf3cd999q8Hzzz//AQ94gBLioEpe8pKXrLzyyn/961/9fOc732mVzwte8AIiQ8mPfvSjj3zkI1WTiHnOc55j7SabbPL+97//3e9+933ve1/y4utf/3pVaL4POeQQ1e5///vvv//+NI06u+66q103FU499VQV1ltvvbe97W0nnnii+vvss8/vfvc7MkU57UXcvPKVr6z6//rXv+xIuU8zkBe+8IVf+MIXqsKee+5Za1u/DzzwwGZ3FqiojTfeuLWC5RVWWAE97bfWHKfl4mykIA/VuO4zVL1JZ0IgBEIgBIaWwAknnNBqvDktllhiiaak5M6rXvWqKuHwaAbypje9SeFmm21WJVSFnw9/+MMpj6rsW2u8QbZaZpllHvnIR1bN173udVXh85//vBI+npVWWknJM57xDLqq6vg++OCDy4Ny3HHH+fntb3+7tmpkk8LHPvaxVei7XEQWllxyySq04aGHHmqZ68UwKZsq982TdPbZZ3/iE5+wvOOOO2qqlv182tOeRrh88YtfLHfX+uuvf80116hQn2c+85nVCIcQuca/1XikuLv+t9a4/Y3cGbcZzXhCIARCYLYRePKTn8x
+c4cceeSR/ByHH344iWDB53Of+xzfzx//+EfhnrLxrXKHk4bzhiL5zne+A1rJHfEmOoDy4Eepj0hQhYoe9ahHFdtnP/vZWltrrbUa1I03qPHE2HvJCE6m66+/Xk0SZPPNN7ehPX7pS1+qbTWihOj52te+dthhh6lgw+o/LSUed8455yy//PLVefEyCqaJTDUy5TWveY1Q3SMe8YiqVsPRPhRVApHwnBJWv7QgDUR+iQOKf+G23377qXm/+92v8XVV98bme2jlTnJ36hDNdwiEQAiEQE8EvvKVrzz60Y+esOqxxx4rW6VzFf0hRCVa9KIXvUgGDPVACpxyyikPe9jD6IDW+twtBMFXv/pVhRdeeOG5557LEfK3v/3NtuW/oaJ++9vfHn/88WeccUZtyPVCK1gmLJZaaikL/EPf+ta31l57bfulKl72spcprDpa8/HTN6VlofkIkBE9HEs23G233W655RbpPgpl5AjYyeyRhUMAkVlCYLayzGFTm++www7nnXeePJ6f/OQnp5122pZbbvmHP/zhxhtvtNZ+n/SkJ9FPPttvv70SiD75yU/KDXrMYx5Tm+d7HhD4b/7UPNhTdhECIRACITAGBHgvJhvFE5/4RPpAAo0KfCd/+tOfLLD3NIooj2URKJKCL+d5z3uevBaCgzRp/XDPHHHEERKHVZYjTFf985//lNPjZ/NpnC5KSKJjjjmmWdUsaLlcMqU5lOsGf1JpHT/btI4S2oWy4b/55je/Kfb09Kc/XdrQUUcdJWdIDjXFo46u0kAWOj/VZ7KMr8jaJsxHBj3lKU/ZYostyLvaivwi3doG1dlgSgZLIN6dwfJMayEQAiEw5gR4aHbffXfxKeMkIMp78fjHP95PQkfWi7Rf2coykbfaaispvbwvgjgNlF122YWnx1ZNSbPA4SEPxoaV0/PmN7/5G9/4hrXiUOXaaWo2C1TUhP6kpoIGJQBttNFGWtBnXXrFK17BcyNXmkumqWZBlEoghuJpLWyW3U1GIdFtMpCawi4LnECf/vSnf/GLX6jD0+Pzq1/96l3vehdVJLZFNjURsS6NZNUACUTuDBBmmgqBEAiBcSYgQYcmoFSkoUhtOeigg77//e8roXKkwkiIqczfxz3ucUcffXRlGX/0ox8VxBFm4uQoiVPfTTZM8ZKD/MY3vpEOuOKKK97ylrcccMABqvGsdNIU3vrpT3+qnJPphhtumEwGtW5Ye5Ry5CZwsTOBJLvTSbe4f/azn9U9ST9um+/UMSrzHvHrSEsiwj71qU9plv+mtfEuy4JiJXeqjt1VyrafnD2RO13QzY1VkTtzg2raDIEQCIExJCC61DoqckTgiRtDvEkASMjmta99bVWQsUuXNJWJGA/dKYeQnOWmvFl4wxve4M5wCoDQKafLZKpigw02EDIjpzz2hnDZZpttmka6L7TdVqbyD3/4Q42Qbrr63Oc+lwRpa4Gea+4d4yXiFmqr0P2nm967V8jaeUkguTvzknb2FQIhEAJjRUAMi46Z1pCEk1rr8/pwDp1++unCPT/+8Y+bABO3zQc/+MHWms3yoGTEU5/61Eoz0rJ7ypr2a8HzexqtI4bVaJ0Pf/jDbTUn/EkeCV25OZ9Tx+YynVuryeOR1t1akuW5TSByZ24TTvshEAIhMA4EKJLnP//5Un15RJrxyBqWvlM/Kz5Fo8jwde9Sq3um7s1WTf228JMIkfwbakOgRwpz07IFD9eRc9NaUsvUg1ucVN522205nOr2eKvabvJqNlSTtpAc7ZlAfDlNeeuCJ/S0/uTX8TDAKpGK9NKXvrRZu9xyyzXLEy5UzrJnDHq4IneXNGc3z7tda8MNN7Sq7rS3YcXUJmwhhXODQOTO3KCaNkMgBEJg3Ah4A4MQkhuv3DzVjE1ib711QYnMGOEhT1v2NBq3c7sru6km6bhuRJIHU09Yblb97Gc/q+WzzjpLBnFTbsHDkSd838LJJ5/sbiwP+JESREDUY5HV9/Cb1lyZaur
Vr361atKlJQYJvTW3r7fuqHNZ/aaQqKpn9lSJR+Y89KEPbda2LUhpmtApJTOaXuTpaeJ97pDvco9bW7P5OecEInfmnGFaCIEQCIFZRKBuKTdgmcIekOOGoxq8m8w5WppYlVyc3/zmN9J6PLD4pJNOUkdqjhdQTEaKuGk8H00dz/0jaKQJf+xjH6vXZtE6bv7ipGkeCfj617++kVCVHsTN41E9lt0N7iaypjWSq3keYFNYCxO+FKJWEUytlXl3vve975Ev9iLg5WE8niQEAi3FcaUndkq3SRWi/7yES4Xa3E9iqB69o2TC8bbuKMuDJfBfJ+RgG01rIRACIRACY0bAk445VNz1LbKzxx57MO1uxfIk4homEaACvSJn2auj/vGPfxAWrU8j9ETBH/zgB80zlzvhkA6dz8IR8aGQPLXPHrmObMUJxL0kfYerqRqRvExOeZ8DiUNzcMx4VZZbrqyVcMO107ovjwJyE1YT9nJbWa1tq9a6Seey8JbQm6CeR+xorangcYJcOH66F0x2jic4ex3pAx/4QJXdfs8j5Q6vpv6E422aysLgCZiVfEIgBEIgBEJgSgICTzRN5aa0WSM5Pa2bC9NIxyF93Iy98847W+bnaK3QLBMxlJDWvvzlLzeFbQv2yzNE1ni/Jl3iI32nrc7vf/97DykWXOM14USRSCSth/ioapxMwnDNG7LaOs8jxfHT2iDBpOdVTQ5166rW5bo1varpm362rpXqVKt0RjisVczpKg9Qa+WxWR7al0jMB3HbxOdnCIRACIRACExGgC4RoGkNPMlKdkeVpJbJNpmynDPGw2/omO413bpF93hG32TVOJ9Evi666CIRN89HblNmnEByqD1op3VzMTJ6S1p0a6FlEk3QyjC5iCZLT5bPJCVIgxrR/7b3n+tDRbi0JtKnWqVpi7Jx+bj1vW2P4/FTUna9sMwb0bnlhmdQkTvDMxfpSQiEQAiEQAiMNoGhlTvJ3RntAyu9D4EQCIEQCIHhIcBF5yWyAkf8cMPTKz2Jd2eopiOdCYEQCIEQCIEQGDyB3Ig+eKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiE
QAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMnELkzeKZpMQRCIARCIARCYKgIRO4M1XSkMyEQAiEQAiEQAoMn8P8AZJA4DPH5A3EAAAAASUVORK5CYII=", + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from PIL import Image, ImageDraw, ImageFont\n", + "import os\n", + "\n", + "base_path = \"./data\"\n", + "\n", + "image = Image.open(os.path.join(base_path, \"training_data/images/0000971160.png\"))\n", + "image = image.convert(\"RGB\")\n", + "image" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uAVffmnZyUvw" + }, + "source": [ + "Now let's plot its corresponding annotations. Basically, if you type `data['form']`, you get a list of all general annotations. Each general annotation has a label, a bounding box, and one or more words, which in also have their own bounding box. 
The bounding boxes are in [xleft, ytop, xright, ybottom] format.\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JPKkuJQ4sdZc", + "outputId": "c95bf306-98bb-4480-cc6b-ebb3aea548b3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'box': [292, 91, 376, 175], 'text': 'R&D', 'label': 'other', 'words': [{'box': [292, 91, 376, 175], 'text': 'R&D'}], 'linking': [], 'id': 0}\n", + "{'box': [219, 316, 225, 327], 'text': ':', 'label': 'question', 'words': [{'box': [219, 316, 225, 327], 'text': ':'}], 'linking': [], 'id': 1}\n", + "{'box': [95, 355, 169, 370], 'text': 'Suggestion:', 'label': 'question', 'words': [{'box': [95, 355, 169, 370], 'text': 'Suggestion:'}], 'linking': [[2, 16]], 'id': 2}\n", + "{'box': [482, 268, 518, 282], 'text': 'Date:', 'label': 'question', 'words': [{'box': [482, 268, 518, 282], 'text': 'Date:'}], 'linking': [[3, 12]], 'id': 3}\n", + "{'box': [511, 309, 570, 323], 'text': 'Licensee', 'label': 'answer', 'words': [{'box': [511, 309, 570, 323], 'text': 'Licensee'}], 'linking': [[13, 4]], 'id': 4}\n", + "{'box': [211, 651, 217, 662], 'text': '', 'label': 'question', 'words': [{'box': [211, 651, 217, 662], 'text': ''}], 'linking': [], 'id': 5}\n", + "{'box': [461, 605, 483, 619], 'text': 'Yes', 'label': 'question', 'words': [{'box': [461, 605, 483, 619], 'text': 'Yes'}], 'linking': [[19, 6]], 'id': 6}\n", + "{'box': [545, 603, 563, 617], 'text': 'No', 'label': 'question', 'words': [{'box': [545, 603, 563, 617], 'text': 'No'}], 'linking': [[19, 7]], 'id': 7}\n", + "{'box': [525, 904, 641, 926], 'text': '597005708', 'label': 'other', 'words': [{'box': [525, 904, 641, 926], 'text': '597005708'}], 'linking': [], 'id': 8}\n", + "{'text': 'R&D QUALITY IMPROVEMENT SUGGESTION/ SOLUTION FORM', 'box': [256, 201, 423, 230], 'linking': [], 'label': 'header', 'words': [{'text': 'R&D', 'box': [257, 203, 279, 214]}, 
{'text': 'QUALITY', 'box': [285, 203, 334, 216]}, {'text': 'IMPROVEMENT', 'box': [341, 201, 418, 211]}, {'text': 'SUGGESTION/', 'box': [256, 215, 324, 229]}, {'text': '', 'box': [324, 216, 332, 230]}, {'text': 'SOLUTION', 'box': [331, 214, 387, 228]}, {'text': 'FORM', 'box': [395, 215, 423, 228]}], 'id': 9}\n", + "{'text': 'Name / Phone Ext. :', 'box': [89, 272, 204, 289], 'linking': [[10, 11]], 'label': 'question', 'words': [{'text': 'Name', 'box': [89, 274, 118, 289]}, {'text': '/', 'box': [117, 274, 127, 288]}, {'text': 'Phone', 'box': [128, 274, 163, 289]}, {'text': 'Ext.', 'box': [169, 272, 196, 287]}, {'text': ':', 'box': [196, 274, 204, 288]}], 'id': 10}\n", + "{'text': 'M. Hamann P. Harper, P. Martinez', 'box': [215, 271, 451, 287], 'linking': [[10, 11]], 'label': 'answer', 'words': [{'text': 'M.', 'box': [215, 272, 230, 287]}, {'text': 'Hamann', 'box': [237, 272, 287, 286]}, {'text': 'P.', 'box': [293, 272, 307, 286]}, {'text': 'Harper,', 'box': [314, 274, 363, 285]}, {'text': 'P.', 'box': [370, 272, 384, 285]}, {'text': 'Martinez', 'box': [390, 271, 451, 282]}], 'id': 11}\n", + "{'text': '9/ 3/ 92', 'box': [543, 264, 590, 279], 'linking': [[3, 12]], 'label': 'answer', 'words': [{'text': '9/', 'box': [543, 265, 560, 279]}, {'text': '3/', 'box': [560, 264, 575, 279]}, {'text': '92', 'box': [575, 264, 590, 279]}], 'id': 12}\n", + "{'text': 'R&D Group:', 'box': [420, 310, 491, 323], 'linking': [[13, 4]], 'label': 'question', 'words': [{'text': 'R&D', 'box': [420, 310, 442, 323]}, {'text': 'Group:', 'box': [448, 310, 491, 323]}], 'id': 13}\n", + "{'text': 'J. S. 
Wigand', 'box': [236, 313, 327, 327], 'linking': [[15, 14]], 'label': 'answer', 'words': [{'text': 'J.', 'box': [236, 313, 251, 327]}, {'text': 'S.', 'box': [256, 313, 273, 326]}, {'text': 'Wigand', 'box': [278, 313, 327, 327]}], 'id': 14}\n", + "{'text': 'Supervisor / Manager', 'box': [91, 316, 218, 331], 'linking': [[15, 14]], 'label': 'question', 'words': [{'text': 'Supervisor', 'box': [91, 316, 161, 330]}, {'text': '/', 'box': [163, 318, 169, 331]}, {'text': 'Manager', 'box': [169, 317, 218, 327]}], 'id': 15}\n", + "{'text': 'Discontinue coal retention analyses on licensee submitted product samples (Note : Coal Retention testing is not performed by most licensees. Other B&W physical measurements as ends stability and inspection for soft spots in ciparettes are thought to be sufficient measures to assure cigarette physical integrity. The proposed action will increase laboratory productivity . )', 'box': [190, 346, 594, 447], 'linking': [[2, 16]], 'label': 'answer', 'words': [{'text': 'Discontinue', 'box': [190, 355, 268, 366]}, {'text': 'coal', 'box': [274, 353, 303, 366]}, {'text': 'retention', 'box': [309, 352, 375, 365]}, {'text': 'analyses', 'box': [381, 351, 435, 365]}, {'text': 'on', 'box': [443, 352, 458, 363]}, {'text': 'licensee', 'box': [464, 348, 520, 362]}, {'text': 'submitted', 'box': [527, 346, 594, 361]}, {'text': 'product', 'box': [190, 369, 240, 383]}, {'text': 'samples', 'box': [247, 367, 301, 380]}, {'text': '(Note', 'box': [318, 365, 352, 379]}, {'text': ':', 'box': [352, 367, 359, 380]}, {'text': 'Coal', 'box': [373, 366, 402, 376]}, {'text': 'Retention', 'box': [408, 366, 472, 376]}, {'text': 'testing', 'box': [479, 365, 529, 376]}, {'text': 'is', 'box': [536, 363, 549, 374]}, {'text': 'not', 'box': [554, 363, 578, 374]}, {'text': 'performed', 'box': [190, 383, 256, 394]}, {'text': 'by', 'box': [261, 381, 275, 394]}, {'text': 'most', 'box': [282, 383, 311, 393]}, {'text': 'licensees.', 'box': [318, 380, 386, 391]}, {'text': 'Other', 'box': 
[401, 378, 437, 389]}, {'text': 'B&W', 'box': [443, 378, 465, 389]}, {'text': 'physical', 'box': [471, 377, 528, 391]}, {'text': 'measurements', 'box': [191, 398, 275, 406]}, {'text': 'as', 'box': [282, 397, 297, 405]}, {'text': 'ends', 'box': [304, 394, 332, 405]}, {'text': 'stability', 'box': [339, 394, 402, 405]}, {'text': 'and', 'box': [409, 392, 430, 402]}, {'text': 'inspection', 'box': [437, 392, 508, 403]}, {'text': 'for', 'box': [515, 391, 535, 402]}, {'text': 'soft', 'box': [542, 391, 571, 401]}, {'text': 'spots', 'box': [193, 411, 228, 422]}, {'text': 'in', 'box': [235, 409, 250, 420]}, {'text': 'ciparettes', 'box': [256, 409, 327, 419]}, {'text': 'are', 'box': [332, 408, 352, 418]}, {'text': 'thought', 'box': [360, 406, 410, 419]}, {'text': 'to', 'box': [415, 406, 430, 416]}, {'text': 'be', 'box': [436, 404, 453, 417]}, {'text': 'sufficient', 'box': [458, 405, 529, 415]}, {'text': 'measures', 'box': [535, 405, 592, 415]}, {'text': 'to', 'box': [193, 425, 208, 433]}, {'text': 'assure', 'box': [214, 423, 255, 431]}, {'text': 'cigarette', 'box': [261, 420, 325, 434]}, {'text': 'physical', 'box': [331, 419, 390, 432]}, {'text': 'integrity.', 'box': [395, 418, 463, 431]}, {'text': 'The', 'box': [478, 416, 500, 429]}, {'text': 'proposed', 'box': [506, 418, 566, 431]}, {'text': 'action', 'box': [193, 436, 236, 447]}, {'text': 'will', 'box': [240, 436, 269, 447]}, {'text': 'increase', 'box': [277, 434, 333, 445]}, {'text': 'laboratory', 'box': [339, 433, 410, 446]}, {'text': 'productivity', 'box': [418, 430, 502, 445]}, {'text': '.', 'box': [503, 433, 507, 444]}, {'text': ')', 'box': [508, 430, 514, 444]}], 'id': 16}\n", + "{'text': 'Suggested Solutions (s) :', 'box': [95, 486, 250, 504], 'linking': [[17, 18]], 'label': 'question', 'words': [{'text': 'Suggested', 'box': [95, 489, 159, 504]}, {'text': 'Solutions', 'box': [165, 487, 222, 501]}, {'text': '(s)', 'box': [223, 486, 241, 503]}, {'text': ':', 'box': [243, 489, 250, 503]}], 'id': 17}\n", + "{'text': 
'Delete coal retention from the list of standard analyses performed on licensee submitted product samples. Special requests for coal retention testing could still be submitted on an exception basis.', 'box': [263, 483, 593, 553], 'linking': [[17, 18]], 'label': 'answer', 'words': [{'text': 'Delete', 'box': [263, 486, 306, 500]}, {'text': 'coal', 'box': [313, 486, 341, 499]}, {'text': 'retention', 'box': [348, 486, 412, 497]}, {'text': 'from', 'box': [416, 485, 447, 498]}, {'text': 'the', 'box': [453, 485, 475, 498]}, {'text': 'list', 'box': [480, 483, 508, 496]}, {'text': 'of', 'box': [515, 483, 532, 494]}, {'text': 'standard', 'box': [536, 483, 593, 494]}, {'text': 'analyses', 'box': [264, 501, 320, 514]}, {'text': 'performed', 'box': [324, 501, 392, 512]}, {'text': 'on', 'box': [397, 501, 412, 511]}, {'text': 'licensee', 'box': [419, 499, 475, 512]}, {'text': 'submitted', 'box': [482, 499, 546, 510]}, {'text': 'product', 'box': [264, 517, 314, 528]}, {'text': 'samples.', 'box': [320, 514, 374, 528]}, {'text': 'Special', 'box': [390, 513, 439, 526]}, {'text': 'requests', 'box': [446, 513, 502, 524]}, {'text': 'for', 'box': [508, 511, 530, 522]}, {'text': 'coal', 'box': [538, 510, 566, 523]}, {'text': 'retention', 'box': [263, 529, 330, 540]}, {'text': 'testing', 'box': [335, 527, 387, 540]}, {'text': 'could', 'box': [390, 527, 428, 538]}, {'text': 'still', 'box': [433, 525, 468, 536]}, {'text': 'be', 'box': [473, 525, 488, 535]}, {'text': 'submitted', 'box': [496, 524, 560, 537]}, {'text': 'on', 'box': [566, 524, 584, 537]}, {'text': 'an', 'box': [264, 543, 281, 553]}, {'text': 'exception', 'box': [286, 539, 350, 553]}, {'text': 'basis.', 'box': [355, 541, 397, 551]}], 'id': 18}\n", + "{'text': 'Have you contacted your Manager/ Supervisor?', 'box': [96, 608, 398, 624], 'linking': [[19, 6], [19, 7]], 'label': 'header', 'words': [{'text': 'Have', 'box': [96, 612, 127, 623]}, {'text': 'you', 'box': [131, 613, 156, 624]}, {'text': 'contacted', 'box': [161, 612, 225, 
623]}, {'text': 'your', 'box': [229, 610, 260, 623]}, {'text': 'Manager/', 'box': [264, 609, 314, 622]}, {'text': '', 'box': [314, 608, 322, 622]}, {'text': 'Supervisor?', 'box': [323, 608, 398, 621]}], 'id': 19}\n", + "{'text': 'Manager Comments:', 'box': [98, 651, 211, 665], 'linking': [[20, 21], [20, 22]], 'label': 'question', 'words': [{'text': 'Manager', 'box': [98, 654, 150, 665]}, {'text': 'Comments:', 'box': [154, 651, 211, 664]}], 'id': 20}\n", + "{'text': 'Manager, please contact suggester and forward', 'box': [232, 644, 547, 662], 'linking': [[20, 21]], 'label': 'answer', 'words': [{'text': 'Manager,', 'box': [232, 648, 288, 662]}, {'text': 'please', 'box': [296, 649, 338, 662]}, {'text': 'contact', 'box': [344, 648, 394, 662]}, {'text': 'suggester', 'box': [401, 648, 464, 661]}, {'text': 'and', 'box': [469, 647, 491, 658]}, {'text': 'forward', 'box': [497, 644, 547, 657]}], 'id': 21}\n", + "{'text': 'comments to the Quality Council.', 'box': [99, 662, 323, 677], 'linking': [[20, 22]], 'label': 'answer', 'words': [{'text': 'comments', 'box': [99, 666, 155, 677]}, {'text': 'to', 'box': [162, 665, 177, 676]}, {'text': 'the', 'box': [183, 665, 205, 675]}, {'text': 'Quality', 'box': [211, 663, 261, 676]}, {'text': 'Council.', 'box': [267, 662, 323, 676]}], 'id': 22}\n", + "{'text': 'qip . 
wp', 'box': [102, 823, 145, 838], 'linking': [], 'label': 'other', 'words': [{'text': 'qip', 'box': [102, 824, 123, 837]}, {'text': '.', 'box': [124, 824, 130, 838]}, {'text': 'wp', 'box': [130, 823, 145, 837]}], 'id': 23}\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "with open(os.path.join(base_path, \"training_data/annotations/0000971160.json\")) as f:\n", + " data = json.load(f)\n", + "\n", + "for annotation in data[\"form\"]:\n", + " print(annotation)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Hs4L3S5a2Gfb" + }, + "source": [ + "The PIL library has a handy ImageDraw module, which -you guessed it- allows to draw things (such as rectangles) on an image:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "gWaHFM_LtKPP", + "outputId": "c498e560-035f-4170-b0b9-85ba3956711c" + }, + "outputs": [ + { + "data": { + "image/jpeg": "", + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "draw = ImageDraw.Draw(image, \"RGBA\")\n", + "\n", + "font = ImageFont.load_default()\n", + "\n", + "label2color = {\"question\": \"blue\", \"answer\": \"green\", \"header\": \"orange\", \"other\": \"violet\"}\n", + "\n", + "for annotation in data[\"form\"]:\n", + " label = annotation[\"label\"]\n", + " general_box = annotation[\"box\"]\n", + " draw.rectangle(general_box, outline=label2color[label], width=2)\n", + " draw.text((general_box[0] + 10, general_box[1] - 10), label, fill=label2color[label], font=font)\n", + " words = annotation[\"words\"]\n", + " for word in words:\n", + " box = word[\"box\"]\n", + " draw.rectangle(box, outline=label2color[label], width=1)\n", + "\n", + "image" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uyWQNLSCRJN7" + }, + "source": [ + "## Preprocessing the data\n", + "\n", + "Next, we need to 
turn the document images into individual tokens and corresponding labels (BIOES format, see further). We do this both for the training and test datasets. Make sure to run this from the `/content` directory:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4DWRyOR9RuY6", + "outputId": "4215a24b-8049-4b1a-a23f-5aaa48e14083" + }, + "outputs": [], + "source": [ + "! python unilm/layoutlm/deprecated/examples/seq_labeling/preprocess.py --data_dir data/training_data/annotations \\\n", + " --data_split train \\\n", + " --output_dir data \\\n", + " --model_name_or_path microsoft/layoutlm-base-uncased \\\n", + " --max_len 510\n", + "\n", + "! python unilm/layoutlm/deprecated/examples/seq_labeling/preprocess.py --data_dir data/testing_data/annotations \\\n", + " --data_split test \\\n", + " --output_dir data \\\n", + " --model_name_or_path microsoft/layoutlm-base-uncased \\\n", + " --max_len 510" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gc4Cu0ZyO5M_" + }, + "source": [ + "Next, we create a labels.txt file that contains the unique labels of the FUNSD dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8iGOU0s3UR2u" + }, + "outputs": [], + "source": [ + "! 
cat data/train.txt | cut -d$'\\t' -f 2 | grep -v \"^$\"| sort | uniq > data/labels.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mC9FhkG9U8yg" + }, + "source": [ + "## Define a PyTorch dataset\n", + "\n", + "First, we create a list containing the unique labels based on `data/labels.txt` (run this from the content directory):" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "id": "675rRa0QXnMp" + }, + "outputs": [], + "source": [ + "from torch.nn import CrossEntropyLoss\n", + "\n", + "\n", + "def get_labels(path):\n", + " with open(path, \"r\") as f:\n", + " labels = f.read().splitlines()\n", + " if \"O\" not in labels:\n", + " labels = [\"O\"] + labels\n", + " return labels\n", + "\n", + "\n", + "labels = get_labels(\"data/labels.txt\")\n", + "num_labels = len(labels)\n", + "label_map = {i: label for i, label in enumerate(labels)}\n", + "# Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later\n", + "pad_token_label_id = CrossEntropyLoss().ignore_index" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kZ2LGEsez2u2" + }, + "source": [ + "We can see that the dataset uses the so-called BIOES annotation scheme to annotate the tokens. This means that a given token can be at the beginning (B), inside (I) or at the end (E) of a multi-token entity, can form a single-token entity on its own (S), or can lie outside any entity (O). 
Entities include ANSWER, QUESTION and HEADER; tokens that belong to none of them (annotated as `other` in FUNSD) are tagged O: " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_-qXLkP9Yq_L", + "outputId": "32ab46a4-4cf0-400c-816b-570f950035ec" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['B-ANSWER', 'B-HEADER', 'B-QUESTION', 'E-ANSWER', 'E-HEADER', 'E-QUESTION', 'I-ANSWER', 'I-HEADER', 'I-QUESTION', 'O', 'S-ANSWER', 'S-HEADER', 'S-QUESTION']\n" + ] + } + ], + "source": [ + "print(labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9_ck0ZFfZInR" + }, + "source": [ + "Next, we can create a PyTorch dataset and corresponding dataloader (both for training and evaluation):" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import os\n", + "\n", + "import torch\n", + "from torch.utils.data import Dataset\n", + "\n", + "logger = logging.getLogger(__name__)\n", + "\n", + "\n", + "class FunsdDataset(Dataset):\n", + " def __init__(self, args, tokenizer, labels, pad_token_label_id, mode):\n", + " if args.local_rank not in [-1, 0] and mode == \"train\":\n", + " torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache\n", + "\n", + " # Load data features from cache or dataset file\n", + " cached_features_file = os.path.join(\n", + " args.data_dir,\n", + " \"cached_{}_{}_{}\".format(\n", + " mode,\n", + " list(filter(None, args.model_name_or_path.split(\"/\"))).pop(),\n", + " str(args.max_seq_length),\n", + " ),\n", + " )\n", + " if os.path.exists(cached_features_file) and not args.overwrite_cache:\n", + " logger.info(\"Loading features from cached file %s\", cached_features_file)\n", + " features = torch.load(cached_features_file)\n", + " else:\n", + " logger.info(\"Creating features from dataset file at %s\", 
args.data_dir)\n", + " examples = read_examples_from_file(args.data_dir, mode)\n", + " features = convert_examples_to_features(\n", + " examples,\n", + " labels,\n", + " args.max_seq_length,\n", + " tokenizer,\n", + " cls_token_at_end=bool(args.model_type in [\"xlnet\"]),\n", + " # xlnet has a cls token at the end\n", + " cls_token=tokenizer.cls_token,\n", + " cls_token_segment_id=2 if args.model_type in [\"xlnet\"] else 0,\n", + " sep_token=tokenizer.sep_token,\n", + " sep_token_extra=bool(args.model_type in [\"roberta\"]),\n", + " # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805\n", + " pad_on_left=bool(args.model_type in [\"xlnet\"]),\n", + " # pad on the left for xlnet\n", + " pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],\n", + " pad_token_segment_id=4 if args.model_type in [\"xlnet\"] else 0,\n", + " pad_token_label_id=pad_token_label_id,\n", + " )\n", + " # if args.local_rank in [-1, 0]:\n", + " # logger.info(\"Saving features into cached file %s\", cached_features_file)\n", + " # torch.save(features, cached_features_file)\n", + "\n", + " if args.local_rank == 0 and mode == \"train\":\n", + " torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache\n", + "\n", + " self.features = features\n", + " # Convert to Tensors and build dataset\n", + " self.all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\n", + " self.all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\n", + " self.all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)\n", + " self.all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)\n", + " self.all_bboxes = torch.tensor([f.boxes for f in features], dtype=torch.long)\n", + "\n", + " def __len__(self):\n", + " return len(self.features)\n", + 
"\n", + " def __getitem__(self, index):\n", + " return (\n", + " self.all_input_ids[index],\n", + " self.all_input_mask[index],\n", + " self.all_segment_ids[index],\n", + " self.all_label_ids[index],\n", + " self.all_bboxes[index],\n", + " )\n", + "\n", + "\n", + "class InputExample(object):\n", + " \"\"\"A single training/test example for token classification.\"\"\"\n", + "\n", + " def __init__(self, guid, words, labels, boxes, actual_bboxes, file_name, page_size):\n", + " \"\"\"Constructs a InputExample.\n", + "\n", + " Args:\n", + " guid: Unique id for the example.\n", + " words: list. The words of the sequence.\n", + " labels: (Optional) list. The labels for each word of the sequence. This should be\n", + " specified for train and dev examples, but not for test examples.\n", + " \"\"\"\n", + " self.guid = guid\n", + " self.words = words\n", + " self.labels = labels\n", + " self.boxes = boxes\n", + " self.actual_bboxes = actual_bboxes\n", + " self.file_name = file_name\n", + " self.page_size = page_size\n", + "\n", + "\n", + "class InputFeatures(object):\n", + " \"\"\"A single set of features of data.\"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " input_ids,\n", + " input_mask,\n", + " segment_ids,\n", + " label_ids,\n", + " boxes,\n", + " actual_bboxes,\n", + " file_name,\n", + " page_size,\n", + " ):\n", + " assert (\n", + " 0 <= all(boxes) <= 1000\n", + " ), \"Error with input bbox ({}): the coordinate value is not between 0 and 1000\".format(boxes)\n", + " self.input_ids = input_ids\n", + " self.input_mask = input_mask\n", + " self.segment_ids = segment_ids\n", + " self.label_ids = label_ids\n", + " self.boxes = boxes\n", + " self.actual_bboxes = actual_bboxes\n", + " self.file_name = file_name\n", + " self.page_size = page_size\n", + "\n", + "\n", + "def read_examples_from_file(data_dir, mode):\n", + " file_path = os.path.join(data_dir, \"{}.txt\".format(mode))\n", + " box_file_path = os.path.join(data_dir, \"{}_box.txt\".format(mode))\n", + " 
image_file_path = os.path.join(data_dir, \"{}_image.txt\".format(mode))\n", + " guid_index = 1\n", + " examples = []\n", + " with open(file_path, encoding=\"utf-8\") as f, open(box_file_path, encoding=\"utf-8\") as fb, open(\n", + " image_file_path, encoding=\"utf-8\"\n", + " ) as fi:\n", + " words = []\n", + " boxes = []\n", + " actual_bboxes = []\n", + " file_name = None\n", + " page_size = None\n", + " labels = []\n", + " for line, bline, iline in zip(f, fb, fi):\n", + " if line.startswith(\"-DOCSTART-\") or line == \"\" or line == \"\\n\":\n", + " if words:\n", + " examples.append(\n", + " InputExample(\n", + " guid=\"{}-{}\".format(mode, guid_index),\n", + " words=words,\n", + " labels=labels,\n", + " boxes=boxes,\n", + " actual_bboxes=actual_bboxes,\n", + " file_name=file_name,\n", + " page_size=page_size,\n", + " )\n", + " )\n", + " guid_index += 1\n", + " words = []\n", + " boxes = []\n", + " actual_bboxes = []\n", + " file_name = None\n", + " page_size = None\n", + " labels = []\n", + " else:\n", + " splits = line.split(\"\\t\")\n", + " bsplits = bline.split(\"\\t\")\n", + " isplits = iline.split(\"\\t\")\n", + " assert len(splits) == 2\n", + " assert len(bsplits) == 2\n", + " assert len(isplits) == 4\n", + " assert splits[0] == bsplits[0]\n", + " words.append(splits[0])\n", + " if len(splits) > 1:\n", + " labels.append(splits[-1].replace(\"\\n\", \"\"))\n", + " box = bsplits[-1].replace(\"\\n\", \"\")\n", + " box = [int(b) for b in box.split()]\n", + " boxes.append(box)\n", + " actual_bbox = [int(b) for b in isplits[1].split()]\n", + " actual_bboxes.append(actual_bbox)\n", + " page_size = [int(i) for i in isplits[2].split()]\n", + " file_name = isplits[3].strip()\n", + " else:\n", + " # Examples could have no label for mode = \"test\"\n", + " labels.append(\"O\")\n", + " if words:\n", + " examples.append(\n", + " InputExample(\n", + " guid=\"%s-%d\".format(mode, guid_index),\n", + " words=words,\n", + " labels=labels,\n", + " boxes=boxes,\n", + " 
actual_bboxes=actual_bboxes,\n", + " file_name=file_name,\n", + " page_size=page_size,\n", + " )\n", + " )\n", + " return examples\n", + "\n", + "\n", + "def convert_examples_to_features(\n", + " examples,\n", + " label_list,\n", + " max_seq_length,\n", + " tokenizer,\n", + " cls_token_at_end=False,\n", + " cls_token=\"[CLS]\",\n", + " cls_token_segment_id=1,\n", + " sep_token=\"[SEP]\",\n", + " sep_token_extra=False,\n", + " pad_on_left=False,\n", + " pad_token=0,\n", + " cls_token_box=[0, 0, 0, 0],\n", + " sep_token_box=[1000, 1000, 1000, 1000],\n", + " pad_token_box=[0, 0, 0, 0],\n", + " pad_token_segment_id=0,\n", + " pad_token_label_id=-1,\n", + " sequence_a_segment_id=0,\n", + " mask_padding_with_zero=True,\n", + "):\n", + " \"\"\"Loads a data file into a list of `InputBatch`s\n", + " `cls_token_at_end` define the location of the CLS token:\n", + " - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]\n", + " - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]\n", + " `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)\n", + " \"\"\"\n", + "\n", + " label_map = {label: i for i, label in enumerate(label_list)}\n", + "\n", + " features = []\n", + " for ex_index, example in enumerate(examples):\n", + " file_name = example.file_name\n", + " page_size = example.page_size\n", + " width, height = page_size\n", + " if ex_index % 10000 == 0:\n", + " logger.info(\"Writing example %d of %d\", ex_index, len(examples))\n", + "\n", + " tokens = []\n", + " token_boxes = []\n", + " actual_bboxes = []\n", + " label_ids = []\n", + " for word, label, box, actual_bbox in zip(example.words, example.labels, example.boxes, example.actual_bboxes):\n", + " word_tokens = tokenizer.tokenize(word)\n", + " tokens.extend(word_tokens)\n", + " token_boxes.extend([box] * len(word_tokens))\n", + " actual_bboxes.extend([actual_bbox] * len(word_tokens))\n", + " # Use the real label id for the first token of the word, and padding 
ids for the remaining tokens\n", + " label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))\n", + "\n", + " # Account for [CLS] and [SEP] with \"- 2\" and with \"- 3\" for RoBERTa.\n", + " special_tokens_count = 3 if sep_token_extra else 2\n", + " if len(tokens) > max_seq_length - special_tokens_count:\n", + " tokens = tokens[: (max_seq_length - special_tokens_count)]\n", + " token_boxes = token_boxes[: (max_seq_length - special_tokens_count)]\n", + " actual_bboxes = actual_bboxes[: (max_seq_length - special_tokens_count)]\n", + " label_ids = label_ids[: (max_seq_length - special_tokens_count)]\n", + "\n", + " # The convention in BERT is:\n", + " # (a) For sequence pairs:\n", + " # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n", + " # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1\n", + " # (b) For single sequences:\n", + " # tokens: [CLS] the dog is hairy . [SEP]\n", + " # type_ids: 0 0 0 0 0 0 0\n", + " #\n", + " # Where \"type_ids\" are used to indicate whether this is the first\n", + " # sequence or the second sequence. The embedding vectors for `type=0` and\n", + " # `type=1` were learned during pre-training and are added to the wordpiece\n", + " # embedding vector (and position vector). This is not *strictly* necessary\n", + " # since the [SEP] token unambiguously separates the sequences, but it makes\n", + " # it easier for the model to learn the concept of sequences.\n", + " #\n", + " # For classification tasks, the first vector (corresponding to [CLS]) is\n", + " # used as as the \"sentence vector\". 
Note that this only makes sense because\n", + " # the entire model is fine-tuned.\n", + " tokens += [sep_token]\n", + " token_boxes += [sep_token_box]\n", + " actual_bboxes += [[0, 0, width, height]]\n", + " label_ids += [pad_token_label_id]\n", + " if sep_token_extra:\n", + " # roberta uses an extra separator b/w pairs of sentences\n", + " tokens += [sep_token]\n", + " token_boxes += [sep_token_box]\n", + " actual_bboxes += [[0, 0, width, height]]\n", + " label_ids += [pad_token_label_id]\n", + " segment_ids = [sequence_a_segment_id] * len(tokens)\n", + "\n", + " if cls_token_at_end:\n", + " tokens += [cls_token]\n", + " token_boxes += [cls_token_box]\n", + " actual_bboxes += [[0, 0, width, height]]\n", + " label_ids += [pad_token_label_id]\n", + " segment_ids += [cls_token_segment_id]\n", + " else:\n", + " tokens = [cls_token] + tokens\n", + " token_boxes = [cls_token_box] + token_boxes\n", + " actual_bboxes = [[0, 0, width, height]] + actual_bboxes\n", + " label_ids = [pad_token_label_id] + label_ids\n", + " segment_ids = [cls_token_segment_id] + segment_ids\n", + "\n", + " input_ids = tokenizer.convert_tokens_to_ids(tokens)\n", + "\n", + " # The mask has 1 for real tokens and 0 for padding tokens. 
Only real\n", + " # tokens are attended to.\n", + " input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)\n", + "\n", + " # Zero-pad up to the sequence length.\n", + " padding_length = max_seq_length - len(input_ids)\n", + " if pad_on_left:\n", + " input_ids = ([pad_token] * padding_length) + input_ids\n", + " input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask\n", + " segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids\n", + " label_ids = ([pad_token_label_id] * padding_length) + label_ids\n", + " token_boxes = ([pad_token_box] * padding_length) + token_boxes\n", + " else:\n", + " input_ids += [pad_token] * padding_length\n", + " input_mask += [0 if mask_padding_with_zero else 1] * padding_length\n", + " segment_ids += [pad_token_segment_id] * padding_length\n", + " label_ids += [pad_token_label_id] * padding_length\n", + " token_boxes += [pad_token_box] * padding_length\n", + "\n", + " assert len(input_ids) == max_seq_length\n", + " assert len(input_mask) == max_seq_length\n", + " assert len(segment_ids) == max_seq_length\n", + " assert len(label_ids) == max_seq_length\n", + " assert len(token_boxes) == max_seq_length\n", + "\n", + " if ex_index < 5:\n", + " logger.info(\"*** Example ***\")\n", + " logger.info(\"guid: %s\", example.guid)\n", + " logger.info(\"tokens: %s\", \" \".join([str(x) for x in tokens]))\n", + " logger.info(\"input_ids: %s\", \" \".join([str(x) for x in input_ids]))\n", + " logger.info(\"input_mask: %s\", \" \".join([str(x) for x in input_mask]))\n", + " logger.info(\"segment_ids: %s\", \" \".join([str(x) for x in segment_ids]))\n", + " logger.info(\"label_ids: %s\", \" \".join([str(x) for x in label_ids]))\n", + " logger.info(\"boxes: %s\", \" \".join([str(x) for x in token_boxes]))\n", + " logger.info(\"actual_bboxes: %s\", \" \".join([str(x) for x in actual_bboxes]))\n", + "\n", + " features.append(\n", + " InputFeatures(\n", + " input_ids=input_ids,\n", + " 
input_mask=input_mask,\n", + " segment_ids=segment_ids,\n", + " label_ids=label_ids,\n", + " boxes=token_boxes,\n", + " actual_bboxes=actual_bboxes,\n", + " file_name=file_name,\n", + " page_size=page_size,\n", + " )\n", + " )\n", + " return features" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "id": "HUJftzeBWh2S" + }, + "outputs": [], + "source": [ + "from transformers import LayoutLMTokenizer\n", + "\n", + "# from .unilm.layoutlm.data.funsd import FunsdDataset, InputFeatures\n", + "from torch.utils.data import DataLoader, RandomSampler, SequentialSampler\n", + "\n", + "batch_size = 16\n", + "args = {\n", + " \"local_rank\": -1,\n", + " \"overwrite_cache\": True,\n", + " \"data_dir\": \"data/\",\n", + " \"model_name_or_path\": \"microsoft/layoutlm-base-uncased\",\n", + " \"max_seq_length\": 512,\n", + " \"model_type\": \"layoutlm\",\n", + "}\n", + "\n", + "\n", + "# class to turn the keys of a dict into attributes (thanks Stackoverflow)\n", + "class AttrDict(dict):\n", + " def __init__(self, *args, **kwargs):\n", + " super(AttrDict, self).__init__(*args, **kwargs)\n", + " self.__dict__ = self\n", + "\n", + "\n", + "args = AttrDict(args)\n", + "\n", + "tokenizer = LayoutLMTokenizer.from_pretrained(\"microsoft/layoutlm-base-uncased\")\n", + "\n", + "# the LayoutLM authors already defined a specific FunsdDataset, so we are going to use this here\n", + "train_dataset = FunsdDataset(args, tokenizer, labels, pad_token_label_id, mode=\"train\")\n", + "train_sampler = RandomSampler(train_dataset)\n", + "train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)\n", + "\n", + "eval_dataset = FunsdDataset(args, tokenizer, labels, pad_token_label_id, mode=\"test\")\n", + "eval_sampler = SequentialSampler(eval_dataset)\n", + "eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=batch_size)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "colab": { + 
"base_uri": "https://localhost:8080/" + }, + "id": "18NMUBzgOdqu", + "outputId": "eef47b70-3a9a-4b19-be6b-95900c58337b" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "10" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(train_dataloader)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "toFjxtn71B1U", + "outputId": "f4651896-cafc-449a-98b4-c81f41177e6d" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(eval_dataloader)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 137 + }, + "id": "RhINSBw9I24G", + "outputId": "28738ce2-617c-47d3-b8c9-f949d3066d60" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'[CLS] account agency ss : date 1 / 31 / 88 insert signed 91581919 4th new albany tribune advertisements. bookkeeper mar 08 reco affidavit of performance from newspaper new albany tribune harley davidson cigarettes lorillard media service state of county of indiana ) floyd ) before me a notary public, personally appeared holly inzer who being duly sworn, says that ( he ) ( she ) is of the abovementioned newspaper and that display ads for the above account were made through the aforesaid newspaper during the month of january, 1988 as follows : column inches exclusive advertising for harley davidson cigarettes we hereby certify charges shown above on dates per attached bill are true and correct as billed to the account in upper right hand corner of the affidavit and are exclusive sworn to and subscribed before me this day of march, 1988 in testimony whereof i have set my hand and seal the day and year aforesaid. my commission expires : 2 - 9 - 90 betty j. 
murphy ( notary public}, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "batch = next(iter(train_dataloader))\n", + "input_ids = batch[0][0]\n", + "tokenizer.decode(input_ids)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "66cEmLDoUFcm" + }, + "source": [ + "## Define and fine-tune the model\n", + "\n", + "As this is a sequence labeling task, we are going to load `LayoutLMForTokenClassification` (the base sized model) from the hub. We are going to fine-tune it on a downstream task, namely FUNSD." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LoraConfig(task_type=, peft_type=, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=16, target_modules=None, exclude_modules=None, lora_alpha=16, lora_dropout=0.1, fan_in_fan_out=False, bias='all', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, use_qalora=False, qalora_group_size=16, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False, target_parameters=None)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from peft import get_peft_config, PeftModel, get_peft_model, LoraConfig, TaskType\n", + "\n", + "peft_config = LoraConfig(\n", + " task_type=TaskType.TOKEN_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias=\"all\"\n", + ")\n", + "peft_config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + 
"da1094982d044ab28eb0effebbfcbb78", + "513e00b619924f5693259cd919a927ab", + "63e819a04f6e4829838c0e30e65516ed", + "d1c3e1a66db04227a74ef8d6481d6daf", + "c75f0da13a1e4dbe94800711d55390a6", + "31642aacae2a44879960da09f938ecc4", + "138a6b922e454ebbaeb315ecd5f476b8", + "a8126ba98376402888e9ba344cf1c538" + ] + }, + "id": "xIdOsFBiTsuw", + "outputId": "95e8811c-025a-41a0-9d03-4285a17f2a9b" + }, + "outputs": [], + "source": [ + "from transformers import LayoutLMForTokenClassification\n", + "import torch\n", + "from transformers import set_seed\n", + "\n", + "seed = 100\n", + "set_seed(seed)\n", + "device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n", + "\n", + "model = LayoutLMForTokenClassification.from_pretrained(\"microsoft/layoutlm-base-uncased\", num_labels=num_labels)\n", + "model = get_peft_model(model, peft_config)\n", + "model.to(device)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(model.model.layoutlm.encoder.layer[0].attention.self.query.weight)\n", + "print(model.model.layoutlm.encoder.layer[0].attention.self.query.lora_A.default.weight)\n", + "print(model.model.classifier.weight)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3weFr_pz1mla" + }, + "source": [ + "Now we can start training:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Yu0qePs2cRKo", + "outputId": "cdbb9a03-eb9b-4740-bbe3-da06b9192bae" + }, + "outputs": [], + "source": [ + "from transformers import get_linear_schedule_with_warmup\n", + "from tqdm import tqdm\n", + "\n", + "num_train_epochs = 100\n", + "\n", + "optimizer = torch.optim.AdamW(model.parameters(), lr=3e-3)\n", + "lr_scheduler = get_linear_schedule_with_warmup(\n", + " optimizer=optimizer,\n", + " num_warmup_steps=0.06 * (len(train_dataloader) * num_train_epochs),\n", + " 
num_training_steps=(len(train_dataloader) * num_train_epochs),\n", + ")\n", + "\n", + "\n", + "global_step = 0\n", + "\n", + "t_total = len(train_dataloader) * num_train_epochs # total number of training steps\n", + "\n", + "# put the model in training mode\n", + "model.train()\n", + "for epoch in range(num_train_epochs):\n", + " for batch in tqdm(train_dataloader, desc=\"Training\"):\n", + " input_ids = batch[0].to(device)\n", + " bbox = batch[4].to(device)\n", + " attention_mask = batch[1].to(device)\n", + " token_type_ids = batch[2].to(device)\n", + " labels = batch[3].to(device)\n", + "\n", + " # forward pass\n", + " outputs = model(\n", + " input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels\n", + " )\n", + " loss = outputs.loss\n", + "\n", + " # print loss every 10 steps\n", + " if global_step % 10 == 0:\n", + " print(f\"Loss after {global_step} steps: {loss.item()}\")\n", + "\n", + " # backward pass to get the gradients\n", + " loss.backward()\n", + "\n", + " # print(\"Gradients on classification head:\")\n", + " # print(model.classifier.weight.grad[6,:].sum())\n", + "\n", + " # update\n", + " optimizer.step()\n", + " lr_scheduler.step()\n", + " optimizer.zero_grad()\n", + " global_step += 1" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "u1rNslap5Y3N", + "outputId": "877183d4-1d29-4d09-bd3a-0e5f88611dc8" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Evaluating: 100%|██████████| 4/4 [00:00<00:00, 10.90it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'loss': 1.467050313949585, 'precision': 0.7295341474445952, 'recall': 0.806903451725863, 'f1': 0.7662707838479811}\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from seqeval.metrics import (\n", + " classification_report,\n", + " f1_score,\n", + " precision_score,\n", + " 
recall_score,\n", + ")\n", + "\n", + "eval_loss = 0.0\n", + "nb_eval_steps = 0\n", + "preds = None\n", + "out_label_ids = None\n", + "\n", + "# put model in evaluation mode\n", + "model.eval()\n", + "for batch in tqdm(eval_dataloader, desc=\"Evaluating\"):\n", + " with torch.no_grad():\n", + " input_ids = batch[0].to(device)\n", + " bbox = batch[4].to(device)\n", + " attention_mask = batch[1].to(device)\n", + " token_type_ids = batch[2].to(device)\n", + " labels = batch[3].to(device)\n", + "\n", + " # forward pass\n", + " outputs = model(\n", + " input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels\n", + " )\n", + " # get the loss and logits\n", + " tmp_eval_loss = outputs.loss\n", + " logits = outputs.logits\n", + "\n", + " eval_loss += tmp_eval_loss.item()\n", + " nb_eval_steps += 1\n", + "\n", + " # compute the predictions\n", + " if preds is None:\n", + " preds = logits.detach().cpu().numpy()\n", + " out_label_ids = labels.detach().cpu().numpy()\n", + " else:\n", + " preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)\n", + " out_label_ids = np.append(out_label_ids, labels.detach().cpu().numpy(), axis=0)\n", + "\n", + "# compute average evaluation loss\n", + "eval_loss = eval_loss / nb_eval_steps\n", + "preds = np.argmax(preds, axis=2)\n", + "\n", + "out_label_list = [[] for _ in range(out_label_ids.shape[0])]\n", + "preds_list = [[] for _ in range(out_label_ids.shape[0])]\n", + "\n", + "for i in range(out_label_ids.shape[0]):\n", + " for j in range(out_label_ids.shape[1]):\n", + " if out_label_ids[i, j] != pad_token_label_id:\n", + " out_label_list[i].append(label_map[out_label_ids[i][j]])\n", + " preds_list[i].append(label_map[preds[i][j]])\n", + "\n", + "results = {\n", + " \"loss\": eval_loss,\n", + " \"precision\": precision_score(out_label_list, preds_list),\n", + " \"recall\": recall_score(out_label_list, preds_list),\n", + " \"f1\": f1_score(out_label_list, preds_list),\n", + "}\n", + 
"print(results)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "trainable params: 702,733 || all params: 113,237,786 || trainable%: 0.6206\n" + ] + } + ], + "source": [ + "model.print_trainable_parameters()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "model.save_pretrained(\"peft_layoutlm\")" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.8M\t./peft_layoutlm/adapter_model.safetensors\n" + ] + } + ], + "source": [ + "!du -h ./peft_layoutlm/adapter_model.safetensors" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "include_colab_link": true, + "name": "Fine-tuning LayoutLMForTokenClassification on FUNSD.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + }, + "vscode": { + "interpreter": { + "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "138a6b922e454ebbaeb315ecd5f476b8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + 
"31642aacae2a44879960da09f938ecc4": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "513e00b619924f5693259cd919a927ab": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + 
"height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "63e819a04f6e4829838c0e30e65516ed": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "Downloading: 100%", + "description_tooltip": null, + "layout": "IPY_MODEL_31642aacae2a44879960da09f938ecc4", + "max": 453093832, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_c75f0da13a1e4dbe94800711d55390a6", + "value": 453093832 + } + }, + "a8126ba98376402888e9ba344cf1c538": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, 
+ "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c75f0da13a1e4dbe94800711d55390a6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "initial" + } + }, + "d1c3e1a66db04227a74ef8d6481d6daf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a8126ba98376402888e9ba344cf1c538", + "placeholder": "​", + "style": "IPY_MODEL_138a6b922e454ebbaeb315ecd5f476b8", + "value": " 453M/453M [00:15<00:00, 30.0MB/s]" + } + }, + "da1094982d044ab28eb0effebbfcbb78": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_63e819a04f6e4829838c0e30e65516ed", + "IPY_MODEL_d1c3e1a66db04227a74ef8d6481d6daf" + ], + "layout": 
"IPY_MODEL_513e00b619924f5693259cd919a927ab" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/peft/examples/token_classification/requirements.txt b/peft/examples/token_classification/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cde65e15268b67b28864f3fd7e6932fb7949802 --- /dev/null +++ b/peft/examples/token_classification/requirements.txt @@ -0,0 +1,7 @@ +transformers +accelerate +evaluate +tqdm +datasets +Pillow +torchvision \ No newline at end of file diff --git a/peft/examples/waveft_finetuning/README.md b/peft/examples/waveft_finetuning/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ad2d231698ad24381e289f18a0fe81029d11930f --- /dev/null +++ b/peft/examples/waveft_finetuning/README.md @@ -0,0 +1,64 @@ + + +# WaveFT: Wavelet Fine-Tuning + +## Introduction +[WaveFT](https://arxiv.org/abs/2505.12532) is a novel parameter-efficient fine-tuning (PEFT) method that introduces sparse updates in the **wavelet domain** of residual matrices. Unlike LoRA, which is constrained by discrete low-rank choices, WaveFT enables fine-grained control over the number of trainable parameters by directly learning a sparse set of coefficients in the transformed space. These coefficients are then mapped back to the weight domain via the Inverse Discrete Wavelet Transform (IDWT), producing high-rank updates without incurring inference overhead. 
+ +## Quick start +```python +import torch +from peft import WaveFTConfig, get_peft_model +from transformers import AutoTokenizer, AutoModelForCausalLM +from trl import SFTConfig, SFTTrainer +from datasets import load_dataset + +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", dtype=torch.bfloat16, device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +dataset = load_dataset("imdb", split="train[:1%]") +waveft_config = WaveFTConfig( + n_frequency=2592, +) +peft_model = get_peft_model(model, waveft_config) +training_args = SFTConfig(dataset_text_field="text", max_seq_length=128) +trainer = SFTTrainer( + model=peft_model, + train_dataset=dataset, + processing_class=tokenizer, +) +trainer.train() +peft_model.save_pretrained("waveft-opt-350m") +``` + +For more options and a more detailed example code, you can refer to waveft finetuning script. +Run the script simply by running: +```bash +python3 examples/waveft_finetuning/waveft_finetuning.py --base_model facebook/opt-350m +``` + +If you want to run DDP by [accelerate](https://huggingface.co/docs/accelerate/en/index), please run `accelerate config` to set your ddp config, and run: +```bash +accelerate launch examples/waveft_finetuning/waveft_finetuning.py --base_model facebook/opt-350m +``` +please add `--device_map cpu` if you want to run finetune on CPU. + +## Use the model +You can load and use the model as any other 🤗 PEFT model +```python +from peft import PeftModel +from transformers import AutoTokenizer, AutoModelForCausalLM +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m") +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +waveft_model = PeftModel.from_pretrained(model, "waveft-opt-350m") +``` + +## Citation +@misc{bilican2025exploringsparsityparameterefficient, + title={Exploring Sparsity for Parameter Efficient Fine Tuning Using Wavelets}, + author={Ahmet Bilican and M. Akın Yılmaz and A. Murat Tekalp and R. 
Gökberk Cinbiş}, + year={2025}, + eprint={2505.12532}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2505.12532}, +} \ No newline at end of file diff --git a/peft/examples/waveft_finetuning/waveft_finetuning.py b/peft/examples/waveft_finetuning/waveft_finetuning.py new file mode 100644 index 0000000000000000000000000000000000000000..1855f86a26132da9d914e5d5ac504bdef42d54b1 --- /dev/null +++ b/peft/examples/waveft_finetuning/waveft_finetuning.py @@ -0,0 +1,189 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +from typing import Optional + +import torch +import transformers +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed + +from peft import ( + WaveFTConfig, + get_peft_model, +) + + +def train( + base_model: str, + data_path: str = "yahma/alpaca-cleaned", + output_dir: str = "waveft", + batch_size: int = 16, + num_epochs: int = 1, + learning_rate: float = 3e-4, + cutoff_len: int = 256, + val_set_size: int = 16, + eval_step: int = 100, + save_step: int = 100, + device_map: str = "auto", + waveft_n_frequency: int = 2592, + waveft_target_modules: list[str] = None, + waveft_scaling: float = 25.0, + waveft_wavelet_family: str = "db1", + waveft_use_idwt: bool = True, + torch_dtype: str = "float16", + seed: Optional[int] = None, +): + # Set device_map to the right place when enabling DDP. 
+ world_size = int(os.environ.get("WORLD_SIZE", 0)) or int(os.environ.get("PMI_SIZE", 0)) + if world_size > 1 and device_map != "cpu": + from accelerate import Accelerator + + device_map = {"": Accelerator().process_index} + # Set seed + if seed is not None: + set_seed(seed) + model_kwargs = {"dtype": getattr(torch, torch_dtype), "device_map": device_map} + model = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs) + + tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True) + # For some tokenizer with no pad token like llama + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + def tokenize(prompt, add_eos_token=True): + result = tokenizer( + prompt, + truncation=True, + max_length=cutoff_len, + padding=False, + return_tensors=None, + ) + if ( + result["input_ids"][-1] != tokenizer.eos_token_id + and len(result["input_ids"]) < cutoff_len + and add_eos_token + ): + result["input_ids"].append(tokenizer.eos_token_id) + result["attention_mask"].append(1) + + result["labels"] = result["input_ids"].copy() + + return result + + def generate_and_tokenize_prompt(example): + full_prompt = generate_prompt(example) + tokenized_full_prompt = tokenize(full_prompt) + return tokenized_full_prompt + + config = WaveFTConfig( + n_frequency=waveft_n_frequency, + scaling=waveft_scaling, + wavelet_family=waveft_wavelet_family, + use_idwt=waveft_use_idwt, + target_modules=waveft_target_modules, + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset(data_path) + + train_val = data["train"].train_test_split(test_size=val_set_size, shuffle=True, seed=42) + train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt) + val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt) + + trainer = transformers.Trainer( + model=model, + train_dataset=train_data, + eval_dataset=val_data, + args=transformers.TrainingArguments( + per_device_train_batch_size=batch_size, + 
warmup_steps=100, + num_train_epochs=num_epochs, + learning_rate=learning_rate, + logging_steps=100, + optim="adamw_torch", + eval_strategy="steps", + save_strategy="steps", + eval_steps=eval_step, + save_steps=save_step, + output_dir=output_dir, + save_total_limit=3, + load_best_model_at_end=True, + ddp_find_unused_parameters=False if world_size > 1 else None, + ), + data_collator=transformers.DataCollatorForSeq2Seq( + tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True + ), + ) + trainer.train() + model.save_pretrained(output_dir) + + +def generate_prompt(example): + return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request. + ### Instruction: + {example["instruction"]} + ### Response: + {example["output"]}""" + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--base_model", type=str) + parser.add_argument("--data_path", type=str, default="yahma/alpaca-cleaned") + parser.add_argument("--output_dir", type=str, default="waveft") + parser.add_argument("--batch_size", type=int, default=16) + parser.add_argument("--num_epochs", type=int, default=1) + parser.add_argument("--learning_rate", type=float, default=3e-4) + parser.add_argument("--cutoff_len", type=int, default=256) + parser.add_argument("--val_set_size", type=int, default=16) + parser.add_argument("--eval_step", type=int, default=100) + parser.add_argument("--save_step", type=int, default=100) + parser.add_argument("--device_map", type=str, default="auto") + parser.add_argument("--waveft_n_frequency", type=int, default=2592) + parser.add_argument("--waveft_target_modules", type=str, default=None) + parser.add_argument("--waveft_scaling", type=float, default=25.0) + parser.add_argument("--waveft_wavelet_family", type=str, default="db1") + parser.add_argument("--waveft_use_idwt", action="store_true", default=True) + parser.add_argument("--torch_dtype", type=str, 
default="float16") + parser.add_argument("--seed", type=int, default=None) + + args = parser.parse_args() + + train( + base_model=args.base_model, + data_path=args.data_path, + output_dir=args.output_dir, + batch_size=args.batch_size, + num_epochs=args.num_epochs, + learning_rate=args.learning_rate, + cutoff_len=args.cutoff_len, + val_set_size=args.val_set_size, + eval_step=args.eval_step, + save_step=args.save_step, + device_map=args.device_map, + waveft_n_frequency=args.waveft_n_frequency, + waveft_target_modules=args.waveft_target_modules, + waveft_scaling=args.waveft_scaling, + waveft_wavelet_family=args.waveft_wavelet_family, + waveft_use_idwt=args.waveft_use_idwt, + torch_dtype=args.torch_dtype, + seed=args.seed, + ) diff --git a/peft/examples/xlora/README.md b/peft/examples/xlora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ae81cfcd45e5940262504a95dc4805d45bdf280d --- /dev/null +++ b/peft/examples/xlora/README.md @@ -0,0 +1,15 @@ +# X-LoRA examples + +## `xlora_inference_mistralrs.py` + +Perform inference of an X-LoRA model using the inference engine mistral.rs. + +Mistral.rs supports many base models besides Mistral, and can load models directly from saved LoRA checkpoints. Check out [adapter model docs](https://github.com/EricLBuehler/mistral.rs/blob/master/docs/ADAPTER_MODELS.md) and the [models support matrix](https://github.com/EricLBuehler/mistral.rs?tab=readme-ov-file#support-matrix). + +Mistral.rs features X-LoRA support and incorporates techniques such as a dual-KV cache, continuous batching, Paged Attention, and optional non-granular scalings, which allow vastly improved throughput. 
+ +Links: + +- Installation: https://github.com/EricLBuehler/mistral.rs/blob/master/mistralrs-pyo3/README.md +- Runnable example: https://github.com/EricLBuehler/mistral.rs/blob/master/examples/python/xlora_zephyr.py +- Adapter model docs and making the ordering file: https://github.com/EricLBuehler/mistral.rs/blob/master/docs/ADAPTER_MODELS.md \ No newline at end of file diff --git a/peft/examples/xlora/xlora_inference_mistralrs.py b/peft/examples/xlora/xlora_inference_mistralrs.py new file mode 100644 index 0000000000000000000000000000000000000000..8a3f4de4261ea5613aa3acbfc12930bc89f1344b --- /dev/null +++ b/peft/examples/xlora/xlora_inference_mistralrs.py @@ -0,0 +1,25 @@ +from mistralrs import ChatCompletionRequest, Runner, Which + + +runner = Runner( + which=Which.XLora( + tok_model_id=None, # Automatically determine from ordering file + model_id=..., # Model ID of the base model (local path or HF model ID) + xlora_model_id=..., # X-LoRA Model ID of the base model (local path or HF model ID) + order=..., # Ordering file to ensure compatibility with PEFT + tgt_non_granular_index=3, # Only generate scalings for the first 3 decoding tokens, and then use the last generated one + ) +) + +res = runner.send_chat_completion_request( + ChatCompletionRequest( + model="mistral", + messages=[{"role": "user", "content": "Tell me a story about 2 low rank matrices."}], + max_tokens=256, + presence_penalty=1.0, + top_p=0.1, + temperature=0.5, + ) +) +print(res.choices[0].message.content) +print(res.usage) diff --git a/peft/method_comparison/MetaMathQA/Makefile b/peft/method_comparison/MetaMathQA/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..47ce7ab90d5c75e52b57d73a34b35a9056229d83 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/Makefile @@ -0,0 +1,90 @@ +# Makefile for running MetaMathQA experiments. 
+ +# --- Configuration --- +PYTHON := python +RUN_SCRIPT := run.py +EXPERIMENTS_DIR := experiments +RESULTS_DIR := results + +# --- Automatic Experiment and Result Discovery --- + +# 1. Find all experiment directories by looking for adapter_config.json or training_params.json files. +# This gives us a list like: experiments/lora/llama-3.2-3B-rank32 ... +EXPERIMENT_PATHS := $(shell find $(EXPERIMENTS_DIR) \ + -name "adapter_config.json" -or \ + -name "training_params.json" | xargs dirname | sort -u) + +# 2. Define a function to replace all occurrences of a character in a string. +# This is needed to replicate the result naming logic from run.py (e.g., "lora/foo" -> "lora--foo"). +# Usage: $(call replace-all, string, char_to_replace, replacement_char) +replace-all = $(if $(findstring $(2),$(1)),$(call replace-all,$(subst $(2),$(3),$(1)),$(2),$(3)),$(1)) + +# 3. Define a function to convert an experiment path to its flat result file path. +# e.g., "experiments/lora/llama-3.2-3B-rank32" -> "results/lora--llama-3.2-3B-rank32.json" +exp_to_res = $(RESULTS_DIR)/$(call replace-all,$(patsubst $(EXPERIMENTS_DIR)/%,%,$(1)),/,--).json + +# 4. Generate the list of all target result files we want to build. +RESULT_FILES := $(foreach exp,$(EXPERIMENT_PATHS),$(call exp_to_res,$(exp))) + + +# --- Main Rules --- + +# The default 'all' target depends on all possible result files. +# Running `make` or `make all` will check and run any outdated or missing experiments. +all: $(RESULT_FILES) + + +# --- Dynamic Rule Generation --- + +# This is the core logic. We dynamically generate a specific Makefile rule for each experiment found. +# This avoids a complex pattern rule and makes the logic clearer. +define EXPERIMENT_template +# Input $1: The full experiment path (e.g., experiments/lora/llama-3.2-3B-rank32) + +# Define the rule: +# The target is the result file (e.g., results/lora--llama-3.2-3B-rank32.json). 
+# The dependencies are its config files, code changes need to be audited manually since they can +# vary in degree of importance. Note that we explicitly ignore when the script fails to run +# so that the other experiments still have a chance to run. +$(call exp_to_res,$(1)): $(wildcard $(1)/adapter_config.json) $(wildcard $(1)/training_params.json) + @echo "---" + @echo "Running experiment: $(1)" + -$(PYTHON) $(RUN_SCRIPT) -v $(1) + @echo "Finished: $$@" + @echo "---" + +endef + +# This command iterates through every found experiment path and evaluates the template, +# effectively stamping out a unique, explicit rule for each one. +$(foreach exp_path,$(EXPERIMENT_PATHS),$(eval $(call EXPERIMENT_template,$(exp_path)))) + + +# --- Utility Rules --- + +.PHONY: all clean list dump_rules + +# The 'clean' rule removes all generated results. +clean: + @echo "Cleaning results directory..." + @([ -n "$(wildcard $(RESULTS_DIR)/*.json)" ] && rm $(RESULTS_DIR)/*.json) || exit 0 + +# The 'list' rule is for debugging. It shows the discovered experiments +# and the result files the Makefile expects to create for them. +list: + @echo "Discovered experiment configurations:" + @$(foreach exp,$(EXPERIMENT_PATHS),echo " - $(exp)/adapter_config.json";) + @echo "\nTarget result files:" + @$(foreach res,$(RESULT_FILES),echo " - $(res)";) + +# The 'dump_rules' rule is for debugging. It dumps all dynamically defined rules. 
+define newline + + +endef +define DUMPED_RULES + $(foreach exp_path,$(EXPERIMENT_PATHS),$(call EXPERIMENT_template,$(exp_path))) +endef + +dump_rules: + @echo -e "$(subst $(newline),\n,${DUMPED_RULES})" diff --git a/peft/method_comparison/MetaMathQA/README.md b/peft/method_comparison/MetaMathQA/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4c762406619c56be43fdc0e93c6b6e4dbef02e58 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/README.md @@ -0,0 +1,241 @@ +# PEFT method comparison on the MetaMathQA and GSM8K datasets + +## Goal + +The goal is to provide a benchmarking framework for the different PEFT methods that are implemented. It is important that evaluating different PEFT methods is reproducible, idempotent, and version-controlled. Results for more PEFT methods can be added over time. + +## Dataset + +This task trains on the [MetaMathQA](https://huggingface.co/datasets/meta-math/MetaMathQA) dataset and validates/tests on the [GSM8K](https://huggingface.co/datasets/openai/gsm8k) dataset ("main"). + +For the model to attain good accuracy, it needs to learn to adhere to the output format and it must express basic chain of thought reasoning capabilities to get to the correct result in the first place. The task is challenging for models in the sub 7B parameter range. + +The train set uses the whole of MetaMathQA. The validation set is a random sample from the train set of GSM8K. The test set is the whole of the GSM8K test set. + +## Running + +Create an experiment in the `experiments/` folder of your choice and give it a name (the name itself does not matter but helps identify the experiment). An example would be `experiments/lora/llama-3.2-3B-rank32/`. 
Inside that directory, create 2 files: + +- `adapter_config.json` +- Optional: `training_params.json` + +Once you have created these two files, you can either + +- run the whole suite by simply calling `make` (takes >24h) +- run one specific experiment by calling `make results/<experiment_name>--<experiment_variation>.json`, + for example `results/vblora--llama-3.2-3B-default.json` + +You can get a list of all runnable experiments by running `make list`, e.g.: +``` +% make list (git)-[method-comparison-results] ⛓ peft +Discovered experiment configurations: + - experiments/ptuning/llama-3.2-3B-default/adapter_config.json + [...] + - experiments/vblora/llama-3.2-3B-default/adapter_config.json + +Target result files: + - results/ptuning--llama-3.2-3B-default.json + [...] + - results/vblora--llama-3.2-3B-default.json +``` + +In case you want to force the execution of an experiment, you can simply `touch` the respective adapter config +without modifying it. For example: + + touch experiments/vblora/llama-3.2-3B-default/adapter_config.json + make + +to run the VBLoRA default experiment again. + +### `adapter_config.json` + +This must be a valid PEFT configuration. It is easiest to create it programmatically, e.g.: + +```python +from peft import LoraConfig + +config = LoraConfig(...) +config.save_pretrained(<path-to-experiment-dir>) +``` + +### `training_params.json` + +There is a default file for the non-PEFT parameters: `default_training_params.json`. This contains all the other parameters that are relevant for training, e.g. the base model id, number of steps, batch size, learning rate, etc. If parameters that differ from the defaults are needed for a specific experiment, place a `training_params.json` into the experiment directory and adjust the parameters that need changing. The other parameters are taken from the aforementioned default config. + +For an overview of all possible arguments, you can also check the `TrainConfig` `dataclass` in `utils.py`. 
+ +### Runtime performance + +Several factors should be considered to achieve a fast runtime performance. Besides the obvious factors like `max_steps` or the base model size, we found the following factors to have a significant impact: + +#### Eval batch size + +Regarding the `batch_size_eval` parameter, it is quite critical since evaluation takes up a significant portion of the training time and batching helps with reducing that. It should be possible to choose a value that is multiple times higher than the batch size used for training (`batch_size`). You should also pay attention to the size of the validation set -- e.g. if it's 50, don't choose a `batch_size_eval` of 40, as that results in a large batch of 40 and a small batch of 10. 25 might be a better choice. Also, ensure via a quick train run that the batch size does not lead to out of memory errors -- getting this error at the very end on evaluating the test set would be quite a loss of time. + +#### Generation length + +During testing, we discovered that the validation time is greatly inflated by just a few very long generations. Those can inflate the validation time by a factor of 3 or more. At the same time, we discovered that these long generations do not help with accuracy -- in fact, if they exceed the maximum configured length, they're just cut off mid sentence and would thus produce an accuracy of 0 anyway. + +To remedy this, we now set both `max_length` and `max_new_tokens` for the generation kwargs in the default training parameters. Normally, this is not possible when using transformers, as the latter argument overrides the former. However, we have added special logic inside of `get_generation_config` which takes both and chooses the smaller of the two. This way, we can get rid of these excessively long generations, thus considerably reducing eval times, while still guaranteeing a maximum total generation length to guard against OOM errors. Testing showed that this does not hamper test accuracy. 
It is therefore recommended not to change these settings. + +#### Bucketing + +The length of the sequences in the training data can vary a lot. Therefore, if samples are taken randomly from the training dataset, we will end up with batches containing very short and very long sequences. This is bad because the batch will be padded to the longest sequence, slowing down training. The obvious solution would be to sort the whole dataset by sequence length, but this is also bad because it introduces an order bias (e.g. first training on only short and then on only long answers). + +The solution is to find a trade-off between the two factors. This is achieved by the `BucketIterator`. It first creates buckets that contain multiple batches, e.g. 20x the batch size. The bucket is then sorted by sequence length and then batches are yielded from the bucket. Therefore, we have a small order bias within a bucket but not between buckets, striking a good balance between training speed and training loss. + +From practical experiments, for a batch size of 4, a bucket size of 80 provides a good balance with only slightly lower training loss but cutting training time by 25%. For eval, we don't use the iterator since there, the batch size is relatively big and thus there is little upside. + +### Start a run + +Once everything is set up properly, start a run by using the `run.py` script. Pass `-v` for verbose output to the console (recommended if observing the progress is desired). As an example, for `experiments/lora/llama-3.2-3B-rank32/` the invocation would be: + +```sh +python run.py -v experiments/lora/llama-3.2-3B-rank32/ +``` + +By default, the adapter will be saved in a temporary file for further inspection if needed. To prevent this, add the `--clean` flag to the call. + +### Run status + +The run can be categorized into 3 different states: + +1. Main run: You are on the `main` branch and the run ended successfully. 
The results are stored in the `results` folder and are used for further analysis. +2. Test run: You are not on the `main` branch and the run ended successfully. The results are stored in the `temporary_results` folder and are not used for further analysis. +3. The run was cancelled (`ctrl + c`). The results are stored in the `cancelled_results` folder and are not used for further analysis. + +## Outputs + +Results are stored in one of the result directories. An example output could look like so: + +```js +{ + "run_info": { + "created_at": "2025-03-05T13:50:05+00:00", + "total_time": 2711.0915009640157, + "experiment_name": "ia3/lr_0.001", + "peft_branch": "ben-method-comparison", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 51, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_kwargs": { + "lr": 0.001 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "generation_kwargs": { + "max_length": 800 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "IA3", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "target_modules": [ + "v_proj", + "k_proj", + "down_proj" + ], + "exclude_modules": null, + "feedforward_modules": [ + "down_proj" + ], + "fan_in_fan_out": false, + "modules_to_save": null, + "init_ia3_weights": true + } + }, + "train_info": { + "accelerator_memory_reserved_avg": 14229219940, + "accelerator_memory_max": 24847056896, + "accelerator_memory_reserved_99th": 19115624366, + "train_time": 2238.65277833899, + "file_size": 1157064, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0784313725490196, + "train loss": 1.1336498007774354, + "train samples": 1000 + }, + [...] 
+ { + "step": 5000, + "valid accuracy": 0.21568627450980393, + "train loss": 0.6345920492410659, + "train samples": 20000 + }, + { + "step": 5000, + "test accuracy": 0.35129740518962077, + "train loss": 0.6345920492410659, + "train samples": 20000, + "train total tokens": 4197579 + } + ] + }, + "meta_info": { + "model_sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "model_created_at": "2024-09-18T15:23:48+00:00", + "dataset_sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "dataset_created_at": "2023-09-21T17:22:46+00:00", + "package_info": { + "transformers-version": "4.50.0.dev0", + "transformers-commit-hash": "752ef3fd4e70869626ec70657a770a85c0ad9219", + "peft-version": "0.14.1.dev0", + "peft-commit-hash": "a447a4e5ecd87b7d57733f4df9616a328cf130f4", + "datasets-version": "3.3.2", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.45.2", + "bitsandbytes-commit-hash": null, + "torch-version": "2.6.0+cu124", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.11.0-17-generic", + "version": "#17~24.04.2-Ubuntu SMP PREEMPT_DYNAMIC Mon Jan 20 22:48:29 UTC 2", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA GeForce RTX 4090" + }, + "pytorch_info": "PyTorch built with: [...]" + } +} +``` + +## Dependencies + +Apart from the normal PEFT dependencies, ensure that the packages in the `requirements.txt` are installed, e.g. via: + +```sh +python -m pip install -r requirements.txt +``` + +Python 3.12+ is required. 
+ +## Open tasks + +- consider using `DataLoader` +- consider adding https://github.com/huggingface/Math-Verify +- consider adding `weight` argument to cross entropy calculation to downweight the EOS token, but it would require calculating the loss manually instead of relying on transformers (see https://github.com/huggingface/transformers/blob/6a876462c308bd7cd7d3ca8e93abaa7d5b02e90e/src/transformers/loss/loss_utils.py#L24-L48) +- do a sanity check against/comparison with transformers Trainer +- consider using vLLM to potentially speed up generations, at least for the test set +- using `torch.compile` leads to a huge slowdown, investigate (maybe recompiles), although it does save memory +- AMP does not appear to help, investigate +- packing of sequences (but this probably requires adjusting the attention matrix) +- clean up what gets printed and where (stdout, stderr) diff --git a/peft/method_comparison/MetaMathQA/cancelled_results/.gitkeep b/peft/method_comparison/MetaMathQA/cancelled_results/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/peft/method_comparison/MetaMathQA/data.py b/peft/method_comparison/MetaMathQA/data.py new file mode 100644 index 0000000000000000000000000000000000000000..be3ace83cfa83c211f5f41086fee9f36660363f7 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/data.py @@ -0,0 +1,109 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +All utilities related to data handling. +""" + +from functools import partial +from typing import Callable + +import datasets +import numpy as np +from datasets import Dataset, load_dataset + + +# with a token limit of 768 for query + response, we have to exclude all texts with length > 1304; this leaves 93.8% of +# the dataset. NOTE(review): CHAR_LIMIT below is 1300, not 1304 -- presumably rounded down; confirm +CHAR_LIMIT = 1300 +# train/valid/test split -- note that evaluation takes quite long, so don't choose too large sizes for the valid set, +# since it's run multiple times during training; test is only run once at the end and thus can be larger +VALID_SIZE = 50 + + +def get_filtered_dataset(*, ds: datasets.Dataset, print_fn: Callable[..., None]) -> Dataset: + """Return the filtered dataset, with samples whose query + response text exceeds CHAR_LIMIT characters removed. + + We determined that 99% of queries have 529 or fewer characters. Characters roughly correspond to tokens, so this is + a good proxy. We cannot use tokens directly, as that depends on the tokenizer, which can be different for each + model, but we want the same filter for each model. + + """ + char_lengths = [len(f"{q} {r}") for q, r in zip(ds["query"], ds["response"])] + idx_filtered = [i for i, length in enumerate(char_lengths) if length <= CHAR_LIMIT] + print_fn(f"Filtered dataset: {100 * len(idx_filtered) / len(ds):.1f}% of the original dataset") + return ds.select(idx_filtered) + + +def get_train_valid_test_datasets( + *, tokenizer, query_template: str, print_fn: Callable[..., None] +) -> tuple[Dataset, Dataset, Dataset]: + """ + Return the tokenized train, valid, and test datasets. Train is the CHAR_LIMIT-filtered MetaMathQA train split (tokenized with the response appended); valid is a seeded random subset of VALID_SIZE gsm8k train samples and test is the gsm8k test split (both tokenized without the response). + + We cannot use ds.train_test_split(..., stratify_by_column="type") as it gives: + + > ValueError: Stratifying by column is only supported for ClassLabel column, and column type is Value. + + even after calling ds_filtered.class_encode_column("type"). Thus, using sklearn's StratifiedKFold instead. NOTE(review): no stratification or sklearn call is visible in this function -- this paragraph looks stale; confirm.
+ """ + metamath = load_dataset("meta-math/MetaMathQA")["train"] + metamath = get_filtered_dataset(ds=metamath, print_fn=print_fn) + + # gsm8k does not need to be filtered as query and response are short enough + gsm8k = load_dataset("openai/gsm8k", "main") + gsm8k = gsm8k.rename_columns({"question": "query", "answer": "response"}) + gsm8k_train = gsm8k["train"] + gsm8k_test = gsm8k["test"] + + np.random.seed(0) # seeds NumPy's global RNG so the shuffled validation indices are reproducible + indices = np.arange(len(gsm8k_train)) + np.random.shuffle(indices) + idx_valid = indices[:VALID_SIZE] + + ds_train = metamath + ds_valid = gsm8k_train.select(idx_valid) + ds_test = gsm8k_test + + print_fn(f"Train size: {len(ds_train)}") + print_fn(f"Valid size: {len(ds_valid)}") + print_fn(f"Test size: {len(ds_test)}") + + tokenize_with_answer_ = partial(tokenize_with_answer, tokenizer=tokenizer, template=query_template) + tokenize_wo_answer_ = partial(tokenize_wo_answer, tokenizer=tokenizer, template=query_template) + ds_train = ds_train.map(tokenize_with_answer_, batched=True).remove_columns(["type", "query", "original_question"]) + ds_valid = ds_valid.map(tokenize_wo_answer_, batched=True).remove_columns(["query"]) + ds_test = ds_test.map(tokenize_wo_answer_, batched=True).remove_columns(["query"]) + + return ds_train, ds_valid, ds_test + + +def tokenize_with_answer(samples, tokenizer, template): # formats each query with the template, appends the response, tokenizes, and truncates to tokenizer.model_max_length + queries = [template.format(query=sample) + answer for sample, answer in zip(samples["query"], samples["response"])] + tokenized = tokenizer(queries) + tokenized["input_ids"] = [input_ids[: tokenizer.model_max_length] for input_ids in tokenized["input_ids"]] + tokenized["attention_mask"] = [ # the loop variable is named input_ids but iterates over the attention masks + input_ids[: tokenizer.model_max_length] for input_ids in tokenized["attention_mask"] + ] + return tokenized + + +def tokenize_wo_answer(samples, tokenizer, template): # same as tokenize_with_answer but the response is not appended (query only) + queries = [template.format(query=sample) for sample in samples["query"]] + tokenized = tokenizer(queries) + tokenized["input_ids"] = [input_ids[: tokenizer.model_max_length] for input_ids in
tokenized["input_ids"]] + tokenized["attention_mask"] = [ # the loop variable is named input_ids but iterates over the attention masks + input_ids[: tokenizer.model_max_length] for input_ids in tokenized["attention_mask"] + ] + return tokenized diff --git a/peft/method_comparison/MetaMathQA/default_training_params.json b/peft/method_comparison/MetaMathQA/default_training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..a200a41ed96409033011b7e4fc33e05fe9c61162 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/default_training_params.json @@ -0,0 +1,26 @@ +{ + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 1e-4, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "attn_implementation": null, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "query_template": "Question: {query} Think step by step.\nAnswer:" +} diff --git a/peft/method_comparison/MetaMathQA/experiments/adalora/llama-3.2-3B-rank32/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/adalora/llama-3.2-3B-rank32/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d20357b52d92ad65b3af6e932c9dd8d16b47bcb4 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/adalora/llama-3.2-3B-rank32/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "beta1": 0.85, + "beta2": 0.85, + "bias": "none", + "corda_config": null, + "deltaT": 1, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "init_r": 64, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, +
"lora_alpha": 8, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "orth_reg_weight": 0.5, + "peft_type": "ADALORA", + "r": 8, + "rank_pattern": null, + "revision": null, + "target_modules": null, + "target_r": 32, + "task_type": null, + "tfinal": 500, + "tinit": 200, + "total_step": 5000, + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..367bea4cf187d10c96a4d8b53f355bfd269a1e6a --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/adapter_config.json @@ -0,0 +1,11 @@ +{ + "adapter_layers": 28, + "adapter_len": 100, + "auto_mapping": null, + "base_model_name_or_path": null, + "inference_mode": false, + "peft_type": "ADAPTION_PROMPT", + "revision": null, + "target_modules": null, + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/training_params.json b/peft/method_comparison/MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..e8106a88d0de4099e2cbd2648abbe43bdebe6091 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/adaptionprompt/llama-3.2-3B-lr_0.0005/training_params.json @@ -0,0 +1,6 @@ +{ + "optimizer_kwargs": { + "lr": 5e-4 + } +} + diff --git a/peft/method_comparison/MetaMathQA/experiments/boft/llama-3.2-3B-default/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/boft/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..44d50893ff7f19c1851a2150e879c444ec134fe1 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/boft/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,20 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "boft_block_num": 0, + "boft_block_size": 4, + "boft_dropout": 0.0, + "boft_n_butterfly_factor": 1, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "peft_type": "BOFT", + "revision": null, + "target_modules": null, + "task_type": null +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/bone/llama-3.2-3B-bat/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/bone/llama-3.2-3B-bat/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cd69e4389edfe1738ceec6c42be177dd17d924c6 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/bone/llama-3.2-3B-bat/adapter_config.json @@ -0,0 +1,19 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "exclude_modules": null, + "inference_mode": false, + "init_weights": "bat", + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "peft_type": "BONE", + "r": 64, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": null +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/bone/llama-3.2-3B-default/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/bone/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..abc68802718821c659614b5fdeabb45db2df824b --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/bone/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,19 @@ +{ + "auto_mapping": null, + 
"base_model_name_or_path": null, + "bias": "none", + "exclude_modules": null, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "peft_type": "BONE", + "r": 64, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": null +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/c3a/llama-3.2-3B-default/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/c3a/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..170c4bb33e558339b07da8044fb3cb2093d2e4eb --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/c3a/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_weights": false, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "block_size": 64, + "block_size_pattern": {}, + "peft_type": "C3A", + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": null +} diff --git a/peft/method_comparison/MetaMathQA/experiments/c3a/llama-3.2-3B-default/training_params.json b/peft/method_comparison/MetaMathQA/experiments/c3a/llama-3.2-3B-default/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..a39b9dc8a825e9b79ace91032d0755835548eb44 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/c3a/llama-3.2-3B-default/training_params.json @@ -0,0 +1,6 @@ +{ + "optimizer_kwargs": { + "lr": 3e-1, + "weight_decay": 1e-5 + } +} diff --git a/peft/method_comparison/MetaMathQA/experiments/fourierft/llama-3.2-3B-default/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/fourierft/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..a2a379f07427f4c68eeaf06756004ddaa377f96b --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/fourierft/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,23 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_weights": false, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "n_frequency": 1000, + "n_frequency_pattern": {}, + "peft_type": "FOURIERFT", + "random_loc_seed": 777, + "revision": null, + "scaling": 300, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": null +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/fourierft/llama-3.2-3B-n_frequency-5000/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/fourierft/llama-3.2-3B-n_frequency-5000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..40d40246c48487419ea0d21eb369bea60c729496 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/fourierft/llama-3.2-3B-n_frequency-5000/adapter_config.json @@ -0,0 +1,23 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_weights": false, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "n_frequency": 5000, + "n_frequency_pattern": {}, + "peft_type": "FOURIERFT", + "random_loc_seed": 777, + "revision": null, + "scaling": 300, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": null +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/full-finetuning/llama-3.2-3B-lr_0.00001/training_params.json b/peft/method_comparison/MetaMathQA/experiments/full-finetuning/llama-3.2-3B-lr_0.00001/training_params.json new file mode 100644 index 
0000000000000000000000000000000000000000..6d6c3b0f9114a63d0739eef0c996f4c1c0c0e36c --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/full-finetuning/llama-3.2-3B-lr_0.00001/training_params.json @@ -0,0 +1,6 @@ +{ + "optimizer_kwargs": { + "lr": 1e-5 + } +} + diff --git a/peft/method_comparison/MetaMathQA/experiments/ia3/llama-3.2-3B-default/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/ia3/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0c8e514faa808ac0874e71f21bad7a576d15349d --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/ia3/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,14 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "feedforward_modules": null, + "inference_mode": false, + "init_ia3_weights": true, + "modules_to_save": null, + "peft_type": "IA3", + "revision": null, + "target_modules": null, + "task_type": null +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0c8e514faa808ac0874e71f21bad7a576d15349d --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/adapter_config.json @@ -0,0 +1,14 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "feedforward_modules": null, + "inference_mode": false, + "init_ia3_weights": true, + "modules_to_save": null, + "peft_type": "IA3", + "revision": null, + "target_modules": null, + "task_type": null +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/training_params.json 
b/peft/method_comparison/MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..8a120ad9a80c36dc3666f4da481a5292a7dc8072 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/ia3/llama-3.2-3B-lr_0.001/training_params.json @@ -0,0 +1,6 @@ +{ + "optimizer_kwargs": { + "lr": 1e-3 + } +} + diff --git a/peft/method_comparison/MetaMathQA/experiments/ln_tuning/llama-3.2-3B-default/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/ln_tuning/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..70b7363d3ac83e8ff2ee85634baafbee1f42b56a --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/ln_tuning/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,11 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "exclude_modules": null, + "inference_mode": false, + "modules_to_save": null, + "peft_type": "LN_TUNING", + "revision": null, + "target_modules": null, + "task_type": null +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/loha/llama-3.2-3B-rank32/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/loha/llama-3.2-3B-rank32/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1137259fa26b3abeac269c7bdda56dbeb29e34f7 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/loha/llama-3.2-3B-rank32/adapter_config.json @@ -0,0 +1,24 @@ +{ + "alpha": 64, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "exclude_modules": null, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOHA", + "r": 32, + "rank_dropout": 0.0, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": null, + 
"use_effective_conv2d": false +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/lokr/llama-3.2-3B-rank32/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/lokr/llama-3.2-3B-rank32/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7d30dd77a4c5f185fde99a5d60f381961ac7c522 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/lokr/llama-3.2-3B-rank32/adapter_config.json @@ -0,0 +1,27 @@ +{ + "alpha": 64, + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "decompose_both": false, + "decompose_factor": -1, + "exclude_modules": null, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "peft_type": "LOKR", + "r": 32, + "rank_dropout": 0.0, + "rank_dropout_scale": false, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": null, + "use_effective_conv2d": false +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-dora/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-dora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..255d09d2508a603fd8eea98152025c6cd8f0a789 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-dora/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0, + 
"megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": null, + "task_type": "CAUSAL_LM", + "use_dora": true, + "use_rslora": false +} diff --git a/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8832c108fac1825c52774517fd3e5bf0fc7d8d64 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": null, + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} diff --git a/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/training_params.json b/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..985db872405905c31c93a10aa9cd3f77ed223437 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32-lorafa/training_params.json @@ -0,0 +1,9 @@ +{ + "optimizer_type": "lora-fa", + "optimizer_kwargs": { + "r": 32, + "lora_alpha": 64, + "lr": 1e-4, + "weight_decay": 0.1 
+ } +} diff --git a/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8832c108fac1825c52774517fd3e5bf0fc7d8d64 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank32/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": null, + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} diff --git a/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank64-rslora/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank64-rslora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dc1f8039ab02888675a12a1a1a017ebdd196b9d4 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank64-rslora/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": 
false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": null, + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": true +} diff --git a/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank64/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank64/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..75890c9dce9fef14eee47ce19f3baa86d4d4168a --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/lora/llama-3.2-3B-rank64/adapter_config.json @@ -0,0 +1,30 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 128, + "lora_bias": false, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": null, + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} diff --git a/peft/method_comparison/MetaMathQA/experiments/miss/llama-3.2-3B-bat/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/miss/llama-3.2-3B-bat/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a3267da039aad22dcc2d70006dccc34c8759d359 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/miss/llama-3.2-3B-bat/adapter_config.json @@ -0,0 +1,18 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "exclude_modules": null, + "inference_mode": false, + 
"init_weights": "bat", + "layers_pattern": null, + "layers_to_transform": null, + "mini_r": 1, + "miss_dropout": 0.0, + "modules_to_save": null, + "peft_type": "MISS", + "r": 64, + "revision": null, + "target_modules": null, + "task_type": null +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/miss/llama-3.2-3B-default/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/miss/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..02ee9f74396d7372104f2adf9f462e1ac4653d93 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/miss/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,18 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "exclude_modules": null, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "mini_r": 1, + "miss_dropout": 0.0, + "modules_to_save": null, + "peft_type": "MISS", + "r": 64, + "revision": null, + "target_modules": null, + "task_type": null +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/miss/llama-3.2-3B-mini/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/miss/llama-3.2-3B-mini/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d732e181b21e0961029723bc640c25d5917d217e --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/miss/llama-3.2-3B-mini/adapter_config.json @@ -0,0 +1,18 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "exclude_modules": null, + "inference_mode": false, + "init_weights": "mini", + "layers_pattern": null, + "layers_to_transform": null, + "mini_r": 64, + "miss_dropout": 0.0, + "modules_to_save": null, + "peft_type": "MISS", + "r": 64, + "revision": null, + "target_modules": null, + "task_type": null +} \ No newline at end of file diff --git 
a/peft/method_comparison/MetaMathQA/experiments/oft/llama-3.2-3B-rank32/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/oft/llama-3.2-3B-rank32/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8cdb86ecd110a0176dd42b34e25b3c133cbab4a --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/oft/llama-3.2-3B-rank32/adapter_config.json @@ -0,0 +1,27 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "block_share": false, + "coft": false, + "eps": 6e-05, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "module_dropout": 0.0, + "modules_to_save": null, + "oft_block_size": 0, + "peft_type": "OFT", + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": null +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..77bff7fd16cd3f675655221218e69a55eaead91f --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/adapter_config.json @@ -0,0 +1,15 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "encoder_hidden_size": 3072, + "inference_mode": false, + "num_attention_heads": 24, + "num_layers": 28, + "num_transformer_submodules": 1, + "num_virtual_tokens": 200, + "peft_type": "PREFIX_TUNING", + "prefix_projection": false, + "revision": null, + "task_type": "CAUSAL_LM", + "token_dim": 3072 +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/training_params.json 
b/peft/method_comparison/MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..8a120ad9a80c36dc3666f4da481a5292a7dc8072 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/prefixtuning/llama-3.2-3B-lr_0.001/training_params.json @@ -0,0 +1,6 @@ +{ + "optimizer_kwargs": { + "lr": 1e-3 + } +} + diff --git a/peft/method_comparison/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-default/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..efa055b03d6f3a6c6d0f7df76f11550891919b0a --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,17 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "inference_mode": false, + "num_attention_heads": 24, + "num_layers": 28, + "num_transformer_submodules": 1, + "num_virtual_tokens": 200, + "peft_type": "PROMPT_TUNING", + "prompt_tuning_init": "RANDOM", + "prompt_tuning_init_text": null, + "revision": null, + "task_type": "CAUSAL_LM", + "token_dim": 3072, + "tokenizer_kwargs": null, + "tokenizer_name_or_path": null +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..efa055b03d6f3a6c6d0f7df76f11550891919b0a --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/adapter_config.json @@ -0,0 +1,17 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "inference_mode": false, + "num_attention_heads": 24, + "num_layers": 28, + "num_transformer_submodules": 1, + "num_virtual_tokens": 200, 
+ "peft_type": "PROMPT_TUNING", + "prompt_tuning_init": "RANDOM", + "prompt_tuning_init_text": null, + "revision": null, + "task_type": "CAUSAL_LM", + "token_dim": 3072, + "tokenizer_kwargs": null, + "tokenizer_name_or_path": null +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/training_params.json b/peft/method_comparison/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..8a120ad9a80c36dc3666f4da481a5292a7dc8072 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-lr_0.001/training_params.json @@ -0,0 +1,6 @@ +{ + "optimizer_kwargs": { + "lr": 1e-3 + } +} + diff --git a/peft/method_comparison/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-sample_vocab-lr_0.001/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-sample_vocab-lr_0.001/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8b9f14279798195477b8a188feb12dfcd9605e00 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-sample_vocab-lr_0.001/adapter_config.json @@ -0,0 +1,17 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "inference_mode": false, + "num_attention_heads": 24, + "num_layers": 28, + "num_transformer_submodules": 1, + "num_virtual_tokens": 200, + "peft_type": "PROMPT_TUNING", + "prompt_tuning_init": "SAMPLE_VOCAB", + "prompt_tuning_init_text": null, + "revision": null, + "task_type": "CAUSAL_LM", + "token_dim": 3072, + "tokenizer_kwargs": null, + "tokenizer_name_or_path": null +} diff --git a/peft/method_comparison/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-sample_vocab-lr_0.001/training_params.json b/peft/method_comparison/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-sample_vocab-lr_0.001/training_params.json new file mode 100644 
index 0000000000000000000000000000000000000000..8a120ad9a80c36dc3666f4da481a5292a7dc8072 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/prompt_tuning/llama-3.2-3B-sample_vocab-lr_0.001/training_params.json @@ -0,0 +1,6 @@ +{ + "optimizer_kwargs": { + "lr": 1e-3 + } +} + diff --git a/peft/method_comparison/MetaMathQA/experiments/ptuning/llama-3.2-3B-default/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/ptuning/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..54469edf776f3de255d054c317887b1312aa7791 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/ptuning/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,17 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "encoder_dropout": 0.0, + "encoder_hidden_size": 3072, + "encoder_num_layers": 2, + "encoder_reparameterization_type": "MLP", + "inference_mode": false, + "num_attention_heads": 24, + "num_layers": 28, + "num_transformer_submodules": 1, + "num_virtual_tokens": 20, + "peft_type": "P_TUNING", + "revision": null, + "task_type": "CAUSAL_LM", + "token_dim": 3072 +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/randlora/llama-3.2-3B-default/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/randlora/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3dbdfaa6b123a057774dc5c46d86bfe4d4e35b55 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/randlora/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,22 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "peft_type": "RANDLORA", + "projection_prng_key": 0, + "r": 32, + "randlora_alpha": 640, + "randlora_dropout": 
0.0, + "revision": null, + "save_projection": true, + "sparse": false, + "target_modules": null, + "task_type": null, + "very_sparse": false +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/road/llama-3.2-3B-lr_0.001/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/road/llama-3.2-3B-lr_0.001/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d0f74c40761246f57831661e4a86dae1d28399d8 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/road/llama-3.2-3B-lr_0.001/adapter_config.json @@ -0,0 +1,12 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "group_size": 64, + "inference_mode": false, + "init_weights": true, + "peft_type": "ROAD", + "revision": null, + "target_modules": null, + "task_type": null, + "variant": "road_2" +} diff --git a/peft/method_comparison/MetaMathQA/experiments/road/llama-3.2-3B-lr_0.001/training_params.json b/peft/method_comparison/MetaMathQA/experiments/road/llama-3.2-3B-lr_0.001/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..52d87e3ef6d143c29d2ba640028909a31befffa6 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/road/llama-3.2-3B-lr_0.001/training_params.json @@ -0,0 +1,5 @@ +{ + "optimizer_kwargs": { + "lr": 1e-3 + } +} diff --git a/peft/method_comparison/MetaMathQA/experiments/shira/llama-3.2-3B-lr_0.0003-random_seed_42/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/shira/llama-3.2-3B-lr_0.0003-random_seed_42/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0d4565eef10746dd8c02364f737fb7a7143955d0 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/shira/llama-3.2-3B-lr_0.0003-random_seed_42/adapter_config.json @@ -0,0 +1,15 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_weights": true, + "mask_type": 
"random", + "modules_to_save": null, + "peft_type": "SHIRA", + "r": 32, + "random_seed": 42, + "revision": null, + "target_modules": null, + "task_type": null +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/shira/llama-3.2-3B-lr_0.0003-random_seed_42/training_params.json b/peft/method_comparison/MetaMathQA/experiments/shira/llama-3.2-3B-lr_0.0003-random_seed_42/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..92f18b8a3c5adb57d1d25314d50f1c8df85eb570 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/shira/llama-3.2-3B-lr_0.0003-random_seed_42/training_params.json @@ -0,0 +1,6 @@ +{ + "optimizer_kwargs": { + "lr": 3e-4 + } +} + diff --git a/peft/method_comparison/MetaMathQA/experiments/trainable_tokens/llama-3.2-3B-sos+eos/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/trainable_tokens/llama-3.2-3B-sos+eos/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bce0cd5b129fe4281eb46c7d17f05bb0fca3935d --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/trainable_tokens/llama-3.2-3B-sos+eos/adapter_config.json @@ -0,0 +1,7 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "peft_type": "TRAINABLE_TOKENS", + "token_indices": [128000, 128001], + "task_type": "CAUSAL_LM" +} diff --git a/peft/method_comparison/MetaMathQA/experiments/trainable_tokens/llama-3.2-3B-sos+eos/training_params.json b/peft/method_comparison/MetaMathQA/experiments/trainable_tokens/llama-3.2-3B-sos+eos/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..92e546e6cfeadce2db2ab6dbd124790a4fb0dbf4 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/trainable_tokens/llama-3.2-3B-sos+eos/training_params.json @@ -0,0 +1,5 @@ +{ + "optimizer_kwargs": { + "lr": 0.2 + } +} diff --git a/peft/method_comparison/MetaMathQA/experiments/vblora/llama-3.2-3B-default/adapter_config.json 
b/peft/method_comparison/MetaMathQA/experiments/vblora/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b6cbc59e57c07e6b883ff34ed98090d51916d652 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/vblora/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_logits_std": 0.1, + "init_vector_bank_bound": 0.02, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "num_vectors": 256, + "peft_type": "VBLORA", + "r": 4, + "revision": null, + "save_only_topk_weights": false, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": null, + "topk": 2, + "vblora_dropout": 0.0, + "vector_length": 256 +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/vera/llama-3.2-3B-default/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/vera/llama-3.2-3B-default/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f4962c1b4fa1266ba29f31559fa3260483d8fac7 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/vera/llama-3.2-3B-default/adapter_config.json @@ -0,0 +1,20 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "d_initial": 0.1, + "fan_in_fan_out": false, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "peft_type": "VERA", + "projection_prng_key": 0, + "r": 256, + "revision": null, + "save_projection": true, + "target_modules": null, + "task_type": null, + "vera_dropout": 0.0 +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/experiments/vera/llama-3.2-3B-default/training_params.json 
b/peft/method_comparison/MetaMathQA/experiments/vera/llama-3.2-3B-default/training_params.json new file mode 100644 index 0000000000000000000000000000000000000000..8a120ad9a80c36dc3666f4da481a5292a7dc8072 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/vera/llama-3.2-3B-default/training_params.json @@ -0,0 +1,6 @@ +{ + "optimizer_kwargs": { + "lr": 1e-3 + } +} + diff --git a/peft/method_comparison/MetaMathQA/experiments/waveft/llama-3.2-3B-n_frequency-5000/adapter_config.json b/peft/method_comparison/MetaMathQA/experiments/waveft/llama-3.2-3B-n_frequency-5000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6aad104d0f1dec10bf97eee02c9d22b2b49dbeb2 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/experiments/waveft/llama-3.2-3B-n_frequency-5000/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": null, + "bias": "none", + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": false, + "init_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "modules_to_save": null, + "n_frequency": 5000, + "n_frequency_pattern": {}, + "peft_type": "WAVEFT", + "proportional_parameters": false, + "random_loc_seed": 777, + "revision": null, + "scaling": 25.0, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_idwt": true, + "wavelet_family": "db1" +} diff --git a/peft/method_comparison/MetaMathQA/requirements.txt b/peft/method_comparison/MetaMathQA/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee25a1ced129a9c13938c22922aa0514230af60b --- /dev/null +++ b/peft/method_comparison/MetaMathQA/requirements.txt @@ -0,0 +1,4 @@ +bitsandbytes +datasets +numpy +tqdm diff --git a/peft/method_comparison/MetaMathQA/results/.gitkeep b/peft/method_comparison/MetaMathQA/results/.gitkeep new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/peft/method_comparison/MetaMathQA/results/adalora--llama-3.2-3B-rank32.json b/peft/method_comparison/MetaMathQA/results/adalora--llama-3.2-3B-rank32.json new file mode 100644 index 0000000000000000000000000000000000000000..8b38ea15e8a213a05e1872f928958e601ce60370 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/adalora--llama-3.2-3B-rank32.json @@ -0,0 +1,4071 @@ +{ + "run_info": { + "created_at": "2025-06-19T23:12:19+00:00", + "total_time": 2209.243281380004, + "experiment_name": "adalora/llama-3.2-3B-rank32", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "ADALORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "lora_alpha": 8, + "lora_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "use_rslora": false, + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": { + "model.layers.0.self_attn.q_proj.lora_E": [ + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + 
false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false + ], + "model.layers.0.self_attn.v_proj.lora_E": [ + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true + ], + "model.layers.1.self_attn.q_proj.lora_E": [ + false, + false, + true, + true, + false, + true, + true, + false, + false, + false, + false, + true, + false, + false, + true, + true, + true, + true, + false, + false, + false, + false, + false, + false, + true, + false, + true, + true, + false, + false, + true, + true, + true, + false, + true, + true, + false, + false, + true, + true, + true, + false, + false, + false, + true, + false, + true, + true, + true, + true, + false, + true, + true, + true, + false, + false, + true, + true, + false, + false, + true, + true, + false, + false + ], + "model.layers.1.self_attn.v_proj.lora_E": [ + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, 
+ true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true + ], + "model.layers.2.self_attn.q_proj.lora_E": [ + true, + false, + true, + false, + false, + false, + true, + true, + true, + true, + false, + true, + true, + true, + false, + false, + true, + false, + false, + true, + false, + false, + false, + false, + true, + true, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + true, + true, + true, + false, + false, + false, + true, + true, + false, + false, + false, + true, + false, + true, + true, + false, + true, + false, + false, + false, + true, + true, + false, + true, + true, + false, + false + ], + "model.layers.2.self_attn.v_proj.lora_E": [ + true, + false, + false, + false, + true, + true, + true, + true, + false, + true, + true, + true, + false, + true, + false, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + false, + true, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + true, + true, + false, + false, + true, + true, + false, + true, + true, + true, + true, + false, + false, + true, + false, + true, + false, + false, + false, + true, + true, + false, + false, + false, + true, + true + ], + "model.layers.3.self_attn.q_proj.lora_E": [ + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false + ], + 
"model.layers.3.self_attn.v_proj.lora_E": [ + false, + false, + false, + false, + false, + true, + false, + true, + false, + false, + false, + false, + true, + false, + false, + false, + false, + true, + true, + true, + true, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + true, + false, + false, + true, + false, + true, + false, + true, + true, + false, + true, + false, + false, + true, + false, + false, + false, + false, + false, + true, + false, + true, + false, + false, + false, + false, + true, + true, + true + ], + "model.layers.4.self_attn.q_proj.lora_E": [ + false, + false, + false, + false, + false, + true, + false, + false, + true, + true, + false, + true, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + true, + false, + true, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + true, + true, + false, + false, + true, + false, + false, + false, + true, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + true, + false, + false + ], + "model.layers.4.self_attn.v_proj.lora_E": [ + true, + false, + true, + true, + false, + false, + true, + false, + false, + false, + true, + false, + true, + true, + false, + true, + false, + true, + true, + false, + true, + true, + false, + false, + true, + true, + true, + true, + false, + false, + false, + false, + false, + false, + true, + false, + true, + false, + false, + true, + true, + true, + true, + true, + false, + false, + false, + false, + false, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + true, + true + ], + "model.layers.5.self_attn.q_proj.lora_E": [ + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + 
false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false + ], + "model.layers.5.self_attn.v_proj.lora_E": [ + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + false, + false, + true, + false, + false, + true, + false, + true, + false, + false, + false, + false, + true, + true, + false, + false, + false, + false, + true, + false, + true, + false, + true, + true, + false, + false, + true, + true, + true, + true, + false, + false, + true, + false, + true, + false, + false, + true, + true, + true, + false, + true, + false, + false, + false, + true, + true, + true, + true, + false, + false, + false, + true, + true + ], + "model.layers.6.self_attn.q_proj.lora_E": [ + false, + false, + true, + true, + false, + false, + true, + true, + false, + false, + false, + true, + false, + true, + false, + true, + false, + false, + false, + false, + true, + true, + true, + true, + false, + true, + false, + true, + false, + true, + false, + false, + false, + true, + true, + false, + false, + false, + false, + true, + true, + true, + false, + false, + true, + false, + false, + false, + false, + true, + true, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false + ], + "model.layers.6.self_attn.v_proj.lora_E": [ + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + false, + false, + false, + false, + true, + true, + false, + false, + false, + false, + true, + true, + false, + true, + true, + true, + false, + true, + true, 
+ true, + false, + true, + true, + true, + true, + false, + false, + false, + true, + true, + false, + false, + true, + false, + true, + false, + true, + true, + false, + true, + false, + true, + false, + true + ], + "model.layers.7.self_attn.q_proj.lora_E": [ + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false + ], + "model.layers.7.self_attn.v_proj.lora_E": [ + false, + false, + true, + true, + false, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + true, + false, + true, + false, + false, + true, + true, + true, + true, + false, + false, + false, + true, + false, + false, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + false, + false, + false, + true, + true, + true, + true + ], + "model.layers.8.self_attn.q_proj.lora_E": [ + false, + true, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + true, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + true, + false, + false, + true, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + 
false, + false, + false, + true + ], + "model.layers.8.self_attn.v_proj.lora_E": [ + false, + true, + false, + false, + false, + true, + false, + false, + false, + false, + true, + true, + true, + true, + true, + false, + false, + true, + true, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + false, + false, + true, + true, + false, + false, + true, + false, + true, + false, + true, + true, + false, + true, + false, + true, + true, + true, + false, + true, + false, + false, + true, + true, + true, + false, + true, + true, + true, + true, + false, + false, + false, + false, + true + ], + "model.layers.9.self_attn.q_proj.lora_E": [ + true, + false, + true, + true, + false, + false, + true, + true, + false, + false, + true, + false, + false, + false, + false, + true, + false, + true, + false, + true, + false, + false, + false, + true, + false, + true, + false, + true, + false, + true, + false, + true, + true, + false, + false, + true, + true, + false, + false, + false, + false, + true, + true, + true, + false, + false, + false, + false, + true, + true, + true, + false, + false, + false, + false, + false, + true, + false, + true, + false, + false, + true, + false, + true + ], + "model.layers.9.self_attn.v_proj.lora_E": [ + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false + ], + "model.layers.10.self_attn.q_proj.lora_E": [ + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, 
+ false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false + ], + "model.layers.10.self_attn.v_proj.lora_E": [ + true, + true, + false, + false, + false, + true, + true, + false, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + false, + false, + true, + true, + true, + true, + false, + false, + false, + false, + true, + false, + false, + false, + true, + true, + true, + false, + true, + true, + false, + true, + false, + false, + true, + true, + false, + false, + true, + true, + true, + true, + true, + true, + true, + false, + true + ], + "model.layers.11.self_attn.q_proj.lora_E": [ + true, + false, + false, + false, + false, + true, + false, + false, + false, + true, + true, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + true, + true, + true, + false, + true, + false, + false, + false, + true, + true, + false, + false, + false, + false, + true, + true, + false, + true, + false, + true, + true, + false, + false, + false + ], + "model.layers.11.self_attn.v_proj.lora_E": [ + false, + true, + true, + true, + false, + true, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + false, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + 
true, + false, + false, + false, + false, + false, + true, + false, + false, + true, + true, + false, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + false, + false + ], + "model.layers.12.self_attn.q_proj.lora_E": [ + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + true, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + true, + true, + false, + true, + false, + false, + false, + false, + true, + false, + true, + false, + false, + true, + false, + true, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false + ], + "model.layers.12.self_attn.v_proj.lora_E": [ + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + false, + false, + true, + true, + false, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + false, + true, + true, + true, + true, + true, + false, + false, + true, + false, + true, + true, + true, + true, + false, + true, + false, + true, + false, + true, + false, + true, + false, + true, + true, + false + ], + "model.layers.13.self_attn.q_proj.lora_E": [ + true, + true, + false, + true, + true, + true, + false, + false, + true, + true, + false, + true, + false, + true, + false, + true, + false, + false, + true, + true, + false, + true, + false, + true, + true, + true, + true, + false, + false, + true, + true, + false, + false, + true, + false, + true, + false, + true, + true, + true, + false, + false, + false, + false, + true, + true, + true, + true, + false, + true, + false, + true, + true, + true, + false, + true, + false, + true, + true, + false, + false, + false, + true, + false + ], + 
"model.layers.13.self_attn.v_proj.lora_E": [ + true, + false, + true, + true, + true, + false, + true, + true, + true, + false, + true, + true, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + false, + false, + true, + true, + false, + false, + true, + false, + false, + true, + false, + false, + true, + true, + true, + true, + true, + true, + true, + false, + true, + false, + false, + true, + true, + true, + false, + true, + true, + false, + true, + true, + true, + true + ], + "model.layers.14.self_attn.q_proj.lora_E": [ + false, + true, + false, + true, + true, + false, + false, + false, + true, + false, + false, + true, + false, + false, + true, + true, + false, + true, + true, + true, + false, + false, + false, + true, + false, + true, + false, + true, + false, + false, + true, + true, + true, + true, + true, + false, + false, + true, + true, + false, + true, + true, + false, + false, + true, + false, + false, + false, + true, + false, + true, + true, + true, + false, + true, + true, + true, + false, + false, + true, + false, + true, + true, + false + ], + "model.layers.14.self_attn.v_proj.lora_E": [ + true, + true, + true, + false, + false, + false, + true, + false, + false, + false, + false, + true, + true, + false, + false, + true, + false, + true, + true, + true, + false, + true, + false, + false, + true, + false, + true, + false, + true, + true, + false, + true, + false, + true, + false, + false, + true, + false, + false, + true, + false, + true, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + false, + true, + true, + true, + true, + false + ], + "model.layers.15.self_attn.q_proj.lora_E": [ + false, + true, + true, + true, + true, + true, + false, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + false, + true, + true, + 
false, + true, + false, + true, + false, + true, + true, + true, + false, + true, + false, + false, + false, + true, + true, + true, + true, + false, + true, + true, + false, + true, + false, + true, + false, + false, + true, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true + ], + "model.layers.15.self_attn.v_proj.lora_E": [ + true, + true, + true, + true, + false, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + false, + false, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + false, + false, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + false, + false, + false, + true, + true, + true, + true, + true, + true, + true + ], + "model.layers.16.self_attn.q_proj.lora_E": [ + false, + false, + false, + false, + false, + false, + false, + true, + true, + true, + true, + false, + true, + false, + true, + true, + true, + false, + true, + false, + true, + true, + true, + false, + false, + false, + true, + false, + false, + false, + true, + false, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + false, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + false, + true, + false, + false, + true, + true, + true, + false, + false, + true, + true + ], + "model.layers.16.self_attn.v_proj.lora_E": [ + true, + false, + true, + false, + true, + false, + true, + false, + false, + true, + false, + true, + true, + true, + true, + true, + true, + false, + true, + false, + true, + true, + true, + false, + true, + true, + false, + false, + false, + true, + true, + true, + false, + true, + false, + false, + true, + false, + false, + false, + true, + false, + true, + false, + true, + true, + true, + false, + true, + true, + false, 
+ false, + true, + true, + true, + false, + true, + true, + false, + true, + false, + true, + false, + false + ], + "model.layers.17.self_attn.q_proj.lora_E": [ + true, + true, + true, + true, + false, + true, + false, + true, + false, + false, + true, + true, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + false, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + true, + false, + true, + false, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true + ], + "model.layers.17.self_attn.v_proj.lora_E": [ + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + false, + false, + false, + true, + true, + false, + true, + true, + false, + false, + true, + true, + false, + false, + false, + false, + true, + true, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true + ], + "model.layers.18.self_attn.q_proj.lora_E": [ + false, + true, + false, + true, + false, + true, + false, + true, + true, + true, + false, + true, + true, + true, + false, + true, + true, + false, + true, + false, + false, + false, + true, + true, + false, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + true, + false, + true, + false, + false, + false, + false, + true, + false, + false, + true, + false, + true, + false, + true, + true + ], + "model.layers.18.self_attn.v_proj.lora_E": [ + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + 
false, + false, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + false, + true + ], + "model.layers.19.self_attn.q_proj.lora_E": [ + false, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + false, + true, + false, + true, + false, + true, + false, + false, + false, + true, + true, + true, + true, + true, + false, + false, + false, + false, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + false, + true, + true + ], + "model.layers.19.self_attn.v_proj.lora_E": [ + false, + true, + false, + true, + true, + true, + false, + true, + false, + false, + false, + false, + true, + true, + true, + false, + true, + false, + false, + false, + false, + true, + false, + true, + true, + false, + true, + true, + false, + false, + true, + true, + true, + true, + false, + false, + false, + false, + false, + true, + false, + false, + true, + false, + false, + false, + true, + true, + false, + false, + false, + true, + true, + true, + true, + false, + true, + true, + false, + true, + false, + true, + true, + true + ], + "model.layers.20.self_attn.q_proj.lora_E": [ + false, + true, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + true, + false, + true, + true, + false, + 
false, + false, + false, + true, + false, + true, + false, + true, + false, + false, + false, + false, + true, + true, + false, + false, + true, + true, + false, + false, + false, + false, + false, + true, + false, + false + ], + "model.layers.20.self_attn.v_proj.lora_E": [ + true, + false, + true, + true, + false, + false, + false, + true, + true, + false, + false, + true, + true, + true, + false, + true, + false, + true, + false, + false, + false, + false, + true, + false, + false, + false, + true, + false, + true, + false, + true, + true, + true, + false, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + true, + false, + false, + true, + false, + false, + true, + true + ], + "model.layers.21.self_attn.q_proj.lora_E": [ + false, + false, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + false, + false, + false, + true, + true, + false, + true, + true, + true, + true, + false, + false, + false, + false, + false, + false, + false, + true, + true, + true, + false, + true, + false, + true, + false, + true, + false, + false, + false, + true, + false, + true, + true, + true, + true, + true, + false, + false, + true, + true, + false, + true, + true, + false, + false, + true, + true + ], + "model.layers.21.self_attn.v_proj.lora_E": [ + true, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + false, + false, + true, + true, + false, + true, + true, + false, + true, + true, + true, + true, + false, + false, + false, + false, + false, + true, + true, + true, + false, + true, + true, + false, + true, + true, + false, + true, + true, + false, + false, + false, + true, + true, + false, + false, + false, + true, + true, + false, + true, + true, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, 
+ false + ], + "model.layers.22.self_attn.q_proj.lora_E": [ + false, + true, + false, + true, + true, + false, + true, + false, + false, + true, + false, + false, + false, + false, + false, + false, + true, + false, + true, + true, + false, + false, + false, + false, + true, + true, + true, + false, + false, + true, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + true, + false, + false, + false, + true, + false, + true, + false, + false, + false, + true, + false, + true, + true, + true, + false, + false, + true, + false, + false, + true, + true, + false, + true + ], + "model.layers.22.self_attn.v_proj.lora_E": [ + false, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + false, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + false, + true, + false, + false, + true, + false, + false, + false, + false, + true, + false, + false, + false, + true, + true, + false, + false, + false, + true, + true, + false, + false, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + false, + true, + true + ], + "model.layers.23.self_attn.q_proj.lora_E": [ + true, + false, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + false, + true, + false, + true, + true, + true, + false, + true, + true + ], + "model.layers.23.self_attn.v_proj.lora_E": [ + false, + true, + false, + true, + false, + false, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + 
true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + false, + true, + false, + true, + true, + false, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + false, + false, + false, + true, + true, + false, + true, + false, + true, + true + ], + "model.layers.24.self_attn.q_proj.lora_E": [ + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + true, + false, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + false, + true, + true, + true, + true, + true, + true, + true, + true, + true, + false, + true, + true, + true, + false, + true, + true, + true, + true, + false, + true, + false, + true, + true, + true, + true, + false, + false, + false, + true, + true, + true, + true, + false, + false, + true + ], + "model.layers.24.self_attn.v_proj.lora_E": [ + true, + true, + true, + false, + true, + false, + false, + true, + true, + true, + false, + true, + true, + false, + false, + true, + false, + false, + false, + false, + true, + true, + true, + false, + true, + false, + false, + true, + false, + true, + false, + true, + true, + false, + true, + true, + false, + false, + false, + true, + false, + false, + true, + true, + false, + true, + true, + false, + false, + true, + true, + true, + true, + false, + false, + true, + true, + true, + false, + true, + false, + true, + true, + true + ], + "model.layers.25.self_attn.q_proj.lora_E": [ + false, + false, + false, + false, + true, + true, + false, + true, + true, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + true, + false, + false, + true, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + 
false, + false, + false, + true, + false, + false, + false, + true, + true, + true, + false, + false, + false, + false, + false, + false, + false + ], + "model.layers.25.self_attn.v_proj.lora_E": [ + false, + false, + false, + true, + false, + false, + false, + true, + true, + false, + false, + true, + false, + true, + true, + true, + false, + false, + false, + false, + true, + false, + false, + false, + true, + true, + true, + true, + false, + false, + false, + false, + true, + false, + false, + false, + false, + true, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false + ], + "model.layers.26.self_attn.q_proj.lora_E": [ + true, + false, + false, + true, + false, + false, + false, + false, + false, + false, + true, + false, + true, + false, + true, + true, + true, + false, + false, + true, + true, + true, + false, + false, + true, + true, + false, + false, + true, + false, + true, + true, + false, + false, + false, + true, + true, + false, + false, + false, + true, + false, + false, + false, + true, + true, + false, + false, + true, + false, + true, + true, + false, + true, + false, + false, + true, + true, + true, + false, + true, + true, + true, + true + ], + "model.layers.26.self_attn.v_proj.lora_E": [ + false, + false, + true, + false, + true, + false, + false, + false, + true, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + true, + false, + true, + false, + true, + true, + false, + false, + false, + false, + false, + false, + false, + false, + false, + true, + false, + false, + false, + false, + false, + false, + false, + true, + true, + false, + false, + false, + true, + false, + true, + false, + true, + false, + false, + false, + true, + false, + false + ], + 
"model.layers.27.self_attn.q_proj.lora_E": [ + true, + false, + false, + true, + true, + false, + false, + true, + true, + false, + false, + false, + true, + true, + false, + true, + false, + false, + true, + false, + false, + true, + true, + true, + true, + false, + false, + true, + true, + false, + false, + false, + false, + true, + true, + true, + false, + true, + false, + false, + false, + true, + false, + true, + true, + true, + false, + false, + false, + true, + true, + true, + true, + true, + false, + false, + false, + false, + true, + false, + false, + false, + true, + false + ], + "model.layers.27.self_attn.v_proj.lora_E": [ + false, + false, + true, + true, + true, + true, + true, + true, + true, + false, + false, + false, + true, + false, + false, + false, + true, + true, + false, + false, + false, + true, + false, + true, + true, + true, + true, + true, + false, + true, + true, + false, + true, + false, + true, + true, + false, + true, + true, + false, + false, + true, + false, + true, + true, + false, + false, + true, + false, + true, + true, + true, + false, + false, + true, + false, + false, + true, + true, + true, + true, + true, + false, + true + ] + }, + "alpha_pattern": {}, + "megatron_config": null, + "megatron_core": "megatron.core", + "trainable_token_indices": null, + "loftq_config": {}, + "eva_config": null, + "corda_config": null, + "use_dora": false, + "layer_replication": null, + "lora_bias": false, + "target_r": 32, + "init_r": 64, + "tinit": 200, + "tfinal": 500, + "deltaT": 1, + "beta1": 0.85, + "beta2": 0.85, + "orth_reg_weight": 0.5, + "total_step": 5000 + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 12361399900, + "accelerator_memory_max": 22793945088, + "accelerator_memory_reserved_99th": 18203426160, + "train_time": 1986.3603882369862, + "file_size": 35147440, + "num_trainable_params": 18353664, + "num_total_params": 3231103544, + "status": "success", + "metrics": [ + { + "step": 250, + "valid 
accuracy": 0.0, + "train loss": 1.3241184422969818, + "train samples": 1000, + "train time": 35.95594502204767, + "eval time": 11.413120707002236, + "tokens / sec": 5888.289123542072, + "mem allocated avg": 7292959393.792, + "mem reserved avg": 12441731727.36, + "elapsed time": 100.98083375500573 + }, + { + "step": 500, + "valid accuracy": 0.38, + "train loss": 1.0195633232593537, + "train samples": 2000, + "train time": 37.64258231502754, + "eval time": 11.37802824100072, + "tokens / sec": 5525.524212428035, + "mem allocated avg": 7285510731.776, + "mem reserved avg": 12328493907.968, + "elapsed time": 197.93603045200143 + }, + { + "step": 750, + "valid accuracy": 0.28, + "train loss": 0.7883218789100647, + "train samples": 3000, + "train time": 37.909325722001086, + "eval time": 11.385932488003164, + "tokens / sec": 5655.626838954038, + "mem allocated avg": 7296095842.304, + "mem reserved avg": 12484438130.688, + "elapsed time": 295.9188707240028 + }, + { + "step": 1000, + "valid accuracy": 0.3, + "train loss": 0.7408825470209122, + "train samples": 4000, + "train time": 37.79932949803333, + "eval time": 11.34964040399791, + "tokens / sec": 5511.6321576772825, + "mem allocated avg": 7286506670.08, + "mem reserved avg": 12351948455.936, + "elapsed time": 393.33776786700037 + }, + { + "step": 1250, + "valid accuracy": 0.36, + "train loss": 0.7282904219627381, + "train samples": 5000, + "train time": 37.475317073069164, + "eval time": 11.342822429993248, + "tokens / sec": 5564.676066473135, + "mem allocated avg": 7287005519.872, + "mem reserved avg": 12349910024.192, + "elapsed time": 490.5430299360014 + }, + { + "step": 1500, + "valid accuracy": 0.38, + "train loss": 0.7161256531476975, + "train samples": 6000, + "train time": 37.660518338059774, + "eval time": 11.34013032400253, + "tokens / sec": 5558.367469107556, + "mem allocated avg": 7287642494.976, + "mem reserved avg": 12380570386.432, + "elapsed time": 588.017992052999 + }, + { + "step": 1750, + "valid 
accuracy": 0.34, + "train loss": 0.7056601424217224, + "train samples": 7000, + "train time": 37.636171496975294, + "eval time": 11.3171367870018, + "tokens / sec": 5562.600861695649, + "mem allocated avg": 7289782888.448, + "mem reserved avg": 12389051269.12, + "elapsed time": 685.2421731229988 + }, + { + "step": 2000, + "valid accuracy": 0.34, + "train loss": 0.7058932571411133, + "train samples": 8000, + "train time": 37.505602380944765, + "eval time": 11.37751964799827, + "tokens / sec": 5537.732680318789, + "mem allocated avg": 7287054886.912, + "mem reserved avg": 12336119152.64, + "elapsed time": 782.1823508529997 + }, + { + "step": 2250, + "valid accuracy": 0.3, + "train loss": 0.700018577337265, + "train samples": 9000, + "train time": 38.06487834800646, + "eval time": 11.33160761000181, + "tokens / sec": 5646.885247730137, + "mem allocated avg": 7297638139.904, + "mem reserved avg": 12521129902.08, + "elapsed time": 880.444039299 + }, + { + "step": 2500, + "valid accuracy": 0.34, + "train loss": 0.6984639673233032, + "train samples": 10000, + "train time": 37.400825600088865, + "eval time": 7.680036880999978, + "tokens / sec": 5507.017470745635, + "mem allocated avg": 7283608303.616, + "mem reserved avg": 12278598467.584, + "elapsed time": 973.4031999860017 + }, + { + "step": 2750, + "valid accuracy": 0.32, + "train loss": 0.691307947397232, + "train samples": 11000, + "train time": 37.97861938195274, + "eval time": 11.376824188999308, + "tokens / sec": 5578.954776346737, + "mem allocated avg": 7293332232.192, + "mem reserved avg": 12452821467.136, + "elapsed time": 1071.2981272770048 + }, + { + "step": 3000, + "valid accuracy": 0.3, + "train loss": 0.6851879090070725, + "train samples": 12000, + "train time": 37.862704559986014, + "eval time": 11.377599911000289, + "tokens / sec": 5512.839149387935, + "mem allocated avg": 7288929478.656, + "mem reserved avg": 12371468746.752, + "elapsed time": 1168.7257358770003 + }, + { + "step": 3250, + "valid 
accuracy": 0.34, + "train loss": 0.6939580011367797, + "train samples": 13000, + "train time": 37.79518606400961, + "eval time": 7.2029460159974406, + "tokens / sec": 5580.102176050141, + "mem allocated avg": 7290687285.248, + "mem reserved avg": 12403068633.088, + "elapsed time": 1261.9857917680056 + }, + { + "step": 3500, + "valid accuracy": 0.4, + "train loss": 0.6825792235136032, + "train samples": 14000, + "train time": 37.73422463506722, + "eval time": 11.28984081800445, + "tokens / sec": 5558.614282617983, + "mem allocated avg": 7289277476.864, + "mem reserved avg": 12381820289.024, + "elapsed time": 1359.695578400002 + }, + { + "step": 3750, + "valid accuracy": 0.34, + "train loss": 0.6795008780956269, + "train samples": 15000, + "train time": 38.156728624038806, + "eval time": 11.362600938999094, + "tokens / sec": 5679.286663570962, + "mem allocated avg": 7299185600.512, + "mem reserved avg": 12562561236.992, + "elapsed time": 1458.6053942910003 + }, + { + "step": 4000, + "valid accuracy": 0.32, + "train loss": 0.6967895623445511, + "train samples": 16000, + "train time": 37.352128309052205, + "eval time": 11.363241717001074, + "tokens / sec": 5471.522219805362, + "mem allocated avg": 7281535514.624, + "mem reserved avg": 12256066666.496, + "elapsed time": 1555.2909630150025 + }, + { + "step": 4250, + "valid accuracy": 0.34, + "train loss": 0.6776066061258316, + "train samples": 17000, + "train time": 37.65609644694632, + "eval time": 11.334564828997827, + "tokens / sec": 5613.672683726684, + "mem allocated avg": 7291894349.824, + "mem reserved avg": 12418562392.064, + "elapsed time": 1652.928281804001 + }, + { + "step": 4500, + "valid accuracy": 0.34, + "train loss": 0.6868188911676407, + "train samples": 18000, + "train time": 37.48494880297949, + "eval time": 11.33762150000257, + "tokens / sec": 5544.038517760537, + "mem allocated avg": 7285549684.736, + "mem reserved avg": 12333837451.264, + "elapsed time": 1749.9311109990012 + }, + { + "step": 4750, + 
"valid accuracy": 0.34, + "train loss": 0.6806062284708023, + "train samples": 19000, + "train time": 33.62080936400889, + "eval time": 11.34113016500487, + "tokens / sec": 6244.31725384755, + "mem allocated avg": 7068488509.44, + "mem reserved avg": 12120833916.928, + "elapsed time": 1843.633759463999 + }, + { + "step": 5000, + "valid accuracy": 0.28, + "train loss": 0.6862971596717834, + "train samples": 20000, + "train time": 33.47089828590106, + "eval time": 11.363945298006001, + "tokens / sec": 6222.7191580255185, + "mem allocated avg": 7065409925.12, + "mem reserved avg": 12064965787.648, + "elapsed time": 1937.0431615920024 + }, + { + "step": 5000, + "test accuracy": 0.3904473085670963, + "train loss": 0.6862971596717834, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture 
applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/adaptionprompt--llama-3.2-3B-lr_0.0005.json 
b/peft/method_comparison/MetaMathQA/results/adaptionprompt--llama-3.2-3B-lr_0.0005.json new file mode 100644 index 0000000000000000000000000000000000000000..c35ccc865b46ea711763fef8f6cabfd1d77d5bd8 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/adaptionprompt--llama-3.2-3B-lr_0.0005.json @@ -0,0 +1,341 @@ +{ + "run_info": { + "created_at": "2025-06-20T04:48:22+00:00", + "total_time": 2260.6744696069945, + "experiment_name": "adaptionprompt/llama-3.2-3B-lr_0.0005", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0005 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "ADAPTION_PROMPT", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "target_modules": "self_attn", + "adapter_len": 100, + "adapter_layers": 28 + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 11893757234, + "accelerator_memory_max": 22410166272, + "accelerator_memory_reserved_99th": 17907664814, + "train_time": 1989.2834085189897, + "file_size": 17210384, + "num_trainable_params": 8601628, + "num_total_params": 3221351452, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.3201356165409088, + "train samples": 1000, + "train time": 36.18721537806414, + "eval time": 13.46754032199533, + "tokens / sec": 5850.657415556191, + "mem allocated avg": 6848060076.032, + "mem 
reserved avg": 11943163199.488, + "elapsed time": 99.94861951399798 + }, + { + "step": 500, + "valid accuracy": 0.1, + "train loss": 1.153662922859192, + "train samples": 2000, + "train time": 35.6493088029747, + "eval time": 13.314302301005227, + "tokens / sec": 5834.474972559473, + "mem allocated avg": 6840933136.384, + "mem reserved avg": 11833045942.272, + "elapsed time": 193.4177081749949 + }, + { + "step": 750, + "valid accuracy": 0.22, + "train loss": 0.9016587936878204, + "train samples": 3000, + "train time": 36.424757257977035, + "eval time": 13.392894379001518, + "tokens / sec": 5886.133941305707, + "mem allocated avg": 6851972698.112, + "mem reserved avg": 11989870968.832, + "elapsed time": 288.2962625699947 + }, + { + "step": 1000, + "valid accuracy": 0.2, + "train loss": 0.8571369113922119, + "train samples": 4000, + "train time": 35.59983186099271, + "eval time": 13.363479856001504, + "tokens / sec": 5852.1624712581015, + "mem allocated avg": 6842572642.304, + "mem reserved avg": 11863001661.44, + "elapsed time": 381.66334240599826 + }, + { + "step": 1250, + "valid accuracy": 0.18, + "train loss": 0.84929132604599, + "train samples": 5000, + "train time": 35.52914607799903, + "eval time": 13.408120855005109, + "tokens / sec": 5869.490911551474, + "mem allocated avg": 6843078866.944, + "mem reserved avg": 11855409971.2, + "elapsed time": 475.2031378399988 + }, + { + "step": 1500, + "valid accuracy": 0.18, + "train loss": 0.8379741818904877, + "train samples": 6000, + "train time": 35.84657208897261, + "eval time": 13.451748254003178, + "tokens / sec": 5839.637873335062, + "mem allocated avg": 6844234328.064, + "mem reserved avg": 11880013758.464, + "elapsed time": 568.970056428996 + }, + { + "step": 1750, + "valid accuracy": 0.2, + "train loss": 0.8320568509101868, + "train samples": 7000, + "train time": 36.04748217701126, + "eval time": 13.354637482996623, + "tokens / sec": 5807.756529900249, + "mem allocated avg": 6845049858.048, + "mem reserved 
avg": 11894333112.32, + "elapsed time": 663.2131869919976 + }, + { + "step": 2000, + "valid accuracy": 0.2, + "train loss": 0.83651398563385, + "train samples": 8000, + "train time": 35.70882848704787, + "eval time": 13.407459709997056, + "tokens / sec": 5816.376756110452, + "mem allocated avg": 6842067818.496, + "mem reserved avg": 11843724640.256, + "elapsed time": 756.9679808469955 + }, + { + "step": 2250, + "valid accuracy": 0.18, + "train loss": 0.8321560187339783, + "train samples": 9000, + "train time": 36.077689886013104, + "eval time": 13.313609958000598, + "tokens / sec": 5957.92027369615, + "mem allocated avg": 6853360060.416, + "mem reserved avg": 12025841319.936, + "elapsed time": 851.5264306229947 + }, + { + "step": 2500, + "valid accuracy": 0.22, + "train loss": 0.830465945482254, + "train samples": 10000, + "train time": 35.51607862501987, + "eval time": 13.570960901000944, + "tokens / sec": 5799.260728488849, + "mem allocated avg": 6838232895.488, + "mem reserved avg": 11785499312.128, + "elapsed time": 945.1205676109967 + }, + { + "step": 2750, + "valid accuracy": 0.2, + "train loss": 0.8323929319381714, + "train samples": 11000, + "train time": 36.33290277811466, + "eval time": 13.340032396001334, + "tokens / sec": 5831.6562619276265, + "mem allocated avg": 6849506107.392, + "mem reserved avg": 11957667102.72, + "elapsed time": 1039.698461469001 + }, + { + "step": 3000, + "valid accuracy": 0.22, + "train loss": 0.8273163681030273, + "train samples": 12000, + "train time": 36.133581758025684, + "eval time": 13.486512909999874, + "tokens / sec": 5776.648476140576, + "mem allocated avg": 6844330549.248, + "mem reserved avg": 11874754101.248, + "elapsed time": 1134.0729920019949 + }, + { + "step": 3250, + "valid accuracy": 0.18, + "train loss": 0.8321007430553437, + "train samples": 13000, + "train time": 35.81564853595046, + "eval time": 13.383609317002993, + "tokens / sec": 5888.515456820645, + "mem allocated avg": 6845503963.136, + "mem reserved 
avg": 11903065653.248, + "elapsed time": 1228.1345331240009 + }, + { + "step": 3500, + "valid accuracy": 0.18, + "train loss": 0.8267617487907409, + "train samples": 14000, + "train time": 35.759473790014454, + "eval time": 13.568141147006827, + "tokens / sec": 5865.578482269809, + "mem allocated avg": 6844375582.72, + "mem reserved avg": 11893385199.616, + "elapsed time": 1322.3741278140005 + }, + { + "step": 3750, + "valid accuracy": 0.18, + "train loss": 0.822540352344513, + "train samples": 15000, + "train time": 36.6447854490616, + "eval time": 13.383382205000089, + "tokens / sec": 5913.610827418539, + "mem allocated avg": 6855454945.28, + "mem reserved avg": 12064244367.36, + "elapsed time": 1417.8726171529997 + }, + { + "step": 4000, + "valid accuracy": 0.22, + "train loss": 0.842738341331482, + "train samples": 16000, + "train time": 35.83419257100468, + "eval time": 13.484180120998644, + "tokens / sec": 5703.295800373884, + "mem allocated avg": 6837201041.408, + "mem reserved avg": 11769015697.408, + "elapsed time": 1511.8286734409994 + }, + { + "step": 4250, + "valid accuracy": 0.24, + "train loss": 0.8195172207355499, + "train samples": 17000, + "train time": 36.032976000991766, + "eval time": 13.43221827600064, + "tokens / sec": 5866.542913196561, + "mem allocated avg": 6847173238.784, + "mem reserved avg": 11924070727.68, + "elapsed time": 1606.2413196950001 + }, + { + "step": 4500, + "valid accuracy": 0.22, + "train loss": 0.8333091423511505, + "train samples": 18000, + "train time": 35.92476197002543, + "eval time": 13.364069708994066, + "tokens / sec": 5784.812163081199, + "mem allocated avg": 6842308513.792, + "mem reserved avg": 11840637632.512, + "elapsed time": 1700.1633438569988 + }, + { + "step": 4750, + "valid accuracy": 0.24, + "train loss": 0.8247289218902588, + "train samples": 19000, + "train time": 36.319470202004595, + "eval time": 13.367499373998726, + "tokens / sec": 5780.343128144329, + "mem allocated avg": 6845010323.456, + "mem 
reserved avg": 11893443919.872, + "elapsed time": 1795.0117048679967 + }, + { + "step": 5000, + "valid accuracy": 0.24, + "train loss": 0.8317011270523071, + "train samples": 20000, + "train time": 35.778475134953624, + "eval time": 13.382634160996531, + "tokens / sec": 5821.377216731123, + "mem allocated avg": 6841479706.624, + "mem reserved avg": 11840956399.616, + "elapsed time": 1888.9356832179983 + }, + { + "step": 5000, + "test accuracy": 0.22062168309325247, + "train loss": 0.8317011270523071, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/boft--llama-3.2-3B-default.json b/peft/method_comparison/MetaMathQA/results/boft--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..83ddbc7c63224c54a14f3d4eb0efdaabb159fb66 --- /dev/null +++ 
b/peft/method_comparison/MetaMathQA/results/boft--llama-3.2-3B-default.json @@ -0,0 +1,354 @@ +{ + "run_info": { + "created_at": "2025-06-20T00:26:06+00:00", + "total_time": 11113.556226242006, + "experiment_name": "boft/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "BOFT", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "boft_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "modules_to_save": null, + "init_weights": true, + "layers_to_transform": null, + "layers_pattern": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 14814855089, + "accelerator_memory_max": 24427626496, + "accelerator_memory_reserved_99th": 20103445872, + "train_time": 8291.859631775995, + "file_size": 3225360, + "num_trainable_params": 802816, + "num_total_params": 3213552640, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.291453486919403, + "train samples": 1000, + "train time": 168.6401632970519, + "eval time": 140.71104099299555, + "tokens / sec": 1255.4482625059293, + "mem allocated avg": 
6794374191.104, + "mem reserved avg": 14862272954.368, + "elapsed time": 378.35506656600046 + }, + { + "step": 500, + "valid accuracy": 0.12, + "train loss": 1.0658165102005004, + "train samples": 2000, + "train time": 168.0782826189752, + "eval time": 140.55351014900225, + "tokens / sec": 1237.4888460248842, + "mem allocated avg": 6786098696.192, + "mem reserved avg": 14759126630.4, + "elapsed time": 750.4153373740046 + }, + { + "step": 750, + "valid accuracy": 0.38, + "train loss": 0.8760707340240479, + "train samples": 3000, + "train time": 168.35559053501493, + "eval time": 140.5371915020005, + "tokens / sec": 1273.5009233649919, + "mem allocated avg": 6796379451.392, + "mem reserved avg": 14898109087.744, + "elapsed time": 1123.1088362480004 + }, + { + "step": 1000, + "valid accuracy": 0.42, + "train loss": 0.8187176239490509, + "train samples": 4000, + "train time": 168.23626853094902, + "eval time": 140.51234973900137, + "tokens / sec": 1238.3536666570453, + "mem allocated avg": 6788017170.432, + "mem reserved avg": 14785978564.608, + "elapsed time": 1495.2035204040003 + }, + { + "step": 1250, + "valid accuracy": 0.44, + "train loss": 0.7968595073223114, + "train samples": 5000, + "train time": 168.06973706404096, + "eval time": 140.56398986800195, + "tokens / sec": 1240.7825682534333, + "mem allocated avg": 6786994073.6, + "mem reserved avg": 14784728662.016, + "elapsed time": 1867.293767313 + }, + { + "step": 1500, + "valid accuracy": 0.3, + "train loss": 0.7768308148384094, + "train samples": 6000, + "train time": 168.12391281103191, + "eval time": 140.47015122300218, + "tokens / sec": 1245.0995013141533, + "mem allocated avg": 6790023022.592, + "mem reserved avg": 14800616685.568, + "elapsed time": 2239.2391544300044 + }, + { + "step": 1750, + "valid accuracy": 0.34, + "train loss": 0.7639130955934524, + "train samples": 7000, + "train time": 168.4569528100401, + "eval time": 140.76006173399946, + "tokens / sec": 1242.780404772479, + "mem allocated avg": 
6790166409.216, + "mem reserved avg": 14820103421.952, + "elapsed time": 2611.854956449002 + }, + { + "step": 2000, + "valid accuracy": 0.28, + "train loss": 0.7575103138685226, + "train samples": 8000, + "train time": 168.38565446306166, + "eval time": 140.82750502999988, + "tokens / sec": 1233.4542432506432, + "mem allocated avg": 6787659706.368, + "mem reserved avg": 14766038843.392, + "elapsed time": 2984.338527646003 + }, + { + "step": 2250, + "valid accuracy": 0.36, + "train loss": 0.7480558000802994, + "train samples": 9000, + "train time": 168.98983921804756, + "eval time": 140.92262020800263, + "tokens / sec": 1271.9581307054364, + "mem allocated avg": 6798715979.776, + "mem reserved avg": 14937929809.92, + "elapsed time": 3357.8442202950027 + }, + { + "step": 2500, + "valid accuracy": 0.36, + "train loss": 0.7452825582027436, + "train samples": 10000, + "train time": 168.30827127001976, + "eval time": 140.89225408899802, + "tokens / sec": 1223.7485326527044, + "mem allocated avg": 6783722676.224, + "mem reserved avg": 14710111993.856, + "elapsed time": 3730.0927005050034 + }, + { + "step": 2750, + "valid accuracy": 0.4, + "train loss": 0.7368131847381592, + "train samples": 11000, + "train time": 168.8352410539519, + "eval time": 140.97951381299936, + "tokens / sec": 1254.9571918595636, + "mem allocated avg": 6794155292.672, + "mem reserved avg": 14876869132.288, + "elapsed time": 4103.762088249001 + }, + { + "step": 3000, + "valid accuracy": 0.38, + "train loss": 0.7284122853279114, + "train samples": 12000, + "train time": 168.7332625999261, + "eval time": 140.92822863799665, + "tokens / sec": 1237.0471404616308, + "mem allocated avg": 6789107718.144, + "mem reserved avg": 14802571231.232, + "elapsed time": 4477.013831755001 + }, + { + "step": 3250, + "valid accuracy": 0.34, + "train loss": 0.7360657904148101, + "train samples": 13000, + "train time": 168.6564349730761, + "eval time": 140.91345744199498, + "tokens / sec": 1250.4770424779092, + "mem 
allocated avg": 6791307786.24, + "mem reserved avg": 14825665069.056, + "elapsed time": 4850.336532419002 + }, + { + "step": 3500, + "valid accuracy": 0.34, + "train loss": 0.7245372575521469, + "train samples": 14000, + "train time": 168.69712368501496, + "eval time": 141.10813598799723, + "tokens / sec": 1243.3525564528145, + "mem allocated avg": 6789542191.104, + "mem reserved avg": 14803175211.008, + "elapsed time": 5223.900597244006 + }, + { + "step": 3750, + "valid accuracy": 0.36, + "train loss": 0.7196882257461548, + "train samples": 15000, + "train time": 169.02741387199057, + "eval time": 140.85168583100312, + "tokens / sec": 1282.0583066135978, + "mem allocated avg": 6800711397.376, + "mem reserved avg": 14974772576.256, + "elapsed time": 5597.923287113001 + }, + { + "step": 4000, + "valid accuracy": 0.4, + "train loss": 0.7386573747396469, + "train samples": 16000, + "train time": 168.47688378201565, + "eval time": 141.17620621900278, + "tokens / sec": 1213.062560347618, + "mem allocated avg": 6781920968.704, + "mem reserved avg": 14703241723.904, + "elapsed time": 5970.573302798002 + }, + { + "step": 4250, + "valid accuracy": 0.36, + "train loss": 0.7167660998106002, + "train samples": 17000, + "train time": 168.66243355697225, + "eval time": 141.03309625500697, + "tokens / sec": 1253.3259217358275, + "mem allocated avg": 6792739334.144, + "mem reserved avg": 14838457696.256, + "elapsed time": 6343.574297415005 + }, + { + "step": 4500, + "valid accuracy": 0.36, + "train loss": 0.7278824989795685, + "train samples": 18000, + "train time": 168.825120675996, + "eval time": 141.10180295899772, + "tokens / sec": 1230.966097745832, + "mem allocated avg": 6787403542.528, + "mem reserved avg": 14768026943.488, + "elapsed time": 6716.868663600006 + }, + { + "step": 4750, + "valid accuracy": 0.34, + "train loss": 0.7206774606704712, + "train samples": 19000, + "train time": 168.64492384497134, + "eval time": 140.88104952100548, + "tokens / sec": 
1244.8581031290848, + "mem allocated avg": 6790186668.032, + "mem reserved avg": 14817972715.52, + "elapsed time": 7090.485984892002 + }, + { + "step": 5000, + "valid accuracy": 0.34, + "train loss": 0.7268091850280761, + "train samples": 20000, + "train time": 168.56219975605927, + "eval time": 140.98389447200316, + "tokens / sec": 1235.6269691628356, + "mem allocated avg": 6787183779.84, + "mem reserved avg": 14761332834.304, + "elapsed time": 7463.428281595006 + }, + { + "step": 5000, + "test accuracy": 0.3646702047005307, + "train loss": 0.7268091850280761, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/bone--llama-3.2-3B-bat.json b/peft/method_comparison/MetaMathQA/results/bone--llama-3.2-3B-bat.json new file mode 100644 index 0000000000000000000000000000000000000000..069bbfe1077efd4cc4ea365424fc6d6d3c554ff7 --- /dev/null +++ 
b/peft/method_comparison/MetaMathQA/results/bone--llama-3.2-3B-bat.json @@ -0,0 +1,350 @@ +{ + "run_info": { + "created_at": "2025-06-20T03:31:24+00:00", + "total_time": 2742.3845372959986, + "experiment_name": "bone/llama-3.2-3B-bat", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "BONE", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 64, + "target_modules": [ + "v_proj", + "q_proj" + ], + "exclude_modules": null, + "init_weights": "bat", + "layers_to_transform": null, + "layers_pattern": null, + "bias": "none", + "modules_to_save": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 14713983755, + "accelerator_memory_max": 25251807232, + "accelerator_memory_reserved_99th": 20472733368, + "train_time": 2430.7548372539895, + "file_size": 29367552, + "num_trainable_params": 7340032, + "num_total_params": 3220089856, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.34, + "train loss": 0.8741071329116822, + "train samples": 1000, + "train time": 44.769113782072964, + "eval time": 16.53786130100343, + "tokens / sec": 4729.130914464948, + "mem allocated avg": 6898425409.536, + "mem reserved avg": 14773294989.312, + "elapsed time": 124.73039968500234 + }, + { + "step": 500, + "valid 
accuracy": 0.42, + "train loss": 0.6946564470529556, + "train samples": 2000, + "train time": 43.747789238033874, + "eval time": 16.4541177170031, + "tokens / sec": 4754.4116770858745, + "mem allocated avg": 6890118709.248, + "mem reserved avg": 14662749913.088, + "elapsed time": 242.48505929599924 + }, + { + "step": 750, + "valid accuracy": 0.42, + "train loss": 0.6668610339164733, + "train samples": 3000, + "train time": 44.788394879076805, + "eval time": 8.99262467600056, + "tokens / sec": 4786.9766393472355, + "mem allocated avg": 6900886024.192, + "mem reserved avg": 14820195696.64, + "elapsed time": 354.3122298879971 + }, + { + "step": 1000, + "valid accuracy": 0.42, + "train loss": 0.6476555281877517, + "train samples": 4000, + "train time": 43.08444309095648, + "eval time": 14.581032188005338, + "tokens / sec": 4835.527282090601, + "mem allocated avg": 6892210176.0, + "mem reserved avg": 14677799075.84, + "elapsed time": 469.41999823199876 + }, + { + "step": 1250, + "valid accuracy": 0.38, + "train loss": 0.6442477897405624, + "train samples": 5000, + "train time": 43.81069704208494, + "eval time": 16.504536090003967, + "tokens / sec": 4759.979048031958, + "mem allocated avg": 6892437598.208, + "mem reserved avg": 14675995525.12, + "elapsed time": 587.4669312400001 + }, + { + "step": 1500, + "valid accuracy": 0.48, + "train loss": 0.6370412122011184, + "train samples": 6000, + "train time": 44.041188616007275, + "eval time": 11.50742915799492, + "tokens / sec": 4753.07335197389, + "mem allocated avg": 6893869041.664, + "mem reserved avg": 14704349020.16, + "elapsed time": 700.887209352004 + }, + { + "step": 1750, + "valid accuracy": 0.44, + "train loss": 0.6277673766613007, + "train samples": 7000, + "train time": 44.32280573899334, + "eval time": 16.494074002999696, + "tokens / sec": 4723.414876595195, + "mem allocated avg": 6895170344.96, + "mem reserved avg": 14718215389.184, + "elapsed time": 819.4313268580008 + }, + { + "step": 2000, + "valid 
accuracy": 0.48, + "train loss": 0.6278820457458496, + "train samples": 8000, + "train time": 43.325528461049544, + "eval time": 16.452074027998606, + "tokens / sec": 4793.848047040501, + "mem allocated avg": 6891568050.176, + "mem reserved avg": 14656710115.328, + "elapsed time": 936.9070930559974 + }, + { + "step": 2250, + "valid accuracy": 0.44, + "train loss": 0.6160005252361298, + "train samples": 9000, + "train time": 45.04456213898811, + "eval time": 16.52133422600309, + "tokens / sec": 4771.896757188206, + "mem allocated avg": 6903412344.832, + "mem reserved avg": 14851812360.192, + "elapsed time": 1056.8185863660037 + }, + { + "step": 2500, + "valid accuracy": 0.5, + "train loss": 0.6121727240085602, + "train samples": 10000, + "train time": 43.16439942702709, + "eval time": 16.356938169003115, + "tokens / sec": 4771.686916395162, + "mem allocated avg": 6888002562.048, + "mem reserved avg": 14598350569.472, + "elapsed time": 1173.7929829869972 + }, + { + "step": 2750, + "valid accuracy": 0.52, + "train loss": 0.6007345867156982, + "train samples": 11000, + "train time": 44.3066304440581, + "eval time": 16.514935120998416, + "tokens / sec": 4782.151065798665, + "mem allocated avg": 6899352545.28, + "mem reserved avg": 14785458470.912, + "elapsed time": 1292.7444534430033 + }, + { + "step": 3000, + "valid accuracy": 0.52, + "train loss": 0.5899704934358597, + "train samples": 12000, + "train time": 44.07467572299356, + "eval time": 16.412788394998643, + "tokens / sec": 4735.848796979486, + "mem allocated avg": 6894036676.608, + "mem reserved avg": 14687865405.44, + "elapsed time": 1411.115336062001 + }, + { + "step": 3250, + "valid accuracy": 0.48, + "train loss": 0.5988378477096558, + "train samples": 13000, + "train time": 44.070030323957326, + "eval time": 10.250203846997465, + "tokens / sec": 4785.587812163363, + "mem allocated avg": 6895260303.36, + "mem reserved avg": 14725043716.096, + "elapsed time": 1523.332073521 + }, + { + "step": 3500, + "valid 
accuracy": 0.5, + "train loss": 0.5801258901357651, + "train samples": 14000, + "train time": 43.991991777089424, + "eval time": 16.38271237299341, + "tokens / sec": 4767.913238909897, + "mem allocated avg": 6893688922.112, + "mem reserved avg": 14703484993.536, + "elapsed time": 1641.7187374700006 + }, + { + "step": 3750, + "valid accuracy": 0.5, + "train loss": 0.5768071869611741, + "train samples": 15000, + "train time": 45.04501243098639, + "eval time": 16.454509290000715, + "tokens / sec": 4810.810083180938, + "mem allocated avg": 6905122422.784, + "mem reserved avg": 14891314315.264, + "elapsed time": 1761.645320085001 + }, + { + "step": 4000, + "valid accuracy": 0.52, + "train loss": 0.5858320169448853, + "train samples": 16000, + "train time": 42.547905418032315, + "eval time": 16.350580427999375, + "tokens / sec": 4803.36218650576, + "mem allocated avg": 6886491265.024, + "mem reserved avg": 14582730981.376, + "elapsed time": 1878.0724109930015 + }, + { + "step": 4250, + "valid accuracy": 0.54, + "train loss": 0.5723247408866883, + "train samples": 17000, + "train time": 44.19116178697732, + "eval time": 16.508775556001638, + "tokens / sec": 4783.513070305705, + "mem allocated avg": 6897152284.672, + "mem reserved avg": 14738381602.816, + "elapsed time": 1996.8971549050038 + }, + { + "step": 4500, + "valid accuracy": 0.48, + "train loss": 0.5789256048202515, + "train samples": 18000, + "train time": 43.87211918797402, + "eval time": 16.414912490006827, + "tokens / sec": 4736.903615473535, + "mem allocated avg": 6893093124.096, + "mem reserved avg": 14658832433.152, + "elapsed time": 2114.9650602839974 + }, + { + "step": 4750, + "valid accuracy": 0.48, + "train loss": 0.568240401506424, + "train samples": 19000, + "train time": 43.939464293958736, + "eval time": 16.460097985000175, + "tokens / sec": 4777.914418698651, + "mem allocated avg": 6894218592.256, + "mem reserved avg": 14710372040.704, + "elapsed time": 2233.517725938 + }, + { + "step": 5000, + 
"valid accuracy": 0.5, + "train loss": 0.57634852206707, + "train samples": 20000, + "train time": 42.787552905057964, + "eval time": 16.445046182001533, + "tokens / sec": 4867.770785166333, + "mem allocated avg": 6890906441.728, + "mem reserved avg": 14656718503.936, + "elapsed time": 2350.279711092 + }, + { + "step": 5000, + "test accuracy": 0.5170583775587566, + "train loss": 0.57634852206707, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/bone--llama-3.2-3B-default.json b/peft/method_comparison/MetaMathQA/results/bone--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..a473c5827e1b13156e94be30ac5baee3f7350ff6 --- /dev/null +++ 
b/peft/method_comparison/MetaMathQA/results/bone--llama-3.2-3B-default.json @@ -0,0 +1,350 @@ +{ + "run_info": { + "created_at": "2025-06-20T04:17:11+00:00", + "total_time": 1867.121674144997, + "experiment_name": "bone/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "BONE", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 64, + "target_modules": [ + "v_proj", + "q_proj" + ], + "exclude_modules": null, + "init_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "bias": "none", + "modules_to_save": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 11170837063, + "accelerator_memory_max": 20248002560, + "accelerator_memory_reserved_99th": 16303469363, + "train_time": 1664.0814183089897, + "file_size": 29367496, + "num_trainable_params": 7340032, + "num_total_params": 3220089856, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.34, + "train loss": 0.8771067566871643, + "train samples": 1000, + "train time": 29.468342912026856, + "eval time": 11.086663477995899, + "tokens / sec": 7184.625230948821, + "mem allocated avg": 6894354876.416, + "mem reserved avg": 11212691603.456, + "elapsed time": 88.56553585999791 + }, + { + "step": 500, + "valid 
accuracy": 0.38, + "train loss": 0.6947847135066986, + "train samples": 2000, + "train time": 29.13603712292388, + "eval time": 11.12908834600239, + "tokens / sec": 7138.753946615206, + "mem allocated avg": 6887297284.096, + "mem reserved avg": 11116172279.808, + "elapsed time": 169.94219922799675 + }, + { + "step": 750, + "valid accuracy": 0.42, + "train loss": 0.6673308206796646, + "train samples": 3000, + "train time": 29.74789179801155, + "eval time": 6.2111000180011615, + "tokens / sec": 7207.267037805055, + "mem allocated avg": 6897885888.512, + "mem reserved avg": 11257109282.816, + "elapsed time": 247.40845895299572 + }, + { + "step": 1000, + "valid accuracy": 0.44, + "train loss": 0.6480507221221924, + "train samples": 4000, + "train time": 29.01437903306214, + "eval time": 11.063560270995367, + "tokens / sec": 7180.439731713689, + "mem allocated avg": 6888501639.168, + "mem reserved avg": 11141564596.224, + "elapsed time": 328.43337820599845 + }, + { + "step": 1250, + "valid accuracy": 0.42, + "train loss": 0.6442041766643524, + "train samples": 5000, + "train time": 28.86099356606428, + "eval time": 11.061821620001865, + "tokens / sec": 7225.600169399779, + "mem allocated avg": 6888334700.544, + "mem reserved avg": 11139123511.296, + "elapsed time": 409.5306018880001 + }, + { + "step": 1500, + "valid accuracy": 0.52, + "train loss": 0.6375475705862045, + "train samples": 6000, + "train time": 29.36598393299937, + "eval time": 6.896059851998871, + "tokens / sec": 7128.349606047729, + "mem allocated avg": 6890338080.768, + "mem reserved avg": 11164893315.072, + "elapsed time": 487.1438905899995 + }, + { + "step": 1750, + "valid accuracy": 0.42, + "train loss": 0.6282199568748474, + "train samples": 7000, + "train time": 29.2208460940019, + "eval time": 11.139122824002698, + "tokens / sec": 7164.576936838726, + "mem allocated avg": 6891485964.288, + "mem reserved avg": 11174582157.312, + "elapsed time": 568.6407176649955 + }, + { + "step": 2000, + "valid 
accuracy": 0.44, + "train loss": 0.628275181055069, + "train samples": 8000, + "train time": 28.774674860083906, + "eval time": 11.096917715003656, + "tokens / sec": 7218.013791986054, + "mem allocated avg": 6889055956.992, + "mem reserved avg": 11126481879.04, + "elapsed time": 649.4662010969987 + }, + { + "step": 2250, + "valid accuracy": 0.5, + "train loss": 0.6164452042579651, + "train samples": 9000, + "train time": 29.666104338008154, + "eval time": 6.740810982002586, + "tokens / sec": 7245.575541396888, + "mem allocated avg": 6899385456.64, + "mem reserved avg": 11287358603.264, + "elapsed time": 727.5584506419982 + }, + { + "step": 2500, + "valid accuracy": 0.52, + "train loss": 0.6124898854494095, + "train samples": 10000, + "train time": 28.952800227045373, + "eval time": 11.054138113999215, + "tokens / sec": 7113.888756349109, + "mem allocated avg": 6884753041.408, + "mem reserved avg": 11077492408.32, + "elapsed time": 808.6757636719994 + }, + { + "step": 2750, + "valid accuracy": 0.48, + "train loss": 0.6010023313760757, + "train samples": 11000, + "train time": 29.36040201097785, + "eval time": 5.933361176998005, + "tokens / sec": 7216.556500853691, + "mem allocated avg": 6895703631.872, + "mem reserved avg": 11229007446.016, + "elapsed time": 885.2688505609985 + }, + { + "step": 3000, + "valid accuracy": 0.36, + "train loss": 0.590470621585846, + "train samples": 12000, + "train time": 29.152743853985157, + "eval time": 11.051910919995862, + "tokens / sec": 7159.909236861306, + "mem allocated avg": 6890226739.2, + "mem reserved avg": 11156563427.328, + "elapsed time": 966.2876440099935 + }, + { + "step": 3250, + "valid accuracy": 0.46, + "train loss": 0.5996054347753524, + "train samples": 13000, + "train time": 29.23224936202314, + "eval time": 11.06002619300125, + "tokens / sec": 7214.668888053154, + "mem allocated avg": 6892138940.416, + "mem reserved avg": 11182651998.208, + "elapsed time": 1047.7634995759945 + }, + { + "step": 3500, + "valid 
accuracy": 0.46, + "train loss": 0.5810788285732269, + "train samples": 14000, + "train time": 29.556202010979177, + "eval time": 7.767598452002858, + "tokens / sec": 7096.649289448104, + "mem allocated avg": 6891370110.976, + "mem reserved avg": 11166763974.656, + "elapsed time": 1126.3068484049945 + }, + { + "step": 3750, + "valid accuracy": 0.5, + "train loss": 0.5778432558774949, + "train samples": 15000, + "train time": 30.077826159038523, + "eval time": 11.010653469995304, + "tokens / sec": 7204.742751493022, + "mem allocated avg": 6901065279.488, + "mem reserved avg": 11319788961.792, + "elapsed time": 1209.0550349339974 + }, + { + "step": 4000, + "valid accuracy": 0.4, + "train loss": 0.5869229323863984, + "train samples": 16000, + "train time": 29.213863794990175, + "eval time": 11.144038623999222, + "tokens / sec": 6995.753845988955, + "mem allocated avg": 6883645001.728, + "mem reserved avg": 11058953584.64, + "elapsed time": 1290.3985370609953 + }, + { + "step": 4250, + "valid accuracy": 0.46, + "train loss": 0.5733816763162612, + "train samples": 17000, + "train time": 29.18649683901458, + "eval time": 11.153094029003114, + "tokens / sec": 7242.698607029438, + "mem allocated avg": 6893432758.272, + "mem reserved avg": 11193884344.32, + "elapsed time": 1372.1237251569983 + }, + { + "step": 4500, + "valid accuracy": 0.48, + "train loss": 0.5803762240409851, + "train samples": 18000, + "train time": 29.077459994943638, + "eval time": 11.118935573998897, + "tokens / sec": 7147.047920834147, + "mem allocated avg": 6888416004.096, + "mem reserved avg": 11124485390.336, + "elapsed time": 1453.4214935309938 + }, + { + "step": 4750, + "valid accuracy": 0.48, + "train loss": 0.5692038584947586, + "train samples": 19000, + "train time": 29.40723867896304, + "eval time": 11.099454375005735, + "tokens / sec": 7139.024588193769, + "mem allocated avg": 6890813089.792, + "mem reserved avg": 11168844349.44, + "elapsed time": 1535.6791463129994 + }, + { + "step": 5000, 
+ "valid accuracy": 0.48, + "train loss": 0.5775641392469406, + "train samples": 20000, + "train time": 28.941933833950316, + "eval time": 11.18307958800142, + "tokens / sec": 7196.47834159849, + "mem allocated avg": 6887869800.448, + "mem reserved avg": 11118328152.064, + "elapsed time": 1617.277517963994 + }, + { + "step": 5000, + "test accuracy": 0.5079605761940864, + "train loss": 0.5775641392469406, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/c3a--llama-3.2-3B-default.json b/peft/method_comparison/MetaMathQA/results/c3a--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..dedfb5f28850701d45d209484d9fdff0b0bafa89 --- /dev/null +++ 
b/peft/method_comparison/MetaMathQA/results/c3a--llama-3.2-3B-default.json @@ -0,0 +1,350 @@ +{ + "run_info": { + "created_at": "2025-07-31T13:35:43+00:00", + "total_time": 2124.5942297870006, + "experiment_name": "c3a/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.3, + "weight_decay": 1e-05 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "C3A", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "block_size": 64, + "target_modules": [ + "q_proj", + "v_proj" + ], + "bias": "none", + "modules_to_save": null, + "layers_to_transform": null, + "layers_pattern": null, + "block_size_pattern": {}, + "init_weights": false + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 11804454210, + "accelerator_memory_max": 22280142848, + "accelerator_memory_reserved_99th": 17825917829, + "train_time": 1924.4760333429977, + "file_size": 22027512, + "num_trainable_params": 5505024, + "num_total_params": 3218254848, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.36, + "train loss": 0.8909663727283478, + "train samples": 1000, + "train time": 39.02584077800293, + "eval time": 13.030882289000147, + "tokens / sec": 5425.097724463024, + "mem allocated avg": 6868124932.096, + "mem reserved avg": 11851408605.184, + "elapsed time": 101.62553732800006 + }, + { + "step": 500, + 
"valid accuracy": 0.42, + "train loss": 0.7016348876953125, + "train samples": 2000, + "train time": 38.45351204700091, + "eval time": 7.171581626000261, + "tokens / sec": 5408.9987865288385, + "mem allocated avg": 6861684402.176, + "mem reserved avg": 11746811052.032, + "elapsed time": 190.13871278500028 + }, + { + "step": 750, + "valid accuracy": 0.4, + "train loss": 0.6737332336902618, + "train samples": 3000, + "train time": 38.979470817998845, + "eval time": 7.451876584000274, + "tokens / sec": 5500.356867364139, + "mem allocated avg": 6871526113.28, + "mem reserved avg": 11907578724.352, + "elapsed time": 279.83808459899956 + }, + { + "step": 1000, + "valid accuracy": 0.42, + "train loss": 0.654857279419899, + "train samples": 4000, + "train time": 38.61999764599932, + "eval time": 13.0494962570001, + "tokens / sec": 5394.5109450720465, + "mem allocated avg": 6862362261.504, + "mem reserved avg": 11770039107.584, + "elapsed time": 374.5986276450003 + }, + { + "step": 1250, + "valid accuracy": 0.36, + "train loss": 0.6510260388851166, + "train samples": 5000, + "train time": 38.424466599010884, + "eval time": 9.087004377000085, + "tokens / sec": 5427.2191251541835, + "mem allocated avg": 6862755778.56, + "mem reserved avg": 11764250968.064, + "elapsed time": 465.21620532700035 + }, + { + "step": 1500, + "valid accuracy": 0.44, + "train loss": 0.6443832906484603, + "train samples": 6000, + "train time": 38.60136020600203, + "eval time": 8.362256499000068, + "tokens / sec": 5422.891806995228, + "mem allocated avg": 6864079349.76, + "mem reserved avg": 11795154599.936, + "elapsed time": 555.3471686519997 + }, + { + "step": 1750, + "valid accuracy": 0.46, + "train loss": 0.6348284522294998, + "train samples": 7000, + "train time": 38.819046561000505, + "eval time": 12.987756649999938, + "tokens / sec": 5393.09999979052, + "mem allocated avg": 6865506678.784, + "mem reserved avg": 11806663770.112, + "elapsed time": 650.521843128 + }, + { + "step": 2000, + "valid 
accuracy": 0.42, + "train loss": 0.6368349348306656, + "train samples": 8000, + "train time": 38.522883960999025, + "eval time": 7.980172546000176, + "tokens / sec": 5391.496654567027, + "mem allocated avg": 6861885327.36, + "mem reserved avg": 11751290568.704, + "elapsed time": 740.3195631050003 + }, + { + "step": 2250, + "valid accuracy": 0.54, + "train loss": 0.625629832983017, + "train samples": 9000, + "train time": 39.25956673700239, + "eval time": 12.96675302299991, + "tokens / sec": 5475.047685572397, + "mem allocated avg": 6873176825.856, + "mem reserved avg": 11939161833.472, + "elapsed time": 836.2159785140002 + }, + { + "step": 2500, + "valid accuracy": 0.48, + "train loss": 0.6208862951993942, + "train samples": 10000, + "train time": 38.55282327599798, + "eval time": 13.027243182000348, + "tokens / sec": 5342.4621726271835, + "mem allocated avg": 6857644570.624, + "mem reserved avg": 11701781004.288, + "elapsed time": 930.913046529 + }, + { + "step": 2750, + "valid accuracy": 0.5, + "train loss": 0.6103025305271149, + "train samples": 11000, + "train time": 38.69634664399382, + "eval time": 8.820525846999772, + "tokens / sec": 5475.478136199886, + "mem allocated avg": 6868590700.544, + "mem reserved avg": 11873739079.68, + "elapsed time": 1021.7124050419998 + }, + { + "step": 3000, + "valid accuracy": 0.48, + "train loss": 0.5977142386436463, + "train samples": 12000, + "train time": 38.552098998990004, + "eval time": 8.200822365000022, + "tokens / sec": 5414.257729662615, + "mem allocated avg": 6864444424.192, + "mem reserved avg": 11782135480.32, + "elapsed time": 1111.412057769 + }, + { + "step": 3250, + "valid accuracy": 0.54, + "train loss": 0.6062814455032348, + "train samples": 13000, + "train time": 38.8476868979933, + "eval time": 9.795037900000352, + "tokens / sec": 5428.92040274589, + "mem allocated avg": 6865998022.656, + "mem reserved avg": 11821754875.904, + "elapsed time": 1203.456824805 + }, + { + "step": 3500, + "valid accuracy": 
0.54, + "train loss": 0.5876059620380402, + "train samples": 14000, + "train time": 39.031564167995384, + "eval time": 12.945379363999564, + "tokens / sec": 5373.855864377278, + "mem allocated avg": 6864331821.056, + "mem reserved avg": 11797008482.304, + "elapsed time": 1298.7533704500001 + }, + { + "step": 3750, + "valid accuracy": 0.5, + "train loss": 0.5839143842458725, + "train samples": 15000, + "train time": 39.49590561498735, + "eval time": 8.033668121999654, + "tokens / sec": 5486.720626498778, + "mem allocated avg": 6875409352.704, + "mem reserved avg": 11980509282.304, + "elapsed time": 1390.233747202 + }, + { + "step": 4000, + "valid accuracy": 0.56, + "train loss": 0.591371684551239, + "train samples": 16000, + "train time": 38.53615990600338, + "eval time": 12.963392601999658, + "tokens / sec": 5303.408551825156, + "mem allocated avg": 6856836122.624, + "mem reserved avg": 11673922437.12, + "elapsed time": 1484.869673702 + }, + { + "step": 4250, + "valid accuracy": 0.56, + "train loss": 0.577637273311615, + "train samples": 17000, + "train time": 38.46427064899399, + "eval time": 7.68621369199991, + "tokens / sec": 5495.723600975878, + "mem allocated avg": 6867260166.144, + "mem reserved avg": 11833297600.512, + "elapsed time": 1574.4202131299999 + }, + { + "step": 4500, + "valid accuracy": 0.56, + "train loss": 0.584571166396141, + "train samples": 18000, + "train time": 38.4389222480022, + "eval time": 7.678759501000968, + "tokens / sec": 5406.447107418601, + "mem allocated avg": 6862379522.048, + "mem reserved avg": 11747851239.424, + "elapsed time": 1663.6057326830005 + }, + { + "step": 4750, + "valid accuracy": 0.58, + "train loss": 0.5745555943250656, + "train samples": 19000, + "train time": 38.87202309699023, + "eval time": 7.697188709000329, + "tokens / sec": 5400.773699793749, + "mem allocated avg": 6863993468.928, + "mem reserved avg": 11795691470.848, + "elapsed time": 1753.6286738400004 + }, + { + "step": 5000, + "valid accuracy": 0.56, + 
"train loss": 0.5808088963031769, + "train samples": 20000, + "train time": 38.65712555299615, + "eval time": 7.723833553000077, + "tokens / sec": 5387.8811996629975, + "mem allocated avg": 6861167886.336, + "mem reserved avg": 11749034033.152, + "elapsed time": 1843.4390854000003 + }, + { + "step": 5000, + "test accuracy": 0.510235026535254, + "train loss": 0.5808088963031769, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.16.1.dev0", + "peft-commit-hash": "25e5c6b25c4589eb2683484ede1ba3d985d8a760", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1031-aws", + "version": "#33-Ubuntu SMP Fri Jun 20 18:11:07 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/results/fourierft--llama-3.2-3B-default.json b/peft/method_comparison/MetaMathQA/results/fourierft--llama-3.2-3B-default.json new file mode 100644 index 
0000000000000000000000000000000000000000..f3bc26876c01612fe60f472439e877c9c3ab1fe8 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/fourierft--llama-3.2-3B-default.json @@ -0,0 +1,354 @@ +{ + "run_info": { + "created_at": "2025-06-20T10:18:57+00:00", + "total_time": 2823.832106703994, + "experiment_name": "fourierft/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "FOURIERFT", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "n_frequency": 1000, + "scaling": 300, + "random_loc_seed": 777, + "fan_in_fan_out": false, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "bias": "none", + "modules_to_save": null, + "layers_to_transform": null, + "layers_pattern": null, + "n_frequency_pattern": {}, + "init_weights": false + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 13104129350, + "accelerator_memory_max": 23653777408, + "accelerator_memory_reserved_99th": 19017267937, + "train_time": 2424.3862988609762, + "file_size": 231416, + "num_trainable_params": 56000, + "num_total_params": 3212805824, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.3263031902313231, + "train samples": 1000, + "train time": 
53.55340486107161, + "eval time": 19.578013352002017, + "tokens / sec": 3953.4180982374883, + "mem allocated avg": 6781303625.728, + "mem reserved avg": 13152850804.736, + "elapsed time": 119.84825310099404 + }, + { + "step": 500, + "valid accuracy": 0.0, + "train loss": 1.3399862418174744, + "train samples": 2000, + "train time": 52.85717789203045, + "eval time": 19.544192551999004, + "tokens / sec": 3935.03793231005, + "mem allocated avg": 6774035257.344, + "mem reserved avg": 13043463356.416, + "elapsed time": 233.5829256769939 + }, + { + "step": 750, + "valid accuracy": 0.0, + "train loss": 1.3045952091217041, + "train samples": 3000, + "train time": 53.35706212905643, + "eval time": 19.607110917990212, + "tokens / sec": 4018.2309790861696, + "mem allocated avg": 6783920330.752, + "mem reserved avg": 13205673869.312, + "elapsed time": 348.1469791559939 + }, + { + "step": 1000, + "valid accuracy": 0.0, + "train loss": 1.3111453976631164, + "train samples": 4000, + "train time": 52.95546973698947, + "eval time": 19.472347582006478, + "tokens / sec": 3934.1733919976355, + "mem allocated avg": 6776025266.176, + "mem reserved avg": 13077269446.656, + "elapsed time": 461.81266678999236 + }, + { + "step": 1250, + "valid accuracy": 0.0, + "train loss": 1.299716483592987, + "train samples": 5000, + "train time": 52.12036712520057, + "eval time": 19.626158429004136, + "tokens / sec": 4001.0846335572023, + "mem allocated avg": 6775331573.76, + "mem reserved avg": 13063344357.376, + "elapsed time": 574.6407375999988 + }, + { + "step": 1500, + "valid accuracy": 0.0, + "train loss": 1.2867344057559966, + "train samples": 6000, + "train time": 52.594848359090975, + "eval time": 19.54386943600548, + "tokens / sec": 3980.0666135738998, + "mem allocated avg": 6776458844.16, + "mem reserved avg": 13093568512.0, + "elapsed time": 688.0431025519938 + }, + { + "step": 1750, + "valid accuracy": 0.0, + "train loss": 1.2803141210079194, + "train samples": 7000, + "train time": 
52.98738884186605, + "eval time": 19.568909612993593, + "tokens / sec": 3951.0344739725274, + "mem allocated avg": 6778496358.4, + "mem reserved avg": 13108768669.696, + "elapsed time": 801.9154772249894 + }, + { + "step": 2000, + "valid accuracy": 0.0, + "train loss": 1.2766506419181824, + "train samples": 8000, + "train time": 52.03297274692159, + "eval time": 19.525613270001486, + "tokens / sec": 3991.62279292005, + "mem allocated avg": 6774647097.344, + "mem reserved avg": 13051189264.384, + "elapsed time": 914.5343848449993 + }, + { + "step": 2250, + "valid accuracy": 0.0, + "train loss": 1.2596003375053406, + "train samples": 9000, + "train time": 53.934016149127274, + "eval time": 19.535415460006334, + "tokens / sec": 3985.388356870549, + "mem allocated avg": 6785830477.824, + "mem reserved avg": 13237223424.0, + "elapsed time": 1029.9007452719961 + }, + { + "step": 2500, + "valid accuracy": 0.0, + "train loss": 1.2684449093341827, + "train samples": 10000, + "train time": 52.006629903029534, + "eval time": 19.470633051998448, + "tokens / sec": 3960.3989026791724, + "mem allocated avg": 6771212331.008, + "mem reserved avg": 12996118052.864, + "elapsed time": 1142.5889472209965 + }, + { + "step": 2750, + "valid accuracy": 0.0, + "train loss": 1.2548872971534728, + "train samples": 11000, + "train time": 53.403087337108445, + "eval time": 19.463876378998975, + "tokens / sec": 3967.579601952513, + "mem allocated avg": 6781916252.16, + "mem reserved avg": 13168084516.864, + "elapsed time": 1257.0122518049902 + }, + { + "step": 3000, + "valid accuracy": 0.0, + "train loss": 1.253697858095169, + "train samples": 12000, + "train time": 53.20096563108382, + "eval time": 19.472515105997445, + "tokens / sec": 3923.443823321214, + "mem allocated avg": 6777045135.36, + "mem reserved avg": 13084844359.68, + "elapsed time": 1370.94780872899 + }, + { + "step": 3250, + "valid accuracy": 0.0, + "train loss": 1.248513156414032, + "train samples": 13000, + "train time": 
52.962746563891415, + "eval time": 19.54665829600708, + "tokens / sec": 3982.06312328573, + "mem allocated avg": 6779038627.84, + "mem reserved avg": 13110345728.0, + "elapsed time": 1484.7621198889974 + }, + { + "step": 3500, + "valid accuracy": 0.0, + "train loss": 1.2477959940433503, + "train samples": 14000, + "train time": 52.93443578510778, + "eval time": 19.444701158994576, + "tokens / sec": 3962.4489595298505, + "mem allocated avg": 6776803573.76, + "mem reserved avg": 13097142059.008, + "elapsed time": 1598.8772237269877 + }, + { + "step": 3750, + "valid accuracy": 0.0, + "train loss": 1.228544222354889, + "train samples": 15000, + "train time": 53.31031796212483, + "eval time": 19.472959079008433, + "tokens / sec": 4064.9354249577, + "mem allocated avg": 6788200585.216, + "mem reserved avg": 13268999471.104, + "elapsed time": 1713.6814467679942 + }, + { + "step": 4000, + "valid accuracy": 0.0, + "train loss": 1.2609001460075377, + "train samples": 16000, + "train time": 51.9827769130934, + "eval time": 19.473652824002784, + "tokens / sec": 3931.552182017475, + "mem allocated avg": 6770180233.216, + "mem reserved avg": 12983610638.336, + "elapsed time": 1826.5604049959948 + }, + { + "step": 4250, + "valid accuracy": 0.0, + "train loss": 1.227214762210846, + "train samples": 17000, + "train time": 53.09942602888623, + "eval time": 19.547112297004787, + "tokens / sec": 3981.0034836347163, + "mem allocated avg": 6779591426.048, + "mem reserved avg": 13132760088.576, + "elapsed time": 1940.5098487799987 + }, + { + "step": 4500, + "valid accuracy": 0.0, + "train loss": 1.2504195840358734, + "train samples": 18000, + "train time": 52.23909889203787, + "eval time": 19.522137050997117, + "tokens / sec": 3978.207978462565, + "mem allocated avg": 6775933241.344, + "mem reserved avg": 13056079822.848, + "elapsed time": 2053.2267840139975 + }, + { + "step": 4750, + "valid accuracy": 0.0, + "train loss": 1.2349513354301453, + "train samples": 19000, + "train time": 
53.36620609794045, + "eval time": 19.541859832999762, + "tokens / sec": 3933.931514912433, + "mem allocated avg": 6777532579.84, + "mem reserved avg": 13101604798.464, + "elapsed time": 2167.8329333979927 + }, + { + "step": 5000, + "valid accuracy": 0.0, + "train loss": 1.2480293517112733, + "train samples": 20000, + "train time": 52.46977503092785, + "eval time": 19.44991449599911, + "tokens / sec": 3969.5234042309344, + "mem allocated avg": 6773533165.568, + "mem reserved avg": 13049645760.512, + "elapsed time": 2281.220151823989 + }, + { + "step": 5000, + "test accuracy": 0.000758150113722517, + "train loss": 1.2480293517112733, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 
201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/fourierft--llama-3.2-3B-n_frequency-5000.json b/peft/method_comparison/MetaMathQA/results/fourierft--llama-3.2-3B-n_frequency-5000.json new file mode 100644 index 
0000000000000000000000000000000000000000..3c7241b5f04142fdc9bb7a3702e4d08c1b91730a --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/fourierft--llama-3.2-3B-n_frequency-5000.json @@ -0,0 +1,354 @@ +{ + "run_info": { + "created_at": "2025-06-20T09:31:48+00:00", + "total_time": 2824.376998209991, + "experiment_name": "fourierft/llama-3.2-3B-n_frequency-5000", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "FOURIERFT", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "n_frequency": 5000, + "scaling": 300, + "random_loc_seed": 777, + "fan_in_fan_out": false, + "target_modules": [ + "v_proj", + "q_proj" + ], + "exclude_modules": null, + "bias": "none", + "modules_to_save": null, + "layers_to_transform": null, + "layers_pattern": null, + "n_frequency_pattern": {}, + "init_weights": false + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 13111221498, + "accelerator_memory_max": 23681040384, + "accelerator_memory_reserved_99th": 19054869872, + "train_time": 2421.913372163006, + "file_size": 1127472, + "num_trainable_params": 280000, + "num_total_params": 3213029824, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.3800132541656494, + "train samples": 1000, + "train 
time": 53.57064967796032, + "eval time": 19.631924207002157, + "tokens / sec": 3952.1454616053315, + "mem allocated avg": 6784830552.064, + "mem reserved avg": 13158731218.944, + "elapsed time": 119.20255395398999 + }, + { + "step": 500, + "valid accuracy": 0.0, + "train loss": 1.3702282276153563, + "train samples": 2000, + "train time": 53.00863014489005, + "eval time": 19.629790833001607, + "tokens / sec": 3923.7950392508, + "mem allocated avg": 6777176354.816, + "mem reserved avg": 13048941117.44, + "elapsed time": 232.4386439989903 + }, + { + "step": 750, + "valid accuracy": 0.0, + "train loss": 1.3024170677661895, + "train samples": 3000, + "train time": 53.97298614999454, + "eval time": 19.64192995200574, + "tokens / sec": 3972.3760957780855, + "mem allocated avg": 6787548153.856, + "mem reserved avg": 13211654946.816, + "elapsed time": 346.9217278779979 + }, + { + "step": 1000, + "valid accuracy": 0.0, + "train loss": 1.2704877371788026, + "train samples": 4000, + "train time": 52.95541349705309, + "eval time": 19.62998814698949, + "tokens / sec": 3934.1775701854103, + "mem allocated avg": 6779591346.176, + "mem reserved avg": 13082126450.688, + "elapsed time": 460.14450727400254 + }, + { + "step": 1250, + "valid accuracy": 0.0, + "train loss": 1.2236453666687013, + "train samples": 5000, + "train time": 53.36593960013124, + "eval time": 19.652927816001466, + "tokens / sec": 3907.698460152047, + "mem allocated avg": 6779029788.672, + "mem reserved avg": 13073486184.448, + "elapsed time": 573.5348878969962 + }, + { + "step": 1500, + "valid accuracy": 0.0, + "train loss": 1.1792121708393097, + "train samples": 6000, + "train time": 53.3776921518147, + "eval time": 19.616937039012555, + "tokens / sec": 3921.69446750581, + "mem allocated avg": 6779851802.624, + "mem reserved avg": 13098995941.376, + "elapsed time": 686.9838123609952 + }, + { + "step": 1750, + "valid accuracy": 0.02, + "train loss": 1.1485692322254182, + "train samples": 7000, + "train time": 
53.188338823019876, + "eval time": 19.653264298991417, + "tokens / sec": 3936.1071361264494, + "mem allocated avg": 6782223466.496, + "mem reserved avg": 13116058370.048, + "elapsed time": 800.3354816049978 + }, + { + "step": 2000, + "valid accuracy": 0.06, + "train loss": 1.1230667443275453, + "train samples": 8000, + "train time": 53.074023688037414, + "eval time": 19.656479785000556, + "tokens / sec": 3913.3268135239105, + "mem allocated avg": 6778141935.616, + "mem reserved avg": 13055400345.6, + "elapsed time": 913.367253695993 + }, + { + "step": 2250, + "valid accuracy": 0.1, + "train loss": 1.094045166015625, + "train samples": 9000, + "train time": 54.34830153394432, + "eval time": 19.628162662993418, + "tokens / sec": 3955.008600696563, + "mem allocated avg": 6789509545.984, + "mem reserved avg": 13248556433.408, + "elapsed time": 1028.463336018991 + }, + { + "step": 2500, + "valid accuracy": 0.12, + "train loss": 1.077717797279358, + "train samples": 10000, + "train time": 52.1458756570355, + "eval time": 19.611369335994823, + "tokens / sec": 3949.823402231256, + "mem allocated avg": 6775024920.576, + "mem reserved avg": 13002233348.096, + "elapsed time": 1140.4990660109906 + }, + { + "step": 2750, + "valid accuracy": 0.12, + "train loss": 1.0569540388584138, + "train samples": 11000, + "train time": 53.227410834049806, + "eval time": 19.625236430001678, + "tokens / sec": 3980.6745562092756, + "mem allocated avg": 6785537161.216, + "mem reserved avg": 13177051938.816, + "elapsed time": 1254.066401210992 + }, + { + "step": 3000, + "valid accuracy": 0.12, + "train loss": 1.0361379137039184, + "train samples": 12000, + "train time": 53.65395914198598, + "eval time": 19.719437510997523, + "tokens / sec": 3890.3186892066865, + "mem allocated avg": 6780720910.336, + "mem reserved avg": 13092201168.896, + "elapsed time": 1367.8724600419955 + }, + { + "step": 3250, + "valid accuracy": 0.16, + "train loss": 1.0240549674034118, + "train samples": 13000, + "train 
time": 52.97706237102102, + "eval time": 19.7029277440015, + "tokens / sec": 3980.9870642311216, + "mem allocated avg": 6782688188.416, + "mem reserved avg": 13119816466.432, + "elapsed time": 1481.1549517469975 + }, + { + "step": 3500, + "valid accuracy": 0.18, + "train loss": 1.0098259932994842, + "train samples": 14000, + "train time": 52.869576787008555, + "eval time": 19.597270865997416, + "tokens / sec": 3967.3099870839346, + "mem allocated avg": 6780575592.448, + "mem reserved avg": 13102678540.288, + "elapsed time": 1594.3849144269916 + }, + { + "step": 3750, + "valid accuracy": 0.22, + "train loss": 0.9942408270835876, + "train samples": 15000, + "train time": 54.702630093932385, + "eval time": 19.623511597994366, + "tokens / sec": 3961.4731435744384, + "mem allocated avg": 6792074147.84, + "mem reserved avg": 13278612815.872, + "elapsed time": 1709.9712875620025 + }, + { + "step": 4000, + "valid accuracy": 0.16, + "train loss": 1.0123027296066285, + "train samples": 16000, + "train time": 52.456372838059906, + "eval time": 19.68401901901234, + "tokens / sec": 3896.056645603915, + "mem allocated avg": 6773958766.592, + "mem reserved avg": 12989172285.44, + "elapsed time": 1822.6668115109933 + }, + { + "step": 4250, + "valid accuracy": 0.24, + "train loss": 0.9849327182769776, + "train samples": 17000, + "train time": 53.25562528491719, + "eval time": 19.648335694990237, + "tokens / sec": 3969.3271625123257, + "mem allocated avg": 6783509901.312, + "mem reserved avg": 13139588415.488, + "elapsed time": 1936.0694442329986 + }, + { + "step": 4500, + "valid accuracy": 0.18, + "train loss": 0.9994378657341003, + "train samples": 18000, + "train time": 53.01732904899109, + "eval time": 19.688141086997348, + "tokens / sec": 3919.8127051621955, + "mem allocated avg": 6779470948.352, + "mem reserved avg": 13063528906.752, + "elapsed time": 2048.985867203999 + }, + { + "step": 4750, + "valid accuracy": 0.16, + "train loss": 0.9892346875667573, + "train samples": 
19000, + "train time": 53.11992502908106, + "eval time": 19.68838914000662, + "tokens / sec": 3952.1704875348883, + "mem allocated avg": 6781060145.152, + "mem reserved avg": 13109733359.616, + "elapsed time": 2162.7099456459982 + }, + { + "step": 5000, + "valid accuracy": 0.2, + "train loss": 0.9978675174713135, + "train samples": 20000, + "train time": 52.76285280592856, + "eval time": 19.634052573994268, + "tokens / sec": 3947.4741967818154, + "mem allocated avg": 6777472888.832, + "mem reserved avg": 13055861719.04, + "elapsed time": 2275.669019541994 + }, + { + "step": 5000, + "test accuracy": 0.1197877179681577, + "train loss": 0.9978675174713135, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 
8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/full-finetuning--llama-3.2-3B-lr_0.00001.json b/peft/method_comparison/MetaMathQA/results/full-finetuning--llama-3.2-3B-lr_0.00001.json new file mode 100644 index 
0000000000000000000000000000000000000000..4f15dc9eb28e5cde17637f1ebcfe00e9c01217ae --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/full-finetuning--llama-3.2-3B-lr_0.00001.json @@ -0,0 +1,331 @@ +{ + "run_info": { + "created_at": "2025-06-20T18:02:43+00:00", + "total_time": 3274.9747593409993, + "experiment_name": "full-finetuning/llama-3.2-3B-lr_0.00001", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 1e-05 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": null, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 33098872284, + "accelerator_memory_max": 37241225216, + "accelerator_memory_reserved_99th": 33573390254, + "train_time": 3111.3685010060144, + "file_size": 6425499648, + "num_trainable_params": 3212749824, + "num_total_params": 3212749824, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.3, + "train loss": 1.0749022357463838, + "train samples": 1000, + "train time": 90.81602771116013, + "eval time": 10.388541491003707, + "tokens / sec": 2331.295535996918, + "mem allocated avg": 26069449254.912, + "mem reserved avg": 33116739600.384, + "elapsed time": 162.0596859770012 + }, + { + "step": 500, + "valid accuracy": 0.4, + "train loss": 0.7238605101108551, + "train samples": 2000, + "train time": 90.41340426202805, + "eval time": 10.403155545005575, + "tokens / sec": 2300.488535938847, + "mem allocated avg": 26062513567.744, + "mem reserved avg": 33090961408.0, + "elapsed 
time": 315.86630137299653 + }, + { + "step": 750, + "valid accuracy": 0.42, + "train loss": 0.6648618497848511, + "train samples": 3000, + "train time": 91.4961106939445, + "eval time": 5.590419113999815, + "tokens / sec": 2343.27993150631, + "mem allocated avg": 26071394062.336, + "mem reserved avg": 33094367182.848, + "elapsed time": 465.79339110500587 + }, + { + "step": 1000, + "valid accuracy": 0.42, + "train loss": 0.6407654472589492, + "train samples": 4000, + "train time": 89.8546926038689, + "eval time": 10.434167744999286, + "tokens / sec": 2318.5878662838986, + "mem allocated avg": 26063373086.72, + "mem reserved avg": 33094367182.848, + "elapsed time": 618.5050604129938 + }, + { + "step": 1250, + "valid accuracy": 0.46, + "train loss": 0.6343449921607971, + "train samples": 5000, + "train time": 90.3596406209981, + "eval time": 5.810965301003307, + "tokens / sec": 2307.86663787969, + "mem allocated avg": 26063789404.16, + "mem reserved avg": 33081876545.536, + "elapsed time": 766.6042792719963 + }, + { + "step": 1500, + "valid accuracy": 0.54, + "train loss": 0.6249808443784713, + "train samples": 6000, + "train time": 90.81503154609527, + "eval time": 10.435444819988334, + "tokens / sec": 2305.025901948283, + "mem allocated avg": 26066218485.76, + "mem reserved avg": 33089409515.52, + "elapsed time": 920.292813491993 + }, + { + "step": 1750, + "valid accuracy": 0.46, + "train loss": 0.6174132014513016, + "train samples": 7000, + "train time": 90.68820026615867, + "eval time": 10.286707318999106, + "tokens / sec": 2308.5142210956765, + "mem allocated avg": 26065828059.136, + "mem reserved avg": 33101774323.712, + "elapsed time": 1073.8488811849966 + }, + { + "step": 2000, + "valid accuracy": 0.42, + "train loss": 0.618268838763237, + "train samples": 8000, + "train time": 90.44998777209548, + "eval time": 10.380125819006935, + "tokens / sec": 2296.252383398064, + "mem allocated avg": 26062920781.824, + "mem reserved avg": 33096330117.12, + "elapsed 
time": 1227.2062568730034 + }, + { + "step": 2250, + "valid accuracy": 0.5, + "train loss": 0.6107994567155838, + "train samples": 9000, + "train time": 91.58726547904371, + "eval time": 10.372407121991273, + "tokens / sec": 2346.920162707366, + "mem allocated avg": 26073357961.216, + "mem reserved avg": 33114382401.536, + "elapsed time": 1381.3805919409933 + }, + { + "step": 2500, + "valid accuracy": 0.54, + "train loss": 0.6089532144069671, + "train samples": 10000, + "train time": 89.29193754095468, + "eval time": 10.391672718993505, + "tokens / sec": 2306.6696240691504, + "mem allocated avg": 26059719045.12, + "mem reserved avg": 33086842601.472, + "elapsed time": 1533.778675338006 + }, + { + "step": 2750, + "valid accuracy": 0.52, + "train loss": 0.6020698472261429, + "train samples": 11000, + "train time": 90.41624103189679, + "eval time": 10.369720178001444, + "tokens / sec": 2343.3953632871467, + "mem allocated avg": 26070059464.704, + "mem reserved avg": 33107805732.864, + "elapsed time": 1686.671367884992 + }, + { + "step": 3000, + "valid accuracy": 0.5, + "train loss": 0.5949549045562744, + "train samples": 12000, + "train time": 90.9437831780233, + "eval time": 7.315949440002441, + "tokens / sec": 2295.165130654474, + "mem allocated avg": 26064854972.416, + "mem reserved avg": 33098074947.584, + "elapsed time": 1837.2926549609983 + }, + { + "step": 3250, + "valid accuracy": 0.48, + "train loss": 0.6066494225263596, + "train samples": 13000, + "train time": 90.87308476005273, + "eval time": 5.963120047992561, + "tokens / sec": 2320.8302057410824, + "mem allocated avg": 26066388537.344, + "mem reserved avg": 33098318217.216, + "elapsed time": 1986.6408478410012 + }, + { + "step": 3500, + "valid accuracy": 0.48, + "train loss": 0.592242598772049, + "train samples": 14000, + "train time": 90.65281462905114, + "eval time": 7.1309342330059735, + "tokens / sec": 2313.7726154261322, + "mem allocated avg": 26065652588.544, + "mem reserved avg": 33100457312.256, 
+ "elapsed time": 2137.073564691993 + }, + { + "step": 3750, + "valid accuracy": 0.48, + "train loss": 0.5925718579292297, + "train samples": 15000, + "train time": 91.80342563094746, + "eval time": 5.844810517999576, + "tokens / sec": 2360.5110431407275, + "mem allocated avg": 26075058659.328, + "mem reserved avg": 33131771985.92, + "elapsed time": 2287.0305021950044 + }, + { + "step": 4000, + "valid accuracy": 0.5, + "train loss": 0.6050453131198883, + "train samples": 16000, + "train time": 89.85742108603881, + "eval time": 5.86809825799719, + "tokens / sec": 2274.414261280792, + "mem allocated avg": 26058425257.984, + "mem reserved avg": 33098662150.144, + "elapsed time": 2435.1958582270017 + }, + { + "step": 4250, + "valid accuracy": 0.48, + "train loss": 0.5929686036109925, + "train samples": 17000, + "train time": 90.97368233802263, + "eval time": 5.8907580230006715, + "tokens / sec": 2323.6280489841133, + "mem allocated avg": 26067367372.8, + "mem reserved avg": 33099207409.664, + "elapsed time": 2584.8373482140014 + }, + { + "step": 4500, + "valid accuracy": 0.48, + "train loss": 0.6010294322967529, + "train samples": 18000, + "train time": 90.13679483698797, + "eval time": 6.106882603999111, + "tokens / sec": 2305.5845326632484, + "mem allocated avg": 26064599832.576, + "mem reserved avg": 33092253253.632, + "elapsed time": 2733.494644669001 + }, + { + "step": 4750, + "valid accuracy": 0.5, + "train loss": 0.5936577550172806, + "train samples": 19000, + "train time": 90.74229130300228, + "eval time": 5.885364143003244, + "tokens / sec": 2313.5739354319567, + "mem allocated avg": 26065537388.544, + "mem reserved avg": 33100717359.104, + "elapsed time": 2882.6415541759925 + }, + { + "step": 5000, + "valid accuracy": 0.5, + "train loss": 0.5987544150352478, + "train samples": 20000, + "train time": 90.54863398504676, + "eval time": 5.88336711798911, + "tokens / sec": 2300.2003545895063, + "mem allocated avg": 26062803286.016, + "mem reserved avg": 
33083126448.128, + "elapsed time": 3031.523533478001 + }, + { + "step": 5000, + "test accuracy": 0.5003790750568613, + "train loss": 0.5987544150352478, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/ia3--llama-3.2-3B-default.json b/peft/method_comparison/MetaMathQA/results/ia3--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..50d2efa1f91faafd63fdf9ab606668fd2c40c3c6 --- /dev/null +++ 
b/peft/method_comparison/MetaMathQA/results/ia3--llama-3.2-3B-default.json @@ -0,0 +1,351 @@ +{ + "run_info": { + "created_at": "2025-06-19T21:59:33+00:00", + "total_time": 2004.8640038169979, + "experiment_name": "ia3/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "IA3", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "target_modules": [ + "down_proj", + "v_proj", + "k_proj" + ], + "exclude_modules": null, + "feedforward_modules": [ + "down_proj" + ], + "fan_in_fan_out": false, + "modules_to_save": null, + "init_ia3_weights": true + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 12023227429, + "accelerator_memory_max": 23137878016, + "accelerator_memory_reserved_99th": 18398566154, + "train_time": 1782.9318781230104, + "file_size": 1157064, + "num_trainable_params": 286720, + "num_total_params": 3213036544, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.3155810165405273, + "train samples": 1000, + "train time": 30.56459548201383, + "eval time": 10.972947114001727, + "tokens / sec": 6926.936105684404, + "mem allocated avg": 6780994971.648, + "mem reserved avg": 12076433014.784, + "elapsed time": 90.53726772200025 + }, + { + "step": 500, + "valid 
accuracy": 0.0, + "train loss": 1.205229633808136, + "train samples": 2000, + "train time": 30.221456803970796, + "eval time": 10.954313254995213, + "tokens / sec": 6882.361805029583, + "mem allocated avg": 6773721065.472, + "mem reserved avg": 11963673346.048, + "elapsed time": 175.07058417100052 + }, + { + "step": 750, + "valid accuracy": 0.1, + "train loss": 1.0194582087993622, + "train samples": 3000, + "train time": 30.774312397006724, + "eval time": 10.944943730006344, + "tokens / sec": 6966.881899231445, + "mem allocated avg": 6784231882.752, + "mem reserved avg": 12126680776.704, + "elapsed time": 260.540154495 + }, + { + "step": 1000, + "valid accuracy": 0.24, + "train loss": 0.9196457831859589, + "train samples": 4000, + "train time": 30.61534244806535, + "eval time": 10.960088267995161, + "tokens / sec": 6804.95409624808, + "mem allocated avg": 6775492155.392, + "mem reserved avg": 11986893012.992, + "elapsed time": 345.30987053900026 + }, + { + "step": 1250, + "valid accuracy": 0.32, + "train loss": 0.8685842225551605, + "train samples": 5000, + "train time": 29.97266351111466, + "eval time": 10.924794500999269, + "tokens / sec": 6957.606551138459, + "mem allocated avg": 6775089207.296, + "mem reserved avg": 11983428517.888, + "elapsed time": 429.5542291879974 + }, + { + "step": 1500, + "valid accuracy": 0.32, + "train loss": 0.8332846148014068, + "train samples": 6000, + "train time": 29.98314001694962, + "eval time": 10.942266878999362, + "tokens / sec": 6981.6236685572, + "mem allocated avg": 6776724867.072, + "mem reserved avg": 12008594341.888, + "elapsed time": 513.8152235820016 + }, + { + "step": 1750, + "valid accuracy": 0.32, + "train loss": 0.8169269208908081, + "train samples": 7000, + "train time": 30.245623568014707, + "eval time": 10.940915298000618, + "tokens / sec": 6921.8278647558345, + "mem allocated avg": 6777912934.4, + "mem reserved avg": 12032065667.072, + "elapsed time": 598.2868188970024 + }, + { + "step": 2000, + "valid 
accuracy": 0.32, + "train loss": 0.8072074156999588, + "train samples": 8000, + "train time": 30.292844633964705, + "eval time": 10.95617212200159, + "tokens / sec": 6856.272578875894, + "mem allocated avg": 6775099170.816, + "mem reserved avg": 11967473385.472, + "elapsed time": 682.7948923380027 + }, + { + "step": 2250, + "valid accuracy": 0.32, + "train loss": 0.7952859619855881, + "train samples": 9000, + "train time": 31.20892413101683, + "eval time": 10.942549917002907, + "tokens / sec": 6887.388975590319, + "mem allocated avg": 6786161477.632, + "mem reserved avg": 12167709458.432, + "elapsed time": 768.9645714229991 + }, + { + "step": 2500, + "valid accuracy": 0.28, + "train loss": 0.7890167078971863, + "train samples": 10000, + "train time": 30.187670495011844, + "eval time": 10.954304017002869, + "tokens / sec": 6822.884860692832, + "mem allocated avg": 6771082014.72, + "mem reserved avg": 11910984499.2, + "elapsed time": 853.427360558002 + }, + { + "step": 2750, + "valid accuracy": 0.3, + "train loss": 0.7823473591804504, + "train samples": 11000, + "train time": 30.410061570059042, + "eval time": 10.93302121299348, + "tokens / sec": 6967.4636965751015, + "mem allocated avg": 6782254225.408, + "mem reserved avg": 12090903363.584, + "elapsed time": 938.3584665200033 + }, + { + "step": 3000, + "valid accuracy": 0.24, + "train loss": 0.7709820411205291, + "train samples": 12000, + "train time": 30.02989622000314, + "eval time": 10.940404225999373, + "tokens / sec": 6950.773271769175, + "mem allocated avg": 6776725577.728, + "mem reserved avg": 12003133358.08, + "elapsed time": 1022.4627897890023 + }, + { + "step": 3250, + "valid accuracy": 0.3, + "train loss": 0.7755767168998718, + "train samples": 13000, + "train time": 30.172652364024543, + "eval time": 10.940153044000908, + "tokens / sec": 6989.806446431653, + "mem allocated avg": 6778589339.648, + "mem reserved avg": 12038298402.816, + "elapsed time": 1107.0076802080002 + }, + { + "step": 3500, + "valid 
accuracy": 0.34, + "train loss": 0.7658302361965179, + "train samples": 14000, + "train time": 30.384311634006735, + "eval time": 10.941136569999799, + "tokens / sec": 6903.233567590308, + "mem allocated avg": 6777534660.608, + "mem reserved avg": 12020623605.76, + "elapsed time": 1191.893303306002 + }, + { + "step": 3750, + "valid accuracy": 0.34, + "train loss": 0.7585167481899261, + "train samples": 15000, + "train time": 31.250990667955193, + "eval time": 10.924158087997057, + "tokens / sec": 6934.276173913666, + "mem allocated avg": 6788426940.416, + "mem reserved avg": 12209652498.432, + "elapsed time": 1278.4574160839984 + }, + { + "step": 4000, + "valid accuracy": 0.26, + "train loss": 0.7766438691616059, + "train samples": 16000, + "train time": 30.222231689898763, + "eval time": 10.98030305699649, + "tokens / sec": 6762.339793335249, + "mem allocated avg": 6769563977.728, + "mem reserved avg": 11885533462.528, + "elapsed time": 1362.9405450319973 + }, + { + "step": 4250, + "valid accuracy": 0.34, + "train loss": 0.7542061095237732, + "train samples": 17000, + "train time": 30.273203028933494, + "eval time": 10.948997009996674, + "tokens / sec": 6982.710081849145, + "mem allocated avg": 6780103426.048, + "mem reserved avg": 12047483928.576, + "elapsed time": 1447.661586811002 + }, + { + "step": 4500, + "valid accuracy": 0.32, + "train loss": 0.7659628703594208, + "train samples": 18000, + "train time": 29.84466753601737, + "eval time": 10.942651322002348, + "tokens / sec": 6963.320993581165, + "mem allocated avg": 6775043430.4, + "mem reserved avg": 11968387743.744, + "elapsed time": 1531.5572027719973 + }, + { + "step": 4750, + "valid accuracy": 0.28, + "train loss": 0.7580052223205567, + "train samples": 19000, + "train time": 30.03731635398435, + "eval time": 10.927273799999966, + "tokens / sec": 6989.272860661278, + "mem allocated avg": 6776962899.968, + "mem reserved avg": 12017695981.568, + "elapsed time": 1615.9832882379997 + }, + { + "step": 5000, 
+ "valid accuracy": 0.36, + "train loss": 0.7657463653087616, + "train samples": 20000, + "train time": 30.07570726004633, + "eval time": 10.953207714999735, + "tokens / sec": 6925.19042691597, + "mem allocated avg": 6774270615.552, + "mem reserved avg": 11958900228.096, + "elapsed time": 1700.4354192270039 + }, + { + "step": 5000, + "test accuracy": 0.34495830174374525, + "train loss": 0.7657463653087616, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/ia3--llama-3.2-3B-lr_0.001.json b/peft/method_comparison/MetaMathQA/results/ia3--llama-3.2-3B-lr_0.001.json new file mode 100644 index 0000000000000000000000000000000000000000..f1982e8ef38015c53cc79f7f134677ee584bdfeb --- /dev/null +++ 
b/peft/method_comparison/MetaMathQA/results/ia3--llama-3.2-3B-lr_0.001.json @@ -0,0 +1,350 @@ +{ + "run_info": { + "created_at": "2025-06-19T21:27:27+00:00", + "total_time": 1921.5641919770023, + "experiment_name": "ia3/llama-3.2-3B-lr_0.001", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.001 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "IA3", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "target_modules": [ + "k_proj", + "down_proj", + "v_proj" + ], + "exclude_modules": null, + "feedforward_modules": [ + "down_proj" + ], + "fan_in_fan_out": false, + "modules_to_save": null, + "init_ia3_weights": true + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 12023331867, + "accelerator_memory_max": 23135780864, + "accelerator_memory_reserved_99th": 18398356439, + "train_time": 1746.0246657649877, + "file_size": 1157064, + "num_trainable_params": 286720, + "num_total_params": 3213036544, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.18, + "train loss": 1.1670710837841034, + "train samples": 1000, + "train time": 30.829080988976784, + "eval time": 10.962777282999014, + "tokens / sec": 6867.509286952213, + "mem allocated avg": 6781095491.584, + "mem reserved avg": 12075594153.984, + "elapsed time": 91.04478788100096 + }, + { + "step": 500, + "valid accuracy": 0.34, + 
"train loss": 0.8285422480106354, + "train samples": 2000, + "train time": 30.237734625952726, + "eval time": 10.93798775599862, + "tokens / sec": 6878.656836331916, + "mem allocated avg": 6773575256.064, + "mem reserved avg": 11961039323.136, + "elapsed time": 175.57074494799963 + }, + { + "step": 750, + "valid accuracy": 0.34, + "train loss": 0.7387537934780121, + "train samples": 3000, + "train time": 30.784141963005823, + "eval time": 10.918857135002327, + "tokens / sec": 6964.657330961239, + "mem allocated avg": 6784163356.672, + "mem reserved avg": 12124793339.904, + "elapsed time": 261.120397177001 + }, + { + "step": 1000, + "valid accuracy": 0.36, + "train loss": 0.7030822492837906, + "train samples": 4000, + "train time": 30.625773959025537, + "eval time": 6.545184372997028, + "tokens / sec": 6802.636246147914, + "mem allocated avg": 6775321157.632, + "mem reserved avg": 11986549080.064, + "elapsed time": 341.78445690100125 + }, + { + "step": 1250, + "valid accuracy": 0.34, + "train loss": 0.6953592277765274, + "train samples": 5000, + "train time": 30.090904191973095, + "eval time": 7.180137749001005, + "tokens / sec": 6930.266989305977, + "mem allocated avg": 6774968741.888, + "mem reserved avg": 11983218802.688, + "elapsed time": 422.45400445199994 + }, + { + "step": 1500, + "valid accuracy": 0.34, + "train loss": 0.6861299908161164, + "train samples": 6000, + "train time": 30.086008766014857, + "eval time": 10.923475695002708, + "tokens / sec": 6957.75241003254, + "mem allocated avg": 6776914077.696, + "mem reserved avg": 12007201832.96, + "elapsed time": 506.8615667560007 + }, + { + "step": 1750, + "valid accuracy": 0.34, + "train loss": 0.6775313948392868, + "train samples": 7000, + "train time": 30.329398032976314, + "eval time": 7.039293795001868, + "tokens / sec": 6902.708710946855, + "mem allocated avg": 6778176180.224, + "mem reserved avg": 12032417988.608, + "elapsed time": 587.730657346001 + }, + { + "step": 2000, + "valid accuracy": 0.36, + 
"train loss": 0.6783386437892914, + "train samples": 8000, + "train time": 30.340071335995162, + "eval time": 8.14293124300093, + "tokens / sec": 6845.600252547578, + "mem allocated avg": 6775202904.064, + "mem reserved avg": 11967733432.32, + "elapsed time": 669.6239550099999 + }, + { + "step": 2250, + "valid accuracy": 0.5, + "train loss": 0.6720720986127854, + "train samples": 9000, + "train time": 31.104124111985584, + "eval time": 7.4280358140022145, + "tokens / sec": 6910.594853149151, + "mem allocated avg": 6785809762.304, + "mem reserved avg": 12167885619.2, + "elapsed time": 752.2532132060005 + }, + { + "step": 2500, + "valid accuracy": 0.46, + "train loss": 0.6705386472940444, + "train samples": 10000, + "train time": 30.09476044199255, + "eval time": 7.5499184540021815, + "tokens / sec": 6843.948812850663, + "mem allocated avg": 6770963554.304, + "mem reserved avg": 11912058241.024, + "elapsed time": 833.2611769709983 + }, + { + "step": 2750, + "valid accuracy": 0.48, + "train loss": 0.6631126835346222, + "train samples": 11000, + "train time": 30.640666239018174, + "eval time": 10.92325482400338, + "tokens / sec": 6915.025879241109, + "mem allocated avg": 6781913962.496, + "mem reserved avg": 12090299383.808, + "elapsed time": 918.4276470139994 + }, + { + "step": 3000, + "valid accuracy": 0.38, + "train loss": 0.6557366658449173, + "train samples": 12000, + "train time": 30.612569437977072, + "eval time": 10.933225860997482, + "tokens / sec": 6818.473712992361, + "mem allocated avg": 6776591689.728, + "mem reserved avg": 12003032694.784, + "elapsed time": 1003.438990486 + }, + { + "step": 3250, + "valid accuracy": 0.44, + "train loss": 0.6655691808462143, + "train samples": 13000, + "train time": 30.508301533980557, + "eval time": 7.2082155700009025, + "tokens / sec": 6912.905320707402, + "mem allocated avg": 6778600480.768, + "mem reserved avg": 12040143896.576, + "elapsed time": 1084.7670880670012 + }, + { + "step": 3500, + "valid accuracy": 0.46, + 
"train loss": 0.6528272937536239, + "train samples": 14000, + "train time": 30.571383574966603, + "eval time": 7.452295711998886, + "tokens / sec": 6860.991406740058, + "mem allocated avg": 6777338779.648, + "mem reserved avg": 12021227585.536, + "elapsed time": 1166.2843480039992 + }, + { + "step": 3750, + "valid accuracy": 0.48, + "train loss": 0.6513591132164002, + "train samples": 15000, + "train time": 31.176262214954477, + "eval time": 6.50122426100279, + "tokens / sec": 6950.897400909496, + "mem allocated avg": 6788519866.368, + "mem reserved avg": 12209543446.528, + "elapsed time": 1248.1537826940003 + }, + { + "step": 4000, + "valid accuracy": 0.42, + "train loss": 0.6660103598833084, + "train samples": 16000, + "train time": 30.1621740100818, + "eval time": 10.007692241000768, + "tokens / sec": 6775.804686084222, + "mem allocated avg": 6769538811.904, + "mem reserved avg": 11886321991.68, + "elapsed time": 1331.4140659110017 + }, + { + "step": 4250, + "valid accuracy": 0.4, + "train loss": 0.648773505806923, + "train samples": 17000, + "train time": 30.627343150990782, + "eval time": 9.851157391000015, + "tokens / sec": 6901.969882201866, + "mem allocated avg": 6780366684.16, + "mem reserved avg": 12050411552.768, + "elapsed time": 1415.4855422520013 + }, + { + "step": 4500, + "valid accuracy": 0.42, + "train loss": 0.6574939725399017, + "train samples": 18000, + "train time": 30.04905582394713, + "eval time": 6.792122120001295, + "tokens / sec": 6915.957733167199, + "mem allocated avg": 6775072815.104, + "mem reserved avg": 11969042055.168, + "elapsed time": 1495.5897211369993 + }, + { + "step": 4750, + "valid accuracy": 0.4, + "train loss": 0.6505398267507553, + "train samples": 19000, + "train time": 30.326544256924535, + "eval time": 7.6139581239986, + "tokens / sec": 6922.615324100572, + "mem allocated avg": 6777039572.992, + "mem reserved avg": 12019256262.656, + "elapsed time": 1577.114852814997 + }, + { + "step": 5000, + "valid accuracy": 0.42, + 
"train loss": 0.6568749620914459, + "train samples": 20000, + "train time": 30.342653310064634, + "eval time": 6.5661308569979155, + "tokens / sec": 6864.264567492972, + "mem allocated avg": 6774530805.76, + "mem reserved avg": 11958866673.664, + "elapsed time": 1657.5746541439985 + }, + { + "step": 5000, + "test accuracy": 0.41243366186504926, + "train loss": 0.6568749620914459, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/ln_tuning--llama-3.2-3B-default.json b/peft/method_comparison/MetaMathQA/results/ln_tuning--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..2e8dabe44e54439aad25b203985f6b693c33004a --- 
/dev/null +++ b/peft/method_comparison/MetaMathQA/results/ln_tuning--llama-3.2-3B-default.json @@ -0,0 +1,346 @@ +{ + "run_info": { + "created_at": "2025-06-20T11:06:05+00:00", + "total_time": 1870.2496634349955, + "experiment_name": "ln_tuning/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "LN_TUNING", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "target_modules": [ + "input_layernorm", + "norm", + "post_attention_layernorm" + ], + "exclude_modules": null, + "modules_to_save": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 11385589622, + "accelerator_memory_max": 21177040896, + "accelerator_memory_reserved_99th": 16903066091, + "train_time": 1657.2412179829698, + "file_size": 358288, + "num_trainable_params": 175104, + "num_total_params": 3212924928, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.3265725662708283, + "train samples": 1000, + "train time": 27.216289801202947, + "eval time": 10.492610957997385, + "tokens / sec": 7779.127924726981, + "mem allocated avg": 6780187711.488, + "mem reserved avg": 11433404268.544, + "elapsed time": 87.52968039299594 + }, + { + "step": 500, + "valid accuracy": 0.0, + "train loss": 
1.3411514971256255, + "train samples": 2000, + "train time": 26.650248568999814, + "eval time": 10.469055254012346, + "tokens / sec": 7804.617636547848, + "mem allocated avg": 6772587255.808, + "mem reserved avg": 11331533012.992, + "elapsed time": 165.70980707599665 + }, + { + "step": 750, + "valid accuracy": 0.0, + "train loss": 1.2789560747146607, + "train samples": 3000, + "train time": 27.327283490114496, + "eval time": 10.448594682005933, + "tokens / sec": 7845.6755526965735, + "mem allocated avg": 6783227031.552, + "mem reserved avg": 11478560145.408, + "elapsed time": 245.043655957008 + }, + { + "step": 1000, + "valid accuracy": 0.0, + "train loss": 1.263298665046692, + "train samples": 4000, + "train time": 26.423721938888775, + "eval time": 10.48604148600134, + "tokens / sec": 7884.430531089723, + "mem allocated avg": 6773840521.216, + "mem reserved avg": 11353729269.76, + "elapsed time": 322.872066240001 + }, + { + "step": 1250, + "valid accuracy": 0.0, + "train loss": 1.2484543447494507, + "train samples": 5000, + "train time": 26.471019316944876, + "eval time": 10.449013539997395, + "tokens / sec": 7877.973926999808, + "mem allocated avg": 6774061203.456, + "mem reserved avg": 11347018383.36, + "elapsed time": 400.9208664790058 + }, + { + "step": 1500, + "valid accuracy": 0.0, + "train loss": 1.2315508608818053, + "train samples": 6000, + "train time": 26.742762298934394, + "eval time": 10.483759973009, + "tokens / sec": 7827.575837531978, + "mem allocated avg": 6775242500.096, + "mem reserved avg": 11381990490.112, + "elapsed time": 479.3449222920026 + }, + { + "step": 1750, + "valid accuracy": 0.0, + "train loss": 1.2309930021762847, + "train samples": 7000, + "train time": 26.920282723149285, + "eval time": 10.450218685000436, + "tokens / sec": 7776.849974163588, + "mem allocated avg": 6777141585.92, + "mem reserved avg": 11390496538.624, + "elapsed time": 558.0740258180012 + }, + { + "step": 2000, + "valid accuracy": 0.0, + "train loss": 
1.2312077372074126, + "train samples": 8000, + "train time": 26.66103798917902, + "eval time": 10.460793553007534, + "tokens / sec": 7790.244328982917, + "mem allocated avg": 6774060457.984, + "mem reserved avg": 11331650453.504, + "elapsed time": 636.1544087350048 + }, + { + "step": 2250, + "valid accuracy": 0.0, + "train loss": 1.2250888612270354, + "train samples": 9000, + "train time": 27.347798306160257, + "eval time": 10.455188547988655, + "tokens / sec": 7859.791768011602, + "mem allocated avg": 6784883898.368, + "mem reserved avg": 11515327414.272, + "elapsed time": 715.6408126690076 + }, + { + "step": 2500, + "valid accuracy": 0.0, + "train loss": 1.237301394701004, + "train samples": 10000, + "train time": 26.43946731692995, + "eval time": 10.463690039992798, + "tokens / sec": 7790.134253881636, + "mem allocated avg": 6770695682.048, + "mem reserved avg": 11285622161.408, + "elapsed time": 793.5970988850022 + }, + { + "step": 2750, + "valid accuracy": 0.0, + "train loss": 1.2318837890625, + "train samples": 11000, + "train time": 27.072892207099358, + "eval time": 10.45099154500349, + "tokens / sec": 7826.3156510643585, + "mem allocated avg": 6780353579.008, + "mem reserved avg": 11444477231.104, + "elapsed time": 872.6363771199976 + }, + { + "step": 3000, + "valid accuracy": 0.0, + "train loss": 1.2326687624454498, + "train samples": 12000, + "train time": 26.88052615702327, + "eval time": 10.466728950996185, + "tokens / sec": 7765.138181473555, + "mem allocated avg": 6776020297.728, + "mem reserved avg": 11370128998.4, + "elapsed time": 951.0341247300094 + }, + { + "step": 3250, + "valid accuracy": 0.0, + "train loss": 1.2315667741298675, + "train samples": 13000, + "train time": 26.58970486979524, + "eval time": 10.440216451999731, + "tokens / sec": 7931.678859646707, + "mem allocated avg": 6777846503.424, + "mem reserved avg": 11400059551.744, + "elapsed time": 1029.1816804429982 + }, + { + "step": 3500, + "valid accuracy": 0.0, + "train loss": 
1.232551732301712, + "train samples": 14000, + "train time": 26.459182894948754, + "eval time": 10.444537474992103, + "tokens / sec": 7927.304514004579, + "mem allocated avg": 6776805982.208, + "mem reserved avg": 11380858028.032, + "elapsed time": 1107.2912037770002 + }, + { + "step": 3750, + "valid accuracy": 0.0, + "train loss": 1.2162783181667327, + "train samples": 15000, + "train time": 27.070398101161118, + "eval time": 10.439593077011523, + "tokens / sec": 8005.164873829656, + "mem allocated avg": 6786829993.984, + "mem reserved avg": 11549276110.848, + "elapsed time": 1186.5560989549995 + }, + { + "step": 4000, + "valid accuracy": 0.0, + "train loss": 1.2475486118793488, + "train samples": 16000, + "train time": 26.172411711973837, + "eval time": 10.464052501003607, + "tokens / sec": 7808.71867098513, + "mem allocated avg": 6768875591.68, + "mem reserved avg": 11260808658.944, + "elapsed time": 1263.9761855469987 + }, + { + "step": 4250, + "valid accuracy": 0.0, + "train loss": 1.2161538779735566, + "train samples": 17000, + "train time": 26.80681787095091, + "eval time": 10.449379954006872, + "tokens / sec": 7885.643160543526, + "mem allocated avg": 6779425828.864, + "mem reserved avg": 11415653974.016, + "elapsed time": 1342.6529225870036 + }, + { + "step": 4500, + "valid accuracy": 0.0, + "train loss": 1.2418145356178283, + "train samples": 18000, + "train time": 26.542597533072694, + "eval time": 10.46835913900577, + "tokens / sec": 7829.602952049208, + "mem allocated avg": 6773693413.376, + "mem reserved avg": 11331625287.68, + "elapsed time": 1420.7107766840054 + }, + { + "step": 4750, + "valid accuracy": 0.0, + "train loss": 1.2255646660327912, + "train samples": 19000, + "train time": 26.923357297797338, + "eval time": 10.45829652599059, + "tokens / sec": 7797.653081593045, + "mem allocated avg": 6775938277.376, + "mem reserved avg": 11381587836.928, + "elapsed time": 1499.7020156110084 + }, + { + "step": 5000, + "valid accuracy": 0.0, + "train 
loss": 1.2370348122119903, + "train samples": 20000, + "train time": 26.415459764844854, + "eval time": 10.446229163004318, + "tokens / sec": 7884.776636641793, + "mem allocated avg": 6773129859.072, + "mem reserved avg": 11327984631.808, + "elapsed time": 1577.772203030996 + }, + { + "step": 5000, + "test accuracy": 0.0, + "train loss": 1.2370348122119903, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/loha--llama-3.2-3B-rank32.json b/peft/method_comparison/MetaMathQA/results/loha--llama-3.2-3B-rank32.json new file mode 100644 index 0000000000000000000000000000000000000000..7baa8eb00fbb4baea00d55b0bd56b06f286ec04d --- /dev/null +++ 
b/peft/method_comparison/MetaMathQA/results/loha--llama-3.2-3B-rank32.json @@ -0,0 +1,355 @@ +{ + "run_info": { + "created_at": "2025-06-19T16:12:05+00:00", + "total_time": 2590.9341236870005, + "experiment_name": "loha/llama-3.2-3B-rank32", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "LOHA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "rank_pattern": {}, + "alpha_pattern": {}, + "r": 32, + "alpha": 64, + "rank_dropout": 0.0, + "module_dropout": 0.0, + "use_effective_conv2d": false, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "init_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "modules_to_save": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 13446820344, + "accelerator_memory_max": 23886561280, + "accelerator_memory_reserved_99th": 19247870771, + "train_time": 2340.7451966560056, + "file_size": 73429560, + "num_trainable_params": 18350080, + "num_total_params": 3231099904, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.2914833688735963, + "train samples": 1000, + "train time": 47.4107696449737, + "eval time": 14.298813604000316, + "tokens / sec": 4465.630943885038, + "mem allocated avg": 
7073903032.32, + "mem reserved avg": 13501707845.632, + "elapsed time": 120.40146815400112 + }, + { + "step": 500, + "valid accuracy": 0.36, + "train loss": 0.9051123185157776, + "train samples": 2000, + "train time": 47.1910586300055, + "eval time": 14.155041256999539, + "tokens / sec": 4407.508668766131, + "mem allocated avg": 7065529796.608, + "mem reserved avg": 13391154380.8, + "elapsed time": 234.63223427900084 + }, + { + "step": 750, + "valid accuracy": 0.34, + "train loss": 0.7515897085666656, + "train samples": 3000, + "train time": 48.203471163995346, + "eval time": 14.26281827999992, + "tokens / sec": 4447.8332124791605, + "mem allocated avg": 7076336949.248, + "mem reserved avg": 13550454046.72, + "elapsed time": 350.3563782780002 + }, + { + "step": 1000, + "valid accuracy": 0.4, + "train loss": 0.7082941273450851, + "train samples": 4000, + "train time": 47.1758063940124, + "eval time": 14.148222272999192, + "tokens / sec": 4416.161925457669, + "mem allocated avg": 7067704416.256, + "mem reserved avg": 13415607173.12, + "elapsed time": 464.4216197680016 + }, + { + "step": 1250, + "valid accuracy": 0.32, + "train loss": 0.6994056793451309, + "train samples": 5000, + "train time": 47.32811543400385, + "eval time": 14.265782994998517, + "tokens / sec": 4406.218123998481, + "mem allocated avg": 7067674988.544, + "mem reserved avg": 13411052158.976, + "elapsed time": 578.864341718001 + }, + { + "step": 1500, + "valid accuracy": 0.38, + "train loss": 0.6889224811792374, + "train samples": 6000, + "train time": 47.48961014100678, + "eval time": 9.485757000999001, + "tokens / sec": 4407.932585221307, + "mem allocated avg": 7068496666.624, + "mem reserved avg": 13434196328.448, + "elapsed time": 688.8339757740014 + }, + { + "step": 1750, + "valid accuracy": 0.36, + "train loss": 0.6795901688337326, + "train samples": 7000, + "train time": 47.5112849769921, + "eval time": 8.524607335999463, + "tokens / sec": 4406.426812101222, + "mem allocated avg": 
7070726457.344, + "mem reserved avg": 13451493638.144, + "elapsed time": 797.8088079910012 + }, + { + "step": 2000, + "valid accuracy": 0.42, + "train loss": 0.680127969622612, + "train samples": 8000, + "train time": 47.15311444799954, + "eval time": 14.09636382700046, + "tokens / sec": 4404.714353047605, + "mem allocated avg": 7067623004.16, + "mem reserved avg": 13389174669.312, + "elapsed time": 911.8939753530012 + }, + { + "step": 2250, + "valid accuracy": 0.42, + "train loss": 0.6731046036481857, + "train samples": 9000, + "train time": 48.44002798400652, + "eval time": 14.30888277199847, + "tokens / sec": 4437.404538060332, + "mem allocated avg": 7078766321.664, + "mem reserved avg": 13582146207.744, + "elapsed time": 1028.5093922580018 + }, + { + "step": 2500, + "valid accuracy": 0.42, + "train loss": 0.6711453741788864, + "train samples": 10000, + "train time": 46.86391301901131, + "eval time": 8.751619284999833, + "tokens / sec": 4395.002182520381, + "mem allocated avg": 7063469082.624, + "mem reserved avg": 13336376770.56, + "elapsed time": 1137.0856770440005 + }, + { + "step": 2750, + "valid accuracy": 0.44, + "train loss": 0.6645345565080643, + "train samples": 11000, + "train time": 47.92562343400823, + "eval time": 7.835686906000774, + "tokens / sec": 4421.037950434847, + "mem allocated avg": 7074535438.336, + "mem reserved avg": 13512352989.184, + "elapsed time": 1246.1237790820014 + }, + { + "step": 3000, + "valid accuracy": 0.4, + "train loss": 0.6566170369386672, + "train samples": 12000, + "train time": 47.50991778100797, + "eval time": 14.152554526999666, + "tokens / sec": 4393.419516365485, + "mem allocated avg": 7068629661.696, + "mem reserved avg": 13428215250.944, + "elapsed time": 1360.8028930970013 + }, + { + "step": 3250, + "valid accuracy": 0.42, + "train loss": 0.6667062133550644, + "train samples": 13000, + "train time": 47.62723316902702, + "eval time": 14.332656014001259, + "tokens / sec": 4428.1598146069355, + "mem allocated avg": 
7071043653.632, + "mem reserved avg": 13457114005.504, + "elapsed time": 1476.0946507730005 + }, + { + "step": 3500, + "valid accuracy": 0.42, + "train loss": 0.6537795497179031, + "train samples": 14000, + "train time": 47.07006615899445, + "eval time": 14.135684340000807, + "tokens / sec": 4456.12290604184, + "mem allocated avg": 7069669969.92, + "mem reserved avg": 13439749586.944, + "elapsed time": 1590.4238928290015 + }, + { + "step": 3750, + "valid accuracy": 0.46, + "train loss": 0.6509792991876602, + "train samples": 15000, + "train time": 48.58318820000204, + "eval time": 14.298812560000442, + "tokens / sec": 4460.452432802484, + "mem allocated avg": 7081669246.976, + "mem reserved avg": 13624240242.688, + "elapsed time": 1707.3096692510007 + }, + { + "step": 4000, + "valid accuracy": 0.46, + "train loss": 0.6675102390050888, + "train samples": 16000, + "train time": 46.83876558602242, + "eval time": 14.188353157000165, + "tokens / sec": 4363.330191199334, + "mem allocated avg": 7062227976.192, + "mem reserved avg": 13316957143.04, + "elapsed time": 1821.413719397 + }, + { + "step": 4250, + "valid accuracy": 0.46, + "train loss": 0.6494157313108444, + "train samples": 17000, + "train time": 46.9989987980116, + "eval time": 8.258924301999286, + "tokens / sec": 4497.7341093688, + "mem allocated avg": 7072862310.4, + "mem reserved avg": 13470619664.384, + "elapsed time": 1930.0706906220003 + }, + { + "step": 4500, + "valid accuracy": 0.44, + "train loss": 0.6580193819999695, + "train samples": 18000, + "train time": 47.171681194990015, + "eval time": 9.717189478000364, + "tokens / sec": 4405.566957449713, + "mem allocated avg": 7068038127.616, + "mem reserved avg": 13393654185.984, + "elapsed time": 2040.1967968460012 + }, + { + "step": 4750, + "valid accuracy": 0.48, + "train loss": 0.6511869616508484, + "train samples": 19000, + "train time": 47.517527918005726, + "eval time": 14.28858694399969, + "tokens / sec": 4418.138089217562, + "mem allocated avg": 
7069871403.008, + "mem reserved avg": 13443927113.728, + "elapsed time": 2155.4679406510004 + }, + { + "step": 5000, + "valid accuracy": 0.46, + "train loss": 0.6569721374511719, + "train samples": 20000, + "train time": 46.99870921700858, + "eval time": 9.378413720998651, + "tokens / sec": 4431.6110691104805, + "mem allocated avg": 7066192863.232, + "mem reserved avg": 13386213490.688, + "elapsed time": 2265.1425104650007 + }, + { + "step": 5000, + "test accuracy": 0.4184988627748294, + "train loss": 0.6569721374511719, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/lokr--llama-3.2-3B-rank32.json b/peft/method_comparison/MetaMathQA/results/lokr--llama-3.2-3B-rank32.json new file mode 100644 index 0000000000000000000000000000000000000000..07ae5b1b5a95ddd9a199e1bc5e3f5d6c01b2557f --- /dev/null +++ 
b/peft/method_comparison/MetaMathQA/results/lokr--llama-3.2-3B-rank32.json @@ -0,0 +1,358 @@ +{ + "run_info": { + "created_at": "2025-06-19T22:33:02+00:00", + "total_time": 2351.995087948999, + "experiment_name": "lokr/llama-3.2-3B-rank32", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "LOKR", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "rank_pattern": {}, + "alpha_pattern": {}, + "r": 32, + "alpha": 64, + "rank_dropout": 0.0, + "module_dropout": 0.0, + "use_effective_conv2d": false, + "decompose_both": false, + "decompose_factor": -1, + "rank_dropout_scale": false, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "init_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "modules_to_save": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 13173683073, + "accelerator_memory_max": 23565697024, + "accelerator_memory_reserved_99th": 18987698094, + "train_time": 2152.0406475960117, + "file_size": 1131984, + "num_trainable_params": 279552, + "num_total_params": 3213029376, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.2610720434188842, + "train samples": 1000, + "train time": 43.70352009194903, + "eval time": 
12.492729608995432, + "tokens / sec": 4844.438149479918, + "mem allocated avg": 6786119440.384, + "mem reserved avg": 13227744296.96, + "elapsed time": 111.06800683099573 + }, + { + "step": 500, + "valid accuracy": 0.32, + "train loss": 0.9418708410263061, + "train samples": 2000, + "train time": 42.27786245904281, + "eval time": 12.404362346002017, + "tokens / sec": 4919.714193249426, + "mem allocated avg": 6777965645.824, + "mem reserved avg": 13119581585.408, + "elapsed time": 215.0703402069994 + }, + { + "step": 750, + "valid accuracy": 0.38, + "train loss": 0.7932645809650422, + "train samples": 3000, + "train time": 42.92248660406767, + "eval time": 12.39727676199982, + "tokens / sec": 4995.0740733571965, + "mem allocated avg": 6788190218.24, + "mem reserved avg": 13280533807.104, + "elapsed time": 319.9520932419982 + }, + { + "step": 1000, + "valid accuracy": 0.38, + "train loss": 0.7486661098003388, + "train samples": 4000, + "train time": 42.81222543502372, + "eval time": 12.462298920996545, + "tokens / sec": 4866.273544135012, + "mem allocated avg": 6778714585.088, + "mem reserved avg": 13136702734.336, + "elapsed time": 424.5020112219936 + }, + { + "step": 1250, + "valid accuracy": 0.3, + "train loss": 0.7329869548082352, + "train samples": 5000, + "train time": 42.917129570938414, + "eval time": 12.430814264000219, + "tokens / sec": 4859.085453403965, + "mem allocated avg": 6779904688.128, + "mem reserved avg": 13134379089.92, + "elapsed time": 529.3957975729936 + }, + { + "step": 1500, + "valid accuracy": 0.38, + "train loss": 0.7207228287458419, + "train samples": 6000, + "train time": 43.03383123301319, + "eval time": 12.441326129999652, + "tokens / sec": 4864.335663412017, + "mem allocated avg": 6779916724.224, + "mem reserved avg": 13160828370.944, + "elapsed time": 634.2579850239999 + }, + { + "step": 1750, + "valid accuracy": 0.34, + "train loss": 0.7103905143737793, + "train samples": 7000, + "train time": 42.76188673896104, + "eval time": 
12.393813144997694, + "tokens / sec": 4895.831684836612, + "mem allocated avg": 6782196824.064, + "mem reserved avg": 13176313741.312, + "elapsed time": 738.7873818459993 + }, + { + "step": 2000, + "valid accuracy": 0.34, + "train loss": 0.709170572757721, + "train samples": 8000, + "train time": 42.39193291300762, + "eval time": 12.418300639998051, + "tokens / sec": 4899.422737486692, + "mem allocated avg": 6778828410.88, + "mem reserved avg": 13120303005.696, + "elapsed time": 842.8626621349977 + }, + { + "step": 2250, + "valid accuracy": 0.38, + "train loss": 0.7017016235589981, + "train samples": 9000, + "train time": 43.90131158899749, + "eval time": 12.417865242998232, + "tokens / sec": 4896.163513571884, + "mem allocated avg": 6790482182.144, + "mem reserved avg": 13307486404.608, + "elapsed time": 949.0399838449957 + }, + { + "step": 2500, + "valid accuracy": 0.36, + "train loss": 0.6999357705116271, + "train samples": 10000, + "train time": 41.90174934701645, + "eval time": 7.302261034004914, + "tokens / sec": 4915.474967268057, + "mem allocated avg": 6775840593.92, + "mem reserved avg": 13059930193.92, + "elapsed time": 1047.236226049994 + }, + { + "step": 2750, + "valid accuracy": 0.36, + "train loss": 0.694103113770485, + "train samples": 11000, + "train time": 43.541668150042824, + "eval time": 12.415209386999777, + "tokens / sec": 4866.166341396629, + "mem allocated avg": 6786276190.208, + "mem reserved avg": 13245360373.76, + "elapsed time": 1152.8413292650002 + }, + { + "step": 3000, + "valid accuracy": 0.4, + "train loss": 0.686756227850914, + "train samples": 12000, + "train time": 43.03442109594471, + "eval time": 7.144659414996568, + "tokens / sec": 4850.32666140987, + "mem allocated avg": 6781073500.16, + "mem reserved avg": 13155426107.392, + "elapsed time": 1252.4253450399992 + }, + { + "step": 3250, + "valid accuracy": 0.38, + "train loss": 0.6960614495277405, + "train samples": 13000, + "train time": 43.27108911598771, + "eval time": 
7.294012983998982, + "tokens / sec": 4873.947115929577, + "mem allocated avg": 6783027929.088, + "mem reserved avg": 13189282529.28, + "elapsed time": 1352.4968370129936 + }, + { + "step": 3500, + "valid accuracy": 0.4, + "train loss": 0.6833453825712203, + "train samples": 14000, + "train time": 43.27389094301907, + "eval time": 7.8778488079988165, + "tokens / sec": 4847.03352134867, + "mem allocated avg": 6781185138.688, + "mem reserved avg": 13163252678.656, + "elapsed time": 1453.3269581979985 + }, + { + "step": 3750, + "valid accuracy": 0.38, + "train loss": 0.6804633007049561, + "train samples": 15000, + "train time": 44.034181773953605, + "eval time": 7.058443625996006, + "tokens / sec": 4921.245070759568, + "mem allocated avg": 6792035817.472, + "mem reserved avg": 13346417934.336, + "elapsed time": 1554.5450010249988 + }, + { + "step": 4000, + "valid accuracy": 0.36, + "train loss": 0.6990108703374862, + "train samples": 16000, + "train time": 42.4217994650171, + "eval time": 7.152937473998463, + "tokens / sec": 4817.6409906547, + "mem allocated avg": 6773882982.4, + "mem reserved avg": 13037876543.488, + "elapsed time": 1653.5999729619944 + }, + { + "step": 4250, + "valid accuracy": 0.4, + "train loss": 0.6789947774410248, + "train samples": 17000, + "train time": 43.347477565999725, + "eval time": 7.062385851000727, + "tokens / sec": 4876.615938681662, + "mem allocated avg": 6784096524.288, + "mem reserved avg": 13200808476.672, + "elapsed time": 1754.0244643159967 + }, + { + "step": 4500, + "valid accuracy": 0.36, + "train loss": 0.6891120710372924, + "train samples": 18000, + "train time": 42.82309688109672, + "eval time": 7.127946433000034, + "tokens / sec": 4852.941873331364, + "mem allocated avg": 6779667169.28, + "mem reserved avg": 13117727703.04, + "elapsed time": 1853.5914762979955 + }, + { + "step": 4750, + "valid accuracy": 0.38, + "train loss": 0.6815101335048676, + "train samples": 19000, + "train time": 43.26974187397718, + "eval time": 
7.069867040001554, + "tokens / sec": 4851.866244347975, + "mem allocated avg": 6780809029.632, + "mem reserved avg": 13175080615.936, + "elapsed time": 1954.106976348994 + }, + { + "step": 5000, + "valid accuracy": 0.36, + "train loss": 0.6876721383333206, + "train samples": 20000, + "train time": 43.18727576800302, + "eval time": 7.149106363998726, + "tokens / sec": 4822.716790909798, + "mem allocated avg": 6778227302.4, + "mem reserved avg": 13118625284.096, + "elapsed time": 2054.7161079569996 + }, + { + "step": 5000, + "test accuracy": 0.3752843062926459, + "train loss": 0.6876721383333206, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank32-dora.json b/peft/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank32-dora.json new file mode 100644 index 0000000000000000000000000000000000000000..a2b645db9b84e0c51f55b54d3d912b9af5ce63d4 --- 
/dev/null +++ b/peft/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank32-dora.json @@ -0,0 +1,365 @@ +{ + "run_info": { + "created_at": "2025-06-19T18:37:24+00:00", + "total_time": 2286.5437473089987, + "experiment_name": "lora/llama-3.2-3B-rank32-dora", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "LORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 32, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "lora_alpha": 64, + "lora_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "use_rslora": false, + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": {}, + "alpha_pattern": {}, + "megatron_config": null, + "megatron_core": "megatron.core", + "trainable_token_indices": null, + "loftq_config": {}, + "eva_config": null, + "corda_config": null, + "use_dora": true, + "layer_replication": null, + "lora_bias": false + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 12490471636, + "accelerator_memory_max": 24553455616, + "accelerator_memory_reserved_99th": 19189150515, + "train_time": 2022.7454924520134, + "file_size": 37181760, + "num_trainable_params": 9289728, + "num_total_params": 
3222039552, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.36, + "train loss": 0.9800839998722076, + "train samples": 1000, + "train time": 35.42731901501611, + "eval time": 16.70931195599769, + "tokens / sec": 5976.150775345474, + "mem allocated avg": 6924859500.544, + "mem reserved avg": 12552201306.112, + "elapsed time": 105.33911871900273 + }, + { + "step": 500, + "valid accuracy": 0.44, + "train loss": 0.7162023800611496, + "train samples": 2000, + "train time": 35.53461015297944, + "eval time": 16.7331051809997, + "tokens / sec": 5853.307496678993, + "mem allocated avg": 6917484427.264, + "mem reserved avg": 12427118772.224, + "elapsed time": 204.02196035100133 + }, + { + "step": 750, + "valid accuracy": 0.42, + "train loss": 0.6790966511964798, + "train samples": 3000, + "train time": 35.395415813978616, + "eval time": 10.35499690800134, + "tokens / sec": 6057.309825848329, + "mem allocated avg": 6927996166.144, + "mem reserved avg": 12609050902.528, + "elapsed time": 296.3724143870022 + }, + { + "step": 1000, + "valid accuracy": 0.42, + "train loss": 0.6590274780988693, + "train samples": 4000, + "train time": 35.01134122798976, + "eval time": 16.638093278997985, + "tokens / sec": 5950.528962696411, + "mem allocated avg": 6919690883.072, + "mem reserved avg": 12464313860.096, + "elapsed time": 394.33112582000103 + }, + { + "step": 1250, + "valid accuracy": 0.42, + "train loss": 0.6542477097511291, + "train samples": 5000, + "train time": 34.85555366096378, + "eval time": 16.627405782997812, + "tokens / sec": 5982.920312453697, + "mem allocated avg": 6919055253.504, + "mem reserved avg": 12449952563.2, + "elapsed time": 492.1167898590029 + }, + { + "step": 1500, + "valid accuracy": 0.4, + "train loss": 0.6471435966491699, + "train samples": 6000, + "train time": 35.407848127983016, + "eval time": 10.318167828998412, + "tokens / sec": 5911.994404273457, + "mem allocated avg": 6921185224.704, + "mem reserved avg": 
12477500751.872, + "elapsed time": 584.325913470002 + }, + { + "step": 1750, + "valid accuracy": 0.42, + "train loss": 0.6376023133993148, + "train samples": 7000, + "train time": 35.61810469696138, + "eval time": 10.057756549002079, + "tokens / sec": 5877.7692350896, + "mem allocated avg": 6922196224.0, + "mem reserved avg": 12495888580.608, + "elapsed time": 676.5556904380028 + }, + { + "step": 2000, + "valid accuracy": 0.36, + "train loss": 0.6404745506048203, + "train samples": 8000, + "train time": 35.01814225999988, + "eval time": 10.846777078000741, + "tokens / sec": 5931.097042724754, + "mem allocated avg": 6919877345.28, + "mem reserved avg": 12428771328.0, + "elapsed time": 768.7593245980024 + }, + { + "step": 2250, + "valid accuracy": 0.48, + "train loss": 0.6327905882596969, + "train samples": 9000, + "train time": 35.941867801058834, + "eval time": 16.654083295998134, + "tokens / sec": 5980.434884178939, + "mem allocated avg": 6930785019.904, + "mem reserved avg": 12637135962.112, + "elapsed time": 868.0876048490027 + }, + { + "step": 2500, + "valid accuracy": 0.44, + "train loss": 0.6293514591455459, + "train samples": 10000, + "train time": 35.19044898093853, + "eval time": 16.654415837998386, + "tokens / sec": 5852.923334725435, + "mem allocated avg": 6914962546.688, + "mem reserved avg": 12361175924.736, + "elapsed time": 965.9673830700012 + }, + { + "step": 2750, + "valid accuracy": 0.34, + "train loss": 0.6212090995311738, + "train samples": 11000, + "train time": 35.78923041201051, + "eval time": 12.364532577001228, + "tokens / sec": 5920.2446535116005, + "mem allocated avg": 6926067247.104, + "mem reserved avg": 12561110007.808, + "elapsed time": 1060.7434992320013 + }, + { + "step": 3000, + "valid accuracy": 0.48, + "train loss": 0.6132309092283249, + "train samples": 12000, + "train time": 35.434680095979274, + "eval time": 10.902270734000922, + "tokens / sec": 5890.585139604081, + "mem allocated avg": 6921261266.944, + "mem reserved avg": 
12472811520.0, + "elapsed time": 1153.3681941970026 + }, + { + "step": 3250, + "valid accuracy": 0.5, + "train loss": 0.6223928620815277, + "train samples": 13000, + "train time": 35.475069620017166, + "eval time": 9.885322058999009, + "tokens / sec": 5945.048234126565, + "mem allocated avg": 6922737405.952, + "mem reserved avg": 12498002509.824, + "elapsed time": 1245.241280964001 + }, + { + "step": 3500, + "valid accuracy": 0.5, + "train loss": 0.605602259516716, + "train samples": 14000, + "train time": 35.607162244014035, + "eval time": 10.090815307001321, + "tokens / sec": 5890.668808780496, + "mem allocated avg": 6920974434.304, + "mem reserved avg": 12474329858.048, + "elapsed time": 1337.4724736530006 + }, + { + "step": 3750, + "valid accuracy": 0.48, + "train loss": 0.6031041693687439, + "train samples": 15000, + "train time": 36.209776319014054, + "eval time": 10.371932055000798, + "tokens / sec": 5984.6544781390285, + "mem allocated avg": 6933558140.928, + "mem reserved avg": 12681738190.848, + "elapsed time": 1431.2058649130013 + }, + { + "step": 4000, + "valid accuracy": 0.46, + "train loss": 0.6162525477409363, + "train samples": 16000, + "train time": 35.48366187599095, + "eval time": 12.394127589999698, + "tokens / sec": 5759.636666425441, + "mem allocated avg": 6914222096.384, + "mem reserved avg": 12349406707.712, + "elapsed time": 1525.3134414390006 + }, + { + "step": 4250, + "valid accuracy": 0.5, + "train loss": 0.6013483003377914, + "train samples": 17000, + "train time": 35.15769277801883, + "eval time": 16.63699178299794, + "tokens / sec": 6012.59591562743, + "mem allocated avg": 6924507731.968, + "mem reserved avg": 12521616441.344, + "elapsed time": 1623.6120678540028 + }, + { + "step": 4500, + "valid accuracy": 0.48, + "train loss": 0.6073888168334961, + "train samples": 18000, + "train time": 34.98748015804085, + "eval time": 12.561758541996824, + "tokens / sec": 5939.781860861995, + "mem allocated avg": 6918951696.384, + "mem reserved 
avg": 12432495869.952, + "elapsed time": 1717.352138276001 + }, + { + "step": 4750, + "valid accuracy": 0.5, + "train loss": 0.5993685643672944, + "train samples": 19000, + "train time": 35.57701125005042, + "eval time": 13.379837485997996, + "tokens / sec": 5900.973483254653, + "mem allocated avg": 6921678901.248, + "mem reserved avg": 12490880581.632, + "elapsed time": 1812.886111721 + }, + { + "step": 5000, + "valid accuracy": 0.48, + "train loss": 0.6068210340738297, + "train samples": 20000, + "train time": 35.678432397002325, + "eval time": 10.087769599998865, + "tokens / sec": 5837.700425916121, + "mem allocated avg": 6918288025.6, + "mem reserved avg": 12423931101.184, + "elapsed time": 1905.221841073002 + }, + { + "step": 5000, + "test accuracy": 0.4806671721000758, + "train loss": 0.6068210340738297, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math 
Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git 
a/peft/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank32-lorafa.json b/peft/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank32-lorafa.json new file mode 100644 index 0000000000000000000000000000000000000000..35c7a9981da710c1f025c69644ff65550b1e0edc --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank32-lorafa.json @@ -0,0 +1,367 @@ +{ + "run_info": { + "created_at": "2025-06-19T17:29:01+00:00", + "total_time": 2025.9028512089972, + "experiment_name": "lora/llama-3.2-3B-rank32-lorafa", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "lora-fa", + "optimizer_kwargs": { + "r": 32, + "lora_alpha": 64, + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "LORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 32, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "lora_alpha": 64, + "lora_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "use_rslora": false, + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": {}, + "alpha_pattern": {}, + "megatron_config": null, + "megatron_core": "megatron.core", + "trainable_token_indices": null, + "loftq_config": {}, + "eva_config": null, + "corda_config": null, + "use_dora": false, + "layer_replication": null, + 
"lora_bias": false + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 11106307276, + "accelerator_memory_max": 20187185152, + "accelerator_memory_reserved_99th": 16257394933, + "train_time": 1821.1390361119993, + "file_size": 36715216, + "num_trainable_params": 3670016, + "num_total_params": 3221924864, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.26, + "train loss": 1.13827001953125, + "train samples": 1000, + "train time": 39.487167649953335, + "eval time": 11.352047874999698, + "tokens / sec": 5361.716542367662, + "mem allocated avg": 6857574733.824, + "mem reserved avg": 11147042357.248, + "elapsed time": 95.33382818899918 + }, + { + "step": 500, + "valid accuracy": 0.34, + "train loss": 0.8058450784683228, + "train samples": 2000, + "train time": 38.91575912100234, + "eval time": 11.306344865999563, + "tokens / sec": 5344.749908469542, + "mem allocated avg": 6850229934.08, + "mem reserved avg": 11051613552.64, + "elapsed time": 184.45338391399855 + }, + { + "step": 750, + "valid accuracy": 0.4, + "train loss": 0.725865609407425, + "train samples": 3000, + "train time": 39.53630301699741, + "eval time": 9.965407437997783, + "tokens / sec": 5422.889436774727, + "mem allocated avg": 6861271248.896, + "mem reserved avg": 11192013684.736, + "elapsed time": 273.0429774479999 + }, + { + "step": 1000, + "valid accuracy": 0.36, + "train loss": 0.69585602581501, + "train samples": 4000, + "train time": 38.42195282199464, + "eval time": 11.263002069001232, + "tokens / sec": 5422.316792829388, + "mem allocated avg": 6851626665.984, + "mem reserved avg": 11074279571.456, + "elapsed time": 361.40852819099746 + }, + { + "step": 1250, + "valid accuracy": 0.4, + "train loss": 0.6884716705083848, + "train samples": 5000, + "train time": 38.42177955799343, + "eval time": 11.356440440998995, + "tokens / sec": 5427.598679682052, + "mem allocated avg": 6851712622.592, + "mem reserved avg": 11075865018.368, + "elapsed 
time": 449.83568274799836 + }, + { + "step": 1500, + "valid accuracy": 0.38, + "train loss": 0.6801862429380416, + "train samples": 6000, + "train time": 38.768619330003276, + "eval time": 11.348457601998234, + "tokens / sec": 5399.495871084515, + "mem allocated avg": 6853521098.752, + "mem reserved avg": 11096006066.176, + "elapsed time": 538.7312806489972 + }, + { + "step": 1750, + "valid accuracy": 0.38, + "train loss": 0.6713097202777862, + "train samples": 7000, + "train time": 38.99274470796445, + "eval time": 8.222045223003079, + "tokens / sec": 5369.0757490389815, + "mem allocated avg": 6854799144.96, + "mem reserved avg": 11113831858.176, + "elapsed time": 624.957832287997 + }, + { + "step": 2000, + "valid accuracy": 0.36, + "train loss": 0.6733613710403442, + "train samples": 8000, + "train time": 38.96502619797684, + "eval time": 9.028824541001086, + "tokens / sec": 5330.318500101101, + "mem allocated avg": 6852199981.056, + "mem reserved avg": 11058584485.888, + "elapsed time": 711.7122244169987 + }, + { + "step": 2250, + "valid accuracy": 0.38, + "train loss": 0.6658626307249069, + "train samples": 9000, + "train time": 39.83998639498168, + "eval time": 11.38518134900005, + "tokens / sec": 5395.282966940854, + "mem allocated avg": 6862685554.688, + "mem reserved avg": 11223865229.312, + "elapsed time": 802.4390404449987 + }, + { + "step": 2500, + "valid accuracy": 0.38, + "train loss": 0.6645791643857956, + "train samples": 10000, + "train time": 38.493957691986, + "eval time": 11.311897349998617, + "tokens / sec": 5350.631952372099, + "mem allocated avg": 6848127772.672, + "mem reserved avg": 11012925292.544, + "elapsed time": 890.7464078919984 + }, + { + "step": 2750, + "valid accuracy": 0.44, + "train loss": 0.658472005367279, + "train samples": 11000, + "train time": 38.51331885699619, + "eval time": 7.521690310000849, + "tokens / sec": 5501.499384842303, + "mem allocated avg": 6858912532.48, + "mem reserved avg": 11161915359.232, + "elapsed time": 
975.6010923279973 + }, + { + "step": 3000, + "valid accuracy": 0.4, + "train loss": 0.6503657740354538, + "train samples": 12000, + "train time": 38.378428091957176, + "eval time": 9.959380172000238, + "tokens / sec": 5438.758447841249, + "mem allocated avg": 6853735892.992, + "mem reserved avg": 11091962757.12, + "elapsed time": 1062.3718837759989 + }, + { + "step": 3250, + "valid accuracy": 0.48, + "train loss": 0.6599743469953537, + "train samples": 13000, + "train time": 38.74303203701493, + "eval time": 9.720565422001528, + "tokens / sec": 5443.585308411229, + "mem allocated avg": 6855708461.056, + "mem reserved avg": 11117246021.632, + "elapsed time": 1149.5592005079998 + }, + { + "step": 3500, + "valid accuracy": 0.4, + "train loss": 0.6468936309814454, + "train samples": 14000, + "train time": 38.947772975978296, + "eval time": 10.49309463499958, + "tokens / sec": 5385.417033455723, + "mem allocated avg": 6854553325.568, + "mem reserved avg": 11102364631.04, + "elapsed time": 1237.83959684 + }, + { + "step": 3750, + "valid accuracy": 0.44, + "train loss": 0.6447412570714951, + "train samples": 15000, + "train time": 39.208677324022574, + "eval time": 11.265130790001422, + "tokens / sec": 5526.914315654032, + "mem allocated avg": 6864447199.232, + "mem reserved avg": 11258292076.544, + "elapsed time": 1327.6204509749987 + }, + { + "step": 4000, + "valid accuracy": 0.48, + "train loss": 0.6609537017345428, + "train samples": 16000, + "train time": 38.373366451996844, + "eval time": 8.435534727999766, + "tokens / sec": 5325.907495128434, + "mem allocated avg": 6846769313.792, + "mem reserved avg": 10994319360.0, + "elapsed time": 1412.8209538019983 + }, + { + "step": 4250, + "valid accuracy": 0.46, + "train loss": 0.6430994077920914, + "train samples": 17000, + "train time": 38.840016363014, + "eval time": 8.356262703997345, + "tokens / sec": 5442.556924391474, + "mem allocated avg": 6857134465.024, + "mem reserved avg": 11130768457.728, + "elapsed time": 
1498.7970963499974 + }, + { + "step": 4500, + "valid accuracy": 0.5, + "train loss": 0.6519971441030502, + "train samples": 18000, + "train time": 38.99225058195225, + "eval time": 9.193580140999984, + "tokens / sec": 5329.725699295479, + "mem allocated avg": 6851737821.184, + "mem reserved avg": 11062996893.696, + "elapsed time": 1585.3292836179971 + }, + { + "step": 4750, + "valid accuracy": 0.42, + "train loss": 0.6448501836061478, + "train samples": 19000, + "train time": 39.31173135296194, + "eval time": 8.482506923999608, + "tokens / sec": 5340.365147366681, + "mem allocated avg": 6853984409.6, + "mem reserved avg": 11104352731.136, + "elapsed time": 1672.2648903240006 + }, + { + "step": 5000, + "valid accuracy": 0.46, + "train loss": 0.6509636770486832, + "train samples": 20000, + "train time": 38.96172000500519, + "eval time": 11.401191647000815, + "tokens / sec": 5345.759888763726, + "mem allocated avg": 6850959237.12, + "mem reserved avg": 11055900131.328, + "elapsed time": 1761.553419697997 + }, + { + "step": 5000, + "test accuracy": 0.42987111448066717, + "train loss": 0.6509636770486832, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + 
"system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, 
USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank32.json b/peft/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank32.json new file mode 100644 index 0000000000000000000000000000000000000000..f3d348c07ec5957e5cdaad74b5c6d9071763afb1 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank32.json @@ -0,0 +1,365 @@ +{ + "run_info": { + "created_at": "2025-06-19T19:15:35+00:00", + "total_time": 1993.494420946001, + "experiment_name": "lora/llama-3.2-3B-rank32", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "LORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 32, + "target_modules": [ + "v_proj", + "q_proj" + ], + "exclude_modules": null, + "lora_alpha": 64, + "lora_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "use_rslora": false, + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": {}, + "alpha_pattern": {}, + "megatron_config": null, + "megatron_core": "megatron.core", + 
"trainable_token_indices": null, + "loftq_config": {}, + "eva_config": null, + "corda_config": null, + "use_dora": false, + "layer_replication": null, + "lora_bias": false + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 11868689976, + "accelerator_memory_max": 22273851392, + "accelerator_memory_reserved_99th": 17710763212, + "train_time": 1796.1857790169925, + "file_size": 36715216, + "num_trainable_params": 9175040, + "num_total_params": 3221924864, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.34, + "train loss": 0.9827028260231018, + "train samples": 1000, + "train time": 31.395267726013117, + "eval time": 11.27943390099972, + "tokens / sec": 6743.659644748829, + "mem allocated avg": 6925580957.696, + "mem reserved avg": 11920245522.432, + "elapsed time": 94.68654379600048 + }, + { + "step": 500, + "valid accuracy": 0.44, + "train loss": 0.7164744178056717, + "train samples": 2000, + "train time": 30.728173206967767, + "eval time": 11.244831023999723, + "tokens / sec": 6768.869681873444, + "mem allocated avg": 6918363699.2, + "mem reserved avg": 11811654991.872, + "elapsed time": 182.6767855429971 + }, + { + "step": 750, + "valid accuracy": 0.38, + "train loss": 0.6791989279985428, + "train samples": 3000, + "train time": 31.248708018982143, + "eval time": 6.873092081001232, + "tokens / sec": 6861.115661798283, + "mem allocated avg": 6929003134.976, + "mem reserved avg": 11970174517.248, + "elapsed time": 267.2763524209986 + }, + { + "step": 1000, + "valid accuracy": 0.42, + "train loss": 0.6590347054004669, + "train samples": 4000, + "train time": 31.016855426081747, + "eval time": 7.663122134003061, + "tokens / sec": 6716.864012746194, + "mem allocated avg": 6919503566.848, + "mem reserved avg": 11835008876.544, + "elapsed time": 351.92747904299904 + }, + { + "step": 1250, + "valid accuracy": 0.4, + "train loss": 0.6547032891511917, + "train samples": 5000, + "train time": 30.914218463025463, 
+ "eval time": 11.249955232000502, + "tokens / sec": 6745.698593332356, + "mem allocated avg": 6919763681.28, + "mem reserved avg": 11832551014.4, + "elapsed time": 440.29597954699784 + }, + { + "step": 1500, + "valid accuracy": 0.42, + "train loss": 0.647298491358757, + "train samples": 6000, + "train time": 31.093457819981268, + "eval time": 11.25276822899832, + "tokens / sec": 6732.316528188762, + "mem allocated avg": 6920362313.728, + "mem reserved avg": 11859000295.424, + "elapsed time": 529.2981231249978 + }, + { + "step": 1750, + "valid accuracy": 0.46, + "train loss": 0.6378061240911483, + "train samples": 7000, + "train time": 31.079548971014447, + "eval time": 11.2527706639994, + "tokens / sec": 6736.101614449091, + "mem allocated avg": 6922653980.672, + "mem reserved avg": 11870048092.16, + "elapsed time": 617.7172930779998 + }, + { + "step": 2000, + "valid accuracy": 0.4, + "train loss": 0.641120473742485, + "train samples": 8000, + "train time": 30.851661891996628, + "eval time": 7.384566520999215, + "tokens / sec": 6732.084667823985, + "mem allocated avg": 6919747647.488, + "mem reserved avg": 11816562327.552, + "elapsed time": 702.0775224069985 + }, + { + "step": 2250, + "valid accuracy": 0.46, + "train loss": 0.6332860335111618, + "train samples": 9000, + "train time": 31.288193090975255, + "eval time": 11.258606130999397, + "tokens / sec": 6869.939704571801, + "mem allocated avg": 6930711803.904, + "mem reserved avg": 12003997384.704, + "elapsed time": 791.1291831710005 + }, + { + "step": 2500, + "valid accuracy": 0.44, + "train loss": 0.6298432033061981, + "train samples": 10000, + "train time": 30.668521790059458, + "eval time": 11.22552015600013, + "tokens / sec": 6715.908950876132, + "mem allocated avg": 6916224055.296, + "mem reserved avg": 11759050031.104, + "elapsed time": 878.9048607999976 + }, + { + "step": 2750, + "valid accuracy": 0.4, + "train loss": 0.6213459351062774, + "train samples": 11000, + "train time": 31.198134894020768, + 
"eval time": 7.820672179997928, + "tokens / sec": 6791.463679471677, + "mem allocated avg": 6926273599.488, + "mem reserved avg": 11930135691.264, + "elapsed time": 964.4001106439973 + }, + { + "step": 3000, + "valid accuracy": 0.46, + "train loss": 0.6136174714565277, + "train samples": 12000, + "train time": 30.652901480014407, + "eval time": 8.59450396900138, + "tokens / sec": 6809.502197894445, + "mem allocated avg": 6921910312.96, + "mem reserved avg": 11851509268.48, + "elapsed time": 1049.6233134680006 + }, + { + "step": 3250, + "valid accuracy": 0.46, + "train loss": 0.6227310271263122, + "train samples": 13000, + "train time": 30.898520497004938, + "eval time": 11.247846516002028, + "tokens / sec": 6825.601893153528, + "mem allocated avg": 6923552774.144, + "mem reserved avg": 11884266782.72, + "elapsed time": 1137.9473550990006 + }, + { + "step": 3500, + "valid accuracy": 0.52, + "train loss": 0.6058980323076248, + "train samples": 14000, + "train time": 31.043968706952, + "eval time": 7.071496761000162, + "tokens / sec": 6756.545916535101, + "mem allocated avg": 6922457063.424, + "mem reserved avg": 11865602129.92, + "elapsed time": 1222.3722963839973 + }, + { + "step": 3750, + "valid accuracy": 0.5, + "train loss": 0.6032638043165207, + "train samples": 15000, + "train time": 31.41906641800597, + "eval time": 6.834270917999675, + "tokens / sec": 6897.18138397039, + "mem allocated avg": 6932064409.6, + "mem reserved avg": 12041553182.72, + "elapsed time": 1307.517348808 + }, + { + "step": 4000, + "valid accuracy": 0.48, + "train loss": 0.6166473155021668, + "train samples": 16000, + "train time": 30.82234557695483, + "eval time": 6.627715251001064, + "tokens / sec": 6630.676419149782, + "mem allocated avg": 6914480900.096, + "mem reserved avg": 11738338557.952, + "elapsed time": 1390.9289551410002 + }, + { + "step": 4250, + "valid accuracy": 0.44, + "train loss": 0.601645546555519, + "train samples": 17000, + "train time": 30.811621871023817, + "eval 
time": 11.241402788000414, + "tokens / sec": 6860.690452611215, + "mem allocated avg": 6925075550.208, + "mem reserved avg": 11899366277.12, + "elapsed time": 1479.325017957999 + }, + { + "step": 4500, + "valid accuracy": 0.46, + "train loss": 0.6076700875759125, + "train samples": 18000, + "train time": 30.499847401017178, + "eval time": 11.232504903000518, + "tokens / sec": 6813.73900884072, + "mem allocated avg": 6919328847.872, + "mem reserved avg": 11814020579.328, + "elapsed time": 1567.0791362639975 + }, + { + "step": 4750, + "valid accuracy": 0.46, + "train loss": 0.5997640329599381, + "train samples": 19000, + "train time": 30.974938084971654, + "eval time": 11.246996836001927, + "tokens / sec": 6777.705234602477, + "mem allocated avg": 6921498724.352, + "mem reserved avg": 11864662605.824, + "elapsed time": 1655.6881185989987 + }, + { + "step": 5000, + "valid accuracy": 0.5, + "train loss": 0.6069052599668503, + "train samples": 20000, + "train time": 30.736502733019734, + "eval time": 11.28520023999954, + "tokens / sec": 6776.307695418065, + "mem allocated avg": 6918408683.52, + "mem reserved avg": 11806051401.728, + "elapsed time": 1743.785376189 + }, + { + "step": 5000, + "test accuracy": 0.48218347232752085, + "train loss": 0.6069052599668503, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + 
"bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized 
-fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank64-rslora.json b/peft/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank64-rslora.json new file mode 100644 index 0000000000000000000000000000000000000000..dbeb788d4e2455d909eca39139320edbbf9fd173 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank64-rslora.json @@ -0,0 +1,365 @@ +{ + "run_info": { + "created_at": "2025-06-19T18:02:52+00:00", + "total_time": 2068.5078051540004, + "experiment_name": "lora/llama-3.2-3B-rank64-rslora", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "LORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 64, + "target_modules": [ + "v_proj", + "q_proj" + ], + "exclude_modules": null, + "lora_alpha": 64, + "lora_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "use_rslora": true, + "modules_to_save": null, + 
"init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": {}, + "alpha_pattern": {}, + "megatron_config": null, + "megatron_core": "megatron.core", + "trainable_token_indices": null, + "loftq_config": {}, + "eva_config": null, + "corda_config": null, + "use_dora": false, + "layer_replication": null, + "lora_bias": false + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 12128059444, + "accelerator_memory_max": 22538092544, + "accelerator_memory_reserved_99th": 17953927987, + "train_time": 1871.457509397991, + "file_size": 73415408, + "num_trainable_params": 18350080, + "num_total_params": 3231099904, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.36, + "train loss": 0.8666043817996979, + "train samples": 1000, + "train time": 31.633613975991466, + "eval time": 11.220254406001914, + "tokens / sec": 6692.848947347132, + "mem allocated avg": 7072427177.984, + "mem reserved avg": 12177985503.232, + "elapsed time": 97.06891104899842 + }, + { + "step": 500, + "valid accuracy": 0.32, + "train loss": 0.697043846487999, + "train samples": 2000, + "train time": 31.400947067988454, + "eval time": 11.24747418500192, + "tokens / sec": 6623.844801548661, + "mem allocated avg": 7064966957.056, + "mem reserved avg": 12070787481.6, + "elapsed time": 188.02626212299947 + }, + { + "step": 750, + "valid accuracy": 0.38, + "train loss": 0.6723507121801376, + "train samples": 3000, + "train time": 31.849995732016396, + "eval time": 11.249978227999236, + "tokens / sec": 6731.586459350098, + "mem allocated avg": 7075822055.424, + "mem reserved avg": 12225037205.504, + "elapsed time": 280.16055655299715 + }, + { + "step": 1000, + "valid accuracy": 0.38, + "train loss": 0.6529685587882995, + "train samples": 4000, + "train time": 31.612207354013663, + "eval time": 11.24677863100078, + "tokens / sec": 6590.365477074112, + "mem allocated avg": 7066256992.256, + "mem reserved avg": 
12092287483.904, + "elapsed time": 371.5183315869981 + }, + { + "step": 1250, + "valid accuracy": 0.38, + "train loss": 0.6472815409898758, + "train samples": 5000, + "train time": 31.26670297003875, + "eval time": 8.06907803500144, + "tokens / sec": 6669.651104557813, + "mem allocated avg": 7066435080.192, + "mem reserved avg": 12087824744.448, + "elapsed time": 459.33407214199906 + }, + { + "step": 1500, + "valid accuracy": 0.44, + "train loss": 0.6395461517572403, + "train samples": 6000, + "train time": 31.471468601008382, + "eval time": 6.4898526670003776, + "tokens / sec": 6651.4531830043925, + "mem allocated avg": 7067292080.128, + "mem reserved avg": 12121664389.12, + "elapsed time": 545.9371380269986 + }, + { + "step": 1750, + "valid accuracy": 0.5, + "train loss": 0.629749027967453, + "train samples": 7000, + "train time": 31.650018079009897, + "eval time": 11.247470542999508, + "tokens / sec": 6614.688164707337, + "mem allocated avg": 7069213276.16, + "mem reserved avg": 12130329821.184, + "elapsed time": 637.2524904149977 + }, + { + "step": 2000, + "valid accuracy": 0.4, + "train loss": 0.6293291836977005, + "train samples": 8000, + "train time": 31.45956211398152, + "eval time": 11.187045163998846, + "tokens / sec": 6601.999075749819, + "mem allocated avg": 7066587928.576, + "mem reserved avg": 12076634341.376, + "elapsed time": 728.2233991199973 + }, + { + "step": 2250, + "valid accuracy": 0.4, + "train loss": 0.6171289530992508, + "train samples": 9000, + "train time": 31.87981533700804, + "eval time": 6.866186073002609, + "tokens / sec": 6742.448089104055, + "mem allocated avg": 7077788481.536, + "mem reserved avg": 12265227026.432, + "elapsed time": 815.9717469499992 + }, + { + "step": 2500, + "valid accuracy": 0.44, + "train loss": 0.6119417071342468, + "train samples": 10000, + "train time": 31.067599171023176, + "eval time": 10.55572699700133, + "tokens / sec": 6629.640058962326, + "mem allocated avg": 7062992943.104, + "mem reserved avg": 
12015850487.808, + "elapsed time": 905.6140671029971 + }, + { + "step": 2750, + "valid accuracy": 0.48, + "train loss": 0.5985908216238022, + "train samples": 11000, + "train time": 31.864849751029396, + "eval time": 6.1964339680016565, + "tokens / sec": 6649.364477017663, + "mem allocated avg": 7072847513.6, + "mem reserved avg": 12192229359.616, + "elapsed time": 992.5055643979977 + }, + { + "step": 3000, + "valid accuracy": 0.48, + "train loss": 0.5865949945449829, + "train samples": 12000, + "train time": 31.337576934987737, + "eval time": 7.105518241998652, + "tokens / sec": 6660.725570232467, + "mem allocated avg": 7068560369.664, + "mem reserved avg": 12111589670.912, + "elapsed time": 1079.4884613180002 + }, + { + "step": 3250, + "valid accuracy": 0.56, + "train loss": 0.5926763614416123, + "train samples": 13000, + "train time": 31.477448584984813, + "eval time": 11.220603736997873, + "tokens / sec": 6700.066539084199, + "mem allocated avg": 7070318487.552, + "mem reserved avg": 12143046950.912, + "elapsed time": 1171.2174267509981 + }, + { + "step": 3500, + "valid accuracy": 0.54, + "train loss": 0.5736529529094696, + "train samples": 14000, + "train time": 31.59231336902303, + "eval time": 11.215632880001067, + "tokens / sec": 6639.273216556042, + "mem allocated avg": 7068978995.2, + "mem reserved avg": 12124969500.672, + "elapsed time": 1263.2709914519983 + }, + { + "step": 3750, + "valid accuracy": 0.54, + "train loss": 0.5691816571950913, + "train samples": 15000, + "train time": 31.92663248500321, + "eval time": 6.89942428699942, + "tokens / sec": 6787.530758271833, + "mem allocated avg": 7079011248.128, + "mem reserved avg": 12298890510.336, + "elapsed time": 1351.894684084 + }, + { + "step": 4000, + "valid accuracy": 0.56, + "train loss": 0.5762648656368256, + "train samples": 16000, + "train time": 31.08475098094641, + "eval time": 6.668889390999539, + "tokens / sec": 6574.7028221416895, + "mem allocated avg": 7061224237.056, + "mem reserved avg": 
12000969097.216, + "elapsed time": 1437.9300296759975 + }, + { + "step": 4250, + "valid accuracy": 0.52, + "train loss": 0.562865238904953, + "train samples": 17000, + "train time": 31.594970259979164, + "eval time": 11.218020562002494, + "tokens / sec": 6690.590250935068, + "mem allocated avg": 7071853715.456, + "mem reserved avg": 12157852844.032, + "elapsed time": 1529.7590418299988 + }, + { + "step": 4500, + "valid accuracy": 0.5, + "train loss": 0.568256908416748, + "train samples": 18000, + "train time": 31.334908029966755, + "eval time": 11.24515695700029, + "tokens / sec": 6632.156054240077, + "mem allocated avg": 7066128418.816, + "mem reserved avg": 12073245343.744, + "elapsed time": 1620.988001589998 + }, + { + "step": 4750, + "valid accuracy": 0.52, + "train loss": 0.5585172891616821, + "train samples": 19000, + "train time": 31.425996138961636, + "eval time": 11.202903266999783, + "tokens / sec": 6680.4246736261675, + "mem allocated avg": 7068498065.408, + "mem reserved avg": 12124491350.016, + "elapsed time": 1712.544705993998 + }, + { + "step": 5000, + "valid accuracy": 0.52, + "train loss": 0.5657225311994553, + "train samples": 20000, + "train time": 31.088545892969705, + "eval time": 11.224285021999094, + "tokens / sec": 6699.573557317776, + "mem allocated avg": 7064964919.296, + "mem reserved avg": 12070275776.512, + "elapsed time": 1803.3861051699969 + }, + { + "step": 5000, + "test accuracy": 0.5299469294920395, + "train loss": 0.5657225311994553, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + 
"transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers 
-Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank64.json b/peft/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank64.json new file mode 100644 index 0000000000000000000000000000000000000000..5dd3d50954761111762cc831de93cf4f76995044 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/lora--llama-3.2-3B-rank64.json @@ -0,0 +1,365 @@ +{ + "run_info": { + "created_at": "2025-06-19T16:55:20+00:00", + "total_time": 2017.2277705579982, + "experiment_name": "lora/llama-3.2-3B-rank64", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "LORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 64, + "target_modules": [ + 
"q_proj", + "v_proj" + ], + "exclude_modules": null, + "lora_alpha": 128, + "lora_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "use_rslora": false, + "modules_to_save": null, + "init_lora_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "rank_pattern": {}, + "alpha_pattern": {}, + "megatron_config": null, + "megatron_core": "megatron.core", + "trainable_token_indices": null, + "loftq_config": {}, + "eva_config": null, + "corda_config": null, + "use_dora": false, + "layer_replication": null, + "lora_bias": false + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 12128055669, + "accelerator_memory_max": 22540189696, + "accelerator_memory_reserved_99th": 17953927987, + "train_time": 1853.4967184819961, + "file_size": 73415408, + "num_trainable_params": 18350080, + "num_total_params": 3231099904, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.38, + "train loss": 0.9425119986534118, + "train samples": 1000, + "train time": 31.823601707994385, + "eval time": 11.233909951999522, + "tokens / sec": 6652.892464614218, + "mem allocated avg": 7072427177.984, + "mem reserved avg": 12177985503.232, + "elapsed time": 97.04379223199976 + }, + { + "step": 500, + "valid accuracy": 0.4, + "train loss": 0.7080548154115677, + "train samples": 2000, + "train time": 31.45184341498316, + "eval time": 8.232533225000225, + "tokens / sec": 6613.125890767804, + "mem allocated avg": 7065105152.0, + "mem reserved avg": 12072179990.528, + "elapsed time": 184.765658884 + }, + { + "step": 750, + "valid accuracy": 0.48, + "train loss": 0.6735224899053573, + "train samples": 3000, + "train time": 31.813968455000577, + "eval time": 7.057446101998721, + "tokens / sec": 6739.20954888921, + "mem allocated avg": 7075631579.136, + "mem reserved avg": 12224064126.976, + "elapsed time": 272.18349517599927 + }, + { + "step": 1000, + "valid accuracy": 0.38, + "train loss": 0.6520720717906952, + "train 
samples": 4000, + "train time": 31.539530114994704, + "eval time": 6.8677342959999805, + "tokens / sec": 6605.551802464924, + "mem allocated avg": 7066230261.76, + "mem reserved avg": 12094502076.416, + "elapsed time": 358.6604399049993 + }, + { + "step": 1250, + "valid accuracy": 0.32, + "train loss": 0.6483409875631332, + "train samples": 5000, + "train time": 31.15382274799049, + "eval time": 6.63156994000019, + "tokens / sec": 6693.817374737786, + "mem allocated avg": 7066402795.52, + "mem reserved avg": 12090886586.368, + "elapsed time": 444.47985113600043 + }, + { + "step": 1500, + "valid accuracy": 0.44, + "train loss": 0.6400664356946946, + "train samples": 6000, + "train time": 31.237405868998394, + "eval time": 6.19883855199987, + "tokens / sec": 6701.292702661678, + "mem allocated avg": 7067143219.2, + "mem reserved avg": 12125288267.776, + "elapsed time": 529.970450933999 + }, + { + "step": 1750, + "valid accuracy": 0.42, + "train loss": 0.6309183040857315, + "train samples": 7000, + "train time": 31.58418034899296, + "eval time": 11.217398733000664, + "tokens / sec": 6628.476588175104, + "mem allocated avg": 7069430339.584, + "mem reserved avg": 12128735985.664, + "elapsed time": 620.7944932609989 + }, + { + "step": 2000, + "valid accuracy": 0.38, + "train loss": 0.6333342634439468, + "train samples": 8000, + "train time": 31.370570010996744, + "eval time": 11.2056582969999, + "tokens / sec": 6620.727641454827, + "mem allocated avg": 7066754975.744, + "mem reserved avg": 12075980029.952, + "elapsed time": 711.143019907 + }, + { + "step": 2250, + "valid accuracy": 0.42, + "train loss": 0.6244297958612443, + "train samples": 9000, + "train time": 32.090800706988375, + "eval time": 6.320641570999214, + "tokens / sec": 6698.118939524966, + "mem allocated avg": 7077559773.184, + "mem reserved avg": 12266535649.28, + "elapsed time": 798.2718276069991 + }, + { + "step": 2500, + "valid accuracy": 0.48, + "train loss": 0.6205919095277787, + "train samples": 
10000, + "train time": 31.211024427002485, + "eval time": 7.8215817759992206, + "tokens / sec": 6599.1746115775, + "mem allocated avg": 7063100512.256, + "mem reserved avg": 12017771479.04, + "elapsed time": 885.0132823740005 + }, + { + "step": 2750, + "valid accuracy": 0.38, + "train loss": 0.6116842222213745, + "train samples": 11000, + "train time": 31.752687646014238, + "eval time": 11.215984603999459, + "tokens / sec": 6672.852464084136, + "mem allocated avg": 7072850802.688, + "mem reserved avg": 12190207705.088, + "elapsed time": 976.3748101059991 + }, + { + "step": 3000, + "valid accuracy": 0.48, + "train loss": 0.6028307398557663, + "train samples": 12000, + "train time": 31.220882691013685, + "eval time": 10.851913497001078, + "tokens / sec": 6685.621353687066, + "mem allocated avg": 7068516059.136, + "mem reserved avg": 12110624980.992, + "elapsed time": 1066.2028727340003 + }, + { + "step": 3250, + "valid accuracy": 0.54, + "train loss": 0.6109937611818314, + "train samples": 13000, + "train time": 31.23074521200033, + "eval time": 6.857214526000462, + "tokens / sec": 6752.992878279506, + "mem allocated avg": 7070265374.72, + "mem reserved avg": 12142795292.672, + "elapsed time": 1152.2392765660006 + }, + { + "step": 3500, + "valid accuracy": 0.54, + "train loss": 0.5937278937101365, + "train samples": 14000, + "train time": 31.52822203695905, + "eval time": 6.510061502001918, + "tokens / sec": 6652.7696916787745, + "mem allocated avg": 7069306679.296, + "mem reserved avg": 12124390686.72, + "elapsed time": 1238.6433643029995 + }, + { + "step": 3750, + "valid accuracy": 0.6, + "train loss": 0.5906780579090118, + "train samples": 15000, + "train time": 32.31397023300451, + "eval time": 8.545268227000633, + "tokens / sec": 6706.170688325575, + "mem allocated avg": 7078981097.472, + "mem reserved avg": 12299846811.648, + "elapsed time": 1328.641089326 + }, + { + "step": 4000, + "valid accuracy": 0.52, + "train loss": 0.6025177363157272, + "train samples": 
16000, + "train time": 31.170676869962335, + "eval time": 6.8420828330017684, + "tokens / sec": 6556.57882735759, + "mem allocated avg": 7061331572.736, + "mem reserved avg": 12001287864.32, + "elapsed time": 1414.365250592 + }, + { + "step": 4250, + "valid accuracy": 0.54, + "train loss": 0.5884622411727906, + "train samples": 17000, + "train time": 31.543792515007226, + "eval time": 6.748535185997753, + "tokens / sec": 6701.445297024126, + "mem allocated avg": 7071957172.224, + "mem reserved avg": 12155780857.856, + "elapsed time": 1500.9030026039982 + }, + { + "step": 4500, + "valid accuracy": 0.54, + "train loss": 0.5941844927072525, + "train samples": 18000, + "train time": 31.45958714898734, + "eval time": 6.4977734870008135, + "tokens / sec": 6605.871813123572, + "mem allocated avg": 7066011588.608, + "mem reserved avg": 12069847957.504, + "elapsed time": 1586.8870083309994 + }, + { + "step": 4750, + "valid accuracy": 0.56, + "train loss": 0.5860341912508011, + "train samples": 19000, + "train time": 31.656771414985997, + "eval time": 6.746858504000556, + "tokens / sec": 6631.724923806254, + "mem allocated avg": 7068472178.688, + "mem reserved avg": 12124852060.16, + "elapsed time": 1673.7380427649987 + }, + { + "step": 5000, + "valid accuracy": 0.58, + "train loss": 0.5928755496740341, + "train samples": 20000, + "train time": 31.260896800042246, + "eval time": 6.4877336810022825, + "tokens / sec": 6662.636754545011, + "mem allocated avg": 7065262428.16, + "mem reserved avg": 12067549478.912, + "elapsed time": 1759.6036715839982 + }, + { + "step": 5000, + "test accuracy": 0.4890068233510235, + "train loss": 0.5928755496740341, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": 
"2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/miss--llama-3.2-3B-bat.json b/peft/method_comparison/MetaMathQA/results/miss--llama-3.2-3B-bat.json new file mode 100644 index 0000000000000000000000000000000000000000..fc788b3e6208118bac0eec843b00fdec8b96288b --- /dev/null +++ 
b/peft/method_comparison/MetaMathQA/results/miss--llama-3.2-3B-bat.json @@ -0,0 +1,352 @@ +{ + "run_info": { + "created_at": "2025-08-14T11:55:49+00:00", + "total_time": 2808.721444314, + "experiment_name": "miss/llama-3.2-3B-bat", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "MISS", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 64, + "miss_dropout": 0.0, + "mini_r": 1, + "target_modules": [ + "v_proj", + "q_proj" + ], + "exclude_modules": null, + "init_weights": "bat", + "layers_to_transform": null, + "layers_pattern": null, + "bias": "none", + "modules_to_save": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 14713719934, + "accelerator_memory_max": 25251807232, + "accelerator_memory_reserved_99th": 20472733368, + "train_time": 2466.149786608999, + "file_size": 29367552, + "num_trainable_params": 7340032, + "num_total_params": 3220089856, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.32, + "train loss": 0.8741402707099915, + "train samples": 1000, + "train time": 44.507981576001725, + "eval time": 16.603345405999903, + "tokens / sec": 4756.8771376088835, + "mem allocated avg": 6898417197.056, + "mem reserved avg": 14772422574.08, + "elapsed time": 128.87205576299993 + }, 
+ { + "step": 500, + "valid accuracy": 0.42, + "train loss": 0.6949697629213333, + "train samples": 2000, + "train time": 43.6579733309992, + "eval time": 12.170993550999924, + "tokens / sec": 4764.192749467687, + "mem allocated avg": 6890132037.632, + "mem reserved avg": 14662515032.064, + "elapsed time": 244.05737383899998 + }, + { + "step": 750, + "valid accuracy": 0.38, + "train loss": 0.667268633723259, + "train samples": 3000, + "train time": 44.76929137299828, + "eval time": 8.243386759000032, + "tokens / sec": 4789.0192903368525, + "mem allocated avg": 6900972326.912, + "mem reserved avg": 14823525974.016, + "elapsed time": 357.2643382499999 + }, + { + "step": 1000, + "valid accuracy": 0.48, + "train loss": 0.6478440872430802, + "train samples": 4000, + "train time": 43.91589877199954, + "eval time": 9.950706549000074, + "tokens / sec": 4743.976687842116, + "mem allocated avg": 6892131758.08, + "mem reserved avg": 14678444998.656, + "elapsed time": 470.61746281599994 + }, + { + "step": 1250, + "valid accuracy": 0.4, + "train loss": 0.6435494017601013, + "train samples": 5000, + "train time": 44.14956537599949, + "eval time": 16.547810228000117, + "tokens / sec": 4723.444007296278, + "mem allocated avg": 6892566360.064, + "mem reserved avg": 14674737233.92, + "elapsed time": 591.057877963 + }, + { + "step": 1500, + "valid accuracy": 0.44, + "train loss": 0.6368351166248322, + "train samples": 6000, + "train time": 44.08414804900008, + "eval time": 16.39257521799982, + "tokens / sec": 4748.441543371237, + "mem allocated avg": 6893236697.088, + "mem reserved avg": 14706580389.888, + "elapsed time": 711.4482007859999 + }, + { + "step": 1750, + "valid accuracy": 0.48, + "train loss": 0.6278127529621125, + "train samples": 7000, + "train time": 44.35628801999951, + "eval time": 16.51757288099998, + "tokens / sec": 4719.849413584954, + "mem allocated avg": 6894834587.648, + "mem reserved avg": 14716881600.512, + "elapsed time": 832.303061434 + }, + { + "step": 
2000, + "valid accuracy": 0.44, + "train loss": 0.6281237225532532, + "train samples": 8000, + "train time": 43.95804043099747, + "eval time": 16.465996583000106, + "tokens / sec": 4724.869397352412, + "mem allocated avg": 6891602710.528, + "mem reserved avg": 14655669927.936, + "elapsed time": 952.480474365 + }, + { + "step": 2250, + "valid accuracy": 0.42, + "train loss": 0.6159191156625747, + "train samples": 9000, + "train time": 44.99231110500091, + "eval time": 16.5404373570002, + "tokens / sec": 4777.4385160692145, + "mem allocated avg": 6903352731.648, + "mem reserved avg": 14850520514.56, + "elapsed time": 1074.326083797 + }, + { + "step": 2500, + "valid accuracy": 0.44, + "train loss": 0.6119081476926803, + "train samples": 10000, + "train time": 43.74939265700118, + "eval time": 16.33099729599985, + "tokens / sec": 4707.882498273705, + "mem allocated avg": 6887975004.16, + "mem reserved avg": 14597494931.456, + "elapsed time": 1194.094911997 + }, + { + "step": 2750, + "valid accuracy": 0.44, + "train loss": 0.6010881408452987, + "train samples": 11000, + "train time": 43.686495668999896, + "eval time": 11.229614545000004, + "tokens / sec": 4850.0342441142875, + "mem allocated avg": 6899207546.88, + "mem reserved avg": 14785391362.048, + "elapsed time": 1308.783695182 + }, + { + "step": 3000, + "valid accuracy": 0.5, + "train loss": 0.5899516706466674, + "train samples": 12000, + "train time": 43.49030302700089, + "eval time": 16.45857661900004, + "tokens / sec": 4799.483688821613, + "mem allocated avg": 6894123913.216, + "mem reserved avg": 14693427052.544, + "elapsed time": 1428.4006117669999 + }, + { + "step": 3250, + "valid accuracy": 0.52, + "train loss": 0.5989595657587051, + "train samples": 13000, + "train time": 44.46332806799887, + "eval time": 16.496417500999996, + "tokens / sec": 4743.257177633304, + "mem allocated avg": 6895596777.472, + "mem reserved avg": 14723995140.096, + "elapsed time": 1549.445265484 + }, + { + "step": 3500, + "valid 
accuracy": 0.46, + "train loss": 0.579978278040886, + "train samples": 14000, + "train time": 43.63575344299579, + "eval time": 10.30441635599982, + "tokens / sec": 4806.838050224342, + "mem allocated avg": 6893774680.064, + "mem reserved avg": 14699450073.088, + "elapsed time": 1663.316950223 + }, + { + "step": 3750, + "valid accuracy": 0.44, + "train loss": 0.5772325273752212, + "train samples": 15000, + "train time": 45.25726027099972, + "eval time": 16.524598716000128, + "tokens / sec": 4788.2483098266675, + "mem allocated avg": 6905177583.616, + "mem reserved avg": 14889795977.216, + "elapsed time": 1785.1977310290001 + }, + { + "step": 4000, + "valid accuracy": 0.4, + "train loss": 0.5859311088323593, + "train samples": 16000, + "train time": 43.383903580999686, + "eval time": 16.386461492000308, + "tokens / sec": 4710.802466597467, + "mem allocated avg": 6886734053.376, + "mem reserved avg": 14584660361.216, + "elapsed time": 1904.6209389110002 + }, + { + "step": 4250, + "valid accuracy": 0.5, + "train loss": 0.5724418247938157, + "train samples": 17000, + "train time": 44.42285394400233, + "eval time": 9.048803244000283, + "tokens / sec": 4758.564145078759, + "mem allocated avg": 6896789555.2, + "mem reserved avg": 14740688470.016, + "elapsed time": 2018.321323589 + }, + { + "step": 4500, + "valid accuracy": 0.46, + "train loss": 0.5792494393587112, + "train samples": 18000, + "train time": 43.636566284001674, + "eval time": 16.3964514889999, + "tokens / sec": 4762.4737163655245, + "mem allocated avg": 6892818855.936, + "mem reserved avg": 14655921586.176, + "elapsed time": 2137.859151554 + }, + { + "step": 4750, + "valid accuracy": 0.46, + "train loss": 0.5680228790044785, + "train samples": 19000, + "train time": 43.96985955700529, + "eval time": 16.500367100000403, + "tokens / sec": 4774.61156608476, + "mem allocated avg": 6894185185.28, + "mem reserved avg": 14706722996.224, + "elapsed time": 2258.0618387639997 + }, + { + "step": 5000, + "valid 
accuracy": 0.44, + "train loss": 0.5760680929422378, + "train samples": 20000, + "train time": 43.83249596400128, + "eval time": 16.474086973999874, + "tokens / sec": 4751.7257555001215, + "mem allocated avg": 6891346642.944, + "mem reserved avg": 14655552487.424, + "elapsed time": 2377.7959423069997 + }, + { + "step": 5000, + "test accuracy": 0.5049279757391963, + "train loss": 0.5760680929422378, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.17.1.dev0", + "peft-commit-hash": "47961bb54706e45fd3b5460baa4921a48bcdce35", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.14.0-1010-aws", + "version": "#10~24.04.1-Ubuntu SMP Fri Jul 18 20:44:30 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/results/miss--llama-3.2-3B-default.json b/peft/method_comparison/MetaMathQA/results/miss--llama-3.2-3B-default.json new file mode 100644 index 
0000000000000000000000000000000000000000..89af4592383509c5f08d994cb24616421c056bf7 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/miss--llama-3.2-3B-default.json @@ -0,0 +1,352 @@ +{ + "run_info": { + "created_at": "2025-08-14T12:42:42+00:00", + "total_time": 1917.9635583239997, + "experiment_name": "miss/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "MISS", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 64, + "miss_dropout": 0.0, + "mini_r": 1, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "init_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "bias": "none", + "modules_to_save": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 11170868939, + "accelerator_memory_max": 20248002560, + "accelerator_memory_reserved_99th": 16301393182, + "train_time": 1713.3205038909991, + "file_size": 29367496, + "num_trainable_params": 7340032, + "num_total_params": 3220089856, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.36, + "train loss": 0.8771686832904816, + "train samples": 1000, + "train time": 29.625120898993828, + "eval time": 11.058316758999808, + "tokens / sec": 7146.603746254777, + "mem 
allocated avg": 6894354876.416, + "mem reserved avg": 11212691603.456, + "elapsed time": 89.49706801999992 + }, + { + "step": 500, + "valid accuracy": 0.42, + "train loss": 0.6949640859365463, + "train samples": 2000, + "train time": 29.06092399400177, + "eval time": 5.4939734129998214, + "tokens / sec": 7157.205326400859, + "mem allocated avg": 6887297284.096, + "mem reserved avg": 11116172279.808, + "elapsed time": 166.80778670399968 + }, + { + "step": 750, + "valid accuracy": 0.42, + "train loss": 0.6677889958620071, + "train samples": 3000, + "train time": 29.654036860997167, + "eval time": 6.225696284000151, + "tokens / sec": 7230.078016190556, + "mem allocated avg": 6897885888.512, + "mem reserved avg": 11257109282.816, + "elapsed time": 245.76960384799986 + }, + { + "step": 1000, + "valid accuracy": 0.38, + "train loss": 0.6483739440441132, + "train samples": 4000, + "train time": 28.837856293007462, + "eval time": 10.98694702900002, + "tokens / sec": 7224.392752470884, + "mem allocated avg": 6888501639.168, + "mem reserved avg": 11141564596.224, + "elapsed time": 328.02559429799976 + }, + { + "step": 1250, + "valid accuracy": 0.46, + "train loss": 0.6433384964466095, + "train samples": 5000, + "train time": 28.81160366599852, + "eval time": 7.707165779999741, + "tokens / sec": 7237.986556302045, + "mem allocated avg": 6888334700.544, + "mem reserved avg": 11139123511.296, + "elapsed time": 407.06604839199963 + }, + { + "step": 1500, + "valid accuracy": 0.5, + "train loss": 0.6369507477283478, + "train samples": 6000, + "train time": 28.99961056100119, + "eval time": 8.123675749000085, + "tokens / sec": 7218.407280320836, + "mem allocated avg": 6890289985.536, + "mem reserved avg": 11163484028.928, + "elapsed time": 486.7935630989996 + }, + { + "step": 1750, + "valid accuracy": 0.42, + "train loss": 0.6278414962291717, + "train samples": 7000, + "train time": 29.449354215004405, + "eval time": 11.046255440000095, + "tokens / sec": 7108.984410032798, + "mem 
allocated avg": 6891426932.736, + "mem reserved avg": 11175706230.784, + "elapsed time": 570.2098619899998 + }, + { + "step": 2000, + "valid accuracy": 0.42, + "train loss": 0.62835728931427, + "train samples": 8000, + "train time": 28.844003398995028, + "eval time": 11.063917559999936, + "tokens / sec": 7200.664801170994, + "mem allocated avg": 6888937164.8, + "mem reserved avg": 11125752070.144, + "elapsed time": 652.66749592 + }, + { + "step": 2250, + "valid accuracy": 0.46, + "train loss": 0.616273587346077, + "train samples": 9000, + "train time": 29.490800742004012, + "eval time": 8.136742810000214, + "tokens / sec": 7288.645767215389, + "mem allocated avg": 6899370121.216, + "mem reserved avg": 11286914007.04, + "elapsed time": 733.5518891469997 + }, + { + "step": 2500, + "valid accuracy": 0.48, + "train loss": 0.6127588752508163, + "train samples": 10000, + "train time": 28.812003271001686, + "eval time": 11.006928690999757, + "tokens / sec": 7148.652527306175, + "mem allocated avg": 6884932614.144, + "mem reserved avg": 11077299470.336, + "elapsed time": 815.9489541349999 + }, + { + "step": 2750, + "valid accuracy": 0.48, + "train loss": 0.6011098005771637, + "train samples": 11000, + "train time": 29.451534630989954, + "eval time": 11.065294603999973, + "tokens / sec": 7194.226129630993, + "mem allocated avg": 6895703631.872, + "mem reserved avg": 11229007446.016, + "elapsed time": 899.4794512619997 + }, + { + "step": 3000, + "valid accuracy": 0.5, + "train loss": 0.590887265920639, + "train samples": 12000, + "train time": 29.118879764002486, + "eval time": 11.043336514999282, + "tokens / sec": 7168.235924310477, + "mem allocated avg": 6890226739.2, + "mem reserved avg": 11156563427.328, + "elapsed time": 982.334967583 + }, + { + "step": 3250, + "valid accuracy": 0.52, + "train loss": 0.6000960898399353, + "train samples": 13000, + "train time": 29.13528394500463, + "eval time": 11.077541423999719, + "tokens / sec": 7238.680096548703, + "mem allocated 
avg": 6892138940.416, + "mem reserved avg": 11182651998.208, + "elapsed time": 1065.5038535119998 + }, + { + "step": 3500, + "valid accuracy": 0.46, + "train loss": 0.5813224712610244, + "train samples": 14000, + "train time": 29.210709365002003, + "eval time": 11.022432370000388, + "tokens / sec": 7180.585633134473, + "mem allocated avg": 6891394273.28, + "mem reserved avg": 11167116296.192, + "elapsed time": 1148.6551861069997 + }, + { + "step": 3750, + "valid accuracy": 0.5, + "train loss": 0.5779635999202728, + "train samples": 15000, + "train time": 29.93350169399855, + "eval time": 11.012248770000042, + "tokens / sec": 7239.48043951862, + "mem allocated avg": 6900874864.64, + "mem reserved avg": 11322674642.944, + "elapsed time": 1233.146194169 + }, + { + "step": 4000, + "valid accuracy": 0.5, + "train loss": 0.5870059201717377, + "train samples": 16000, + "train time": 28.73894150599881, + "eval time": 11.028763495000021, + "tokens / sec": 7111.361424266106, + "mem allocated avg": 6883623936.0, + "mem reserved avg": 11058022449.152, + "elapsed time": 1315.630321268 + }, + { + "step": 4250, + "valid accuracy": 0.48, + "train loss": 0.5732149496078491, + "train samples": 17000, + "train time": 29.274482168998475, + "eval time": 11.023004681000202, + "tokens / sec": 7220.930460175991, + "mem allocated avg": 6893432758.272, + "mem reserved avg": 11193867567.104, + "elapsed time": 1399.0288222240001 + }, + { + "step": 4500, + "valid accuracy": 0.48, + "train loss": 0.5802423723936081, + "train samples": 18000, + "train time": 28.807760504997532, + "eval time": 11.07264679400032, + "tokens / sec": 7213.958890138232, + "mem allocated avg": 6888416004.096, + "mem reserved avg": 11124485390.336, + "elapsed time": 1481.5334540769995 + }, + { + "step": 4750, + "valid accuracy": 0.52, + "train loss": 0.5696245921850205, + "train samples": 19000, + "train time": 29.20943511798214, + "eval time": 11.082792330000302, + "tokens / sec": 7187.369394581538, + "mem allocated 
avg": 6890813089.792, + "mem reserved avg": 11168844349.44, + "elapsed time": 1565.0750862589998 + }, + { + "step": 5000, + "valid accuracy": 0.5, + "train loss": 0.5774132673740386, + "train samples": 20000, + "train time": 29.084354959996745, + "eval time": 11.092419973000688, + "tokens / sec": 7161.238414483417, + "mem allocated avg": 6887869800.448, + "mem reserved avg": 11118328152.064, + "elapsed time": 1648.4280528519998 + }, + { + "step": 5000, + "test accuracy": 0.5087187263078089, + "train loss": 0.5774132673740386, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.17.1.dev0", + "peft-commit-hash": "47961bb54706e45fd3b5460baa4921a48bcdce35", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.14.0-1010-aws", + "version": "#10~24.04.1-Ubuntu SMP Fri Jul 18 20:44:30 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/results/miss--llama-3.2-3B-mini.json b/peft/method_comparison/MetaMathQA/results/miss--llama-3.2-3B-mini.json new file mode 100644 index 
0000000000000000000000000000000000000000..66e5f975921692c2d4349a18e3d5975511344864 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/miss--llama-3.2-3B-mini.json @@ -0,0 +1,352 @@ +{ + "run_info": { + "created_at": "2025-08-14T13:14:44+00:00", + "total_time": 1939.2463944070005, + "experiment_name": "miss/llama-3.2-3B-mini", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "MISS", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 64, + "miss_dropout": 0.0, + "mini_r": 64, + "target_modules": [ + "q_proj", + "v_proj" + ], + "exclude_modules": null, + "init_weights": "mini", + "layers_to_transform": null, + "layers_pattern": null, + "bias": "none", + "modules_to_save": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 11076096963, + "accelerator_memory_max": 20189282304, + "accelerator_memory_reserved_99th": 16251103477, + "train_time": 1757.4722608399989, + "file_size": 924568, + "num_trainable_params": 229376, + "num_total_params": 3212979200, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.34, + "train loss": 1.0204485692977905, + "train samples": 1000, + "train time": 30.37152520300151, + "eval time": 11.248587610999493, + "tokens / sec": 6970.970294869372, + "mem 
allocated avg": 6780477966.336, + "mem reserved avg": 11118412038.144, + "elapsed time": 90.66597219600044 + }, + { + "step": 500, + "valid accuracy": 0.34, + "train loss": 0.747962894320488, + "train samples": 2000, + "train time": 29.572977570002877, + "eval time": 11.171488900999975, + "tokens / sec": 7033.278928631729, + "mem allocated avg": 6773512382.464, + "mem reserved avg": 11022605746.176, + "elapsed time": 174.9062917200008 + }, + { + "step": 750, + "valid accuracy": 0.36, + "train loss": 0.7062408643960952, + "train samples": 3000, + "train time": 30.206997992999277, + "eval time": 8.297702855000352, + "tokens / sec": 7097.726164304351, + "mem allocated avg": 6784103079.936, + "mem reserved avg": 11160933892.096, + "elapsed time": 257.1565456070002 + }, + { + "step": 1000, + "valid accuracy": 0.42, + "train loss": 0.688418450832367, + "train samples": 4000, + "train time": 29.89673271099673, + "eval time": 8.431126997999854, + "tokens / sec": 6968.520674614356, + "mem allocated avg": 6774552799.232, + "mem reserved avg": 11046932709.376, + "elapsed time": 338.98425150300045 + }, + { + "step": 1250, + "valid accuracy": 0.26, + "train loss": 0.6864906589984894, + "train samples": 5000, + "train time": 29.511754502003896, + "eval time": 11.189089829000295, + "tokens / sec": 7066.269136450018, + "mem allocated avg": 6774476697.6, + "mem reserved avg": 11043107504.128, + "elapsed time": 423.2539680640002 + }, + { + "step": 1500, + "valid accuracy": 0.34, + "train loss": 0.6819815402030944, + "train samples": 6000, + "train time": 29.53373699200074, + "eval time": 10.82430943000054, + "tokens / sec": 7087.860234439605, + "mem allocated avg": 6776410671.104, + "mem reserved avg": 11066327171.072, + "elapsed time": 507.00703521000014 + }, + { + "step": 1750, + "valid accuracy": 0.24, + "train loss": 0.6748508417606354, + "train samples": 7000, + "train time": 29.92787808700814, + "eval time": 11.135467017999872, + "tokens / sec": 6995.317188587526, + "mem 
allocated avg": 6777799206.912, + "mem reserved avg": 11081728655.36, + "elapsed time": 591.3889150910009 + }, + { + "step": 2000, + "valid accuracy": 0.32, + "train loss": 0.6793494290113449, + "train samples": 8000, + "train time": 29.828631155996845, + "eval time": 7.671241181000369, + "tokens / sec": 6962.974563391727, + "mem allocated avg": 6775091949.568, + "mem reserved avg": 11030155493.376, + "elapsed time": 672.1807907450002 + }, + { + "step": 2250, + "valid accuracy": 0.36, + "train loss": 0.6712708432674408, + "train samples": 9000, + "train time": 30.12409129900061, + "eval time": 7.389505904999169, + "tokens / sec": 7135.418554754249, + "mem allocated avg": 6785428178.944, + "mem reserved avg": 11193422970.88, + "elapsed time": 753.4013944540002 + }, + { + "step": 2500, + "valid accuracy": 0.36, + "train loss": 0.670761358499527, + "train samples": 10000, + "train time": 29.392454811994867, + "eval time": 11.273840481000661, + "tokens / sec": 7007.478664760802, + "mem allocated avg": 6770948311.04, + "mem reserved avg": 10981837111.296, + "elapsed time": 837.4394236850003 + }, + { + "step": 2750, + "valid accuracy": 0.36, + "train loss": 0.6636076529026032, + "train samples": 11000, + "train time": 30.132865259004575, + "eval time": 7.065399131999584, + "tokens / sec": 7031.558339334618, + "mem allocated avg": 6781682612.224, + "mem reserved avg": 11132194521.088, + "elapsed time": 918.3523091420002 + }, + { + "step": 3000, + "valid accuracy": 0.3, + "train loss": 0.6547267787456512, + "train samples": 12000, + "train time": 29.80804876098864, + "eval time": 6.651864860000387, + "tokens / sec": 7002.504648112936, + "mem allocated avg": 6776379066.368, + "mem reserved avg": 11060597751.808, + "elapsed time": 998.1520945420007 + }, + { + "step": 3250, + "valid accuracy": 0.36, + "train loss": 0.6653184123039245, + "train samples": 13000, + "train time": 29.843793005994485, + "eval time": 11.134645372000705, + "tokens / sec": 7066.829607001965, + "mem 
allocated avg": 6778676955.136, + "mem reserved avg": 11088607313.92, + "elapsed time": 1082.892349787 + }, + { + "step": 3500, + "valid accuracy": 0.4, + "train loss": 0.6504588623046875, + "train samples": 14000, + "train time": 30.091547277996142, + "eval time": 11.186960818999978, + "tokens / sec": 6970.395974067295, + "mem allocated avg": 6777435619.328, + "mem reserved avg": 11074858385.408, + "elapsed time": 1168.1813894270008 + }, + { + "step": 3750, + "valid accuracy": 0.38, + "train loss": 0.6486766980886459, + "train samples": 15000, + "train time": 30.235947965997184, + "eval time": 6.424060680000366, + "tokens / sec": 7167.064854182855, + "mem allocated avg": 6787226097.664, + "mem reserved avg": 11226214039.552, + "elapsed time": 1249.1344440330004 + }, + { + "step": 4000, + "valid accuracy": 0.34, + "train loss": 0.6649546232223511, + "train samples": 16000, + "train time": 29.315789502004918, + "eval time": 6.29557701000067, + "tokens / sec": 6971.430872977951, + "mem allocated avg": 6769964711.936, + "mem reserved avg": 10964573356.032, + "elapsed time": 1328.0223749040006 + }, + { + "step": 4250, + "valid accuracy": 0.38, + "train loss": 0.6468708947896957, + "train samples": 17000, + "train time": 29.780288893992292, + "eval time": 6.263248704000034, + "tokens / sec": 7098.2857403591015, + "mem allocated avg": 6779703865.344, + "mem reserved avg": 11102406574.08, + "elapsed time": 1408.0423461730006 + }, + { + "step": 4500, + "valid accuracy": 0.36, + "train loss": 0.6554104331731796, + "train samples": 18000, + "train time": 29.55899746599971, + "eval time": 8.381054077999579, + "tokens / sec": 7030.617335349179, + "mem allocated avg": 6774673686.528, + "mem reserved avg": 11030071607.296, + "elapsed time": 1489.5959585560004 + }, + { + "step": 4750, + "valid accuracy": 0.3, + "train loss": 0.6466003597974778, + "train samples": 19000, + "train time": 29.626044395983627, + "eval time": 8.314826920000087, + "tokens / sec": 7086.298703733166, + 
"mem allocated avg": 6776780376.064, + "mem reserved avg": 11071855263.744, + "elapsed time": 1571.4978439640008 + }, + { + "step": 5000, + "valid accuracy": 0.36, + "train loss": 0.6535431078672409, + "train samples": 20000, + "train time": 29.328363572000853, + "eval time": 8.339948383999399, + "tokens / sec": 7101.657734454723, + "mem allocated avg": 6774118805.504, + "mem reserved avg": 11025097162.752, + "elapsed time": 1652.8993119440001 + }, + { + "step": 5000, + "test accuracy": 0.3912054586808188, + "train loss": 0.6535431078672409, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.17.1.dev0", + "peft-commit-hash": "47961bb54706e45fd3b5460baa4921a48bcdce35", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.14.0-1010-aws", + "version": "#10~24.04.1-Ubuntu SMP Fri Jul 18 20:44:30 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/results/oft--llama-3.2-3B-rank32.json b/peft/method_comparison/MetaMathQA/results/oft--llama-3.2-3B-rank32.json new file mode 100644 index 
0000000000000000000000000000000000000000..b57f300fa300c0d85a3554971d8da372e1baa254 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/oft--llama-3.2-3B-rank32.json @@ -0,0 +1,358 @@ +{ + "run_info": { + "created_at": "2025-07-31T14:11:12+00:00", + "total_time": 2493.9155955019996, + "experiment_name": "oft/llama-3.2-3B-rank32", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "OFT", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 32, + "oft_block_size": 0, + "module_dropout": 0.0, + "target_modules": [ + "v_proj", + "q_proj" + ], + "fan_in_fan_out": false, + "bias": "none", + "exclude_modules": null, + "init_weights": true, + "layers_to_transform": null, + "layers_pattern": null, + "modules_to_save": null, + "coft": false, + "eps": 6e-05, + "block_share": false, + "use_cayley_neumann": true, + "num_cayley_neumann_terms": 5 + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 12057354384, + "accelerator_memory_max": 22294822912, + "accelerator_memory_reserved_99th": 17939310837, + "train_time": 2214.446992367006, + "file_size": 32693568, + "num_trainable_params": 8171520, + "num_total_params": 3220921344, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.36, + "train loss": 
0.9631057088375091, + "train samples": 1000, + "train time": 43.418166981995455, + "eval time": 16.96007740999994, + "tokens / sec": 4876.276791873667, + "mem allocated avg": 6903823460.352, + "mem reserved avg": 12108561383.424, + "elapsed time": 113.91408998500083 + }, + { + "step": 500, + "valid accuracy": 0.36, + "train loss": 0.7144306401014328, + "train samples": 2000, + "train time": 42.455775934988196, + "eval time": 16.150497423999695, + "tokens / sec": 4899.097835792689, + "mem allocated avg": 6896105342.976, + "mem reserved avg": 11994249822.208, + "elapsed time": 220.49977440600014 + }, + { + "step": 750, + "valid accuracy": 0.52, + "train loss": 0.6711842056512832, + "train samples": 3000, + "train time": 43.15603912099323, + "eval time": 10.51256339000065, + "tokens / sec": 4968.041654585135, + "mem allocated avg": 6906686986.24, + "mem reserved avg": 12155101380.608, + "elapsed time": 322.5515955810006 + }, + { + "step": 1000, + "valid accuracy": 0.48, + "train loss": 0.6508683092594146, + "train samples": 4000, + "train time": 42.42713372799517, + "eval time": 16.934662378998837, + "tokens / sec": 4910.442485595753, + "mem allocated avg": 6897939019.776, + "mem reserved avg": 12025262505.984, + "elapsed time": 429.7382754350001 + }, + { + "step": 1250, + "valid accuracy": 0.4, + "train loss": 0.6453732433319092, + "train samples": 5000, + "train time": 42.549762738994104, + "eval time": 16.92903551499876, + "tokens / sec": 4901.03790423462, + "mem allocated avg": 6897900118.016, + "mem reserved avg": 12017234608.128, + "elapsed time": 537.135011331 + }, + { + "step": 1500, + "valid accuracy": 0.5, + "train loss": 0.636857116818428, + "train samples": 6000, + "train time": 42.7670685170051, + "eval time": 16.97714005600028, + "tokens / sec": 4894.677312679627, + "mem allocated avg": 6899436058.624, + "mem reserved avg": 12045822984.192, + "elapsed time": 644.8122739440005 + }, + { + "step": 1750, + "valid accuracy": 0.48, + "train loss": 
0.6280697054862976, + "train samples": 7000, + "train time": 42.93359049599712, + "eval time": 11.770931148001182, + "tokens / sec": 4876.251848060996, + "mem allocated avg": 6900382935.04, + "mem reserved avg": 12059630632.96, + "elapsed time": 747.525349122001 + }, + { + "step": 2000, + "valid accuracy": 0.4, + "train loss": 0.6299525223970414, + "train samples": 8000, + "train time": 42.82682755300084, + "eval time": 11.5680384089992, + "tokens / sec": 4849.670448808364, + "mem allocated avg": 6896952041.472, + "mem reserved avg": 12003611508.736, + "elapsed time": 849.5279627600012 + }, + { + "step": 2250, + "valid accuracy": 0.42, + "train loss": 0.6208749743700027, + "train samples": 9000, + "train time": 43.43083962600576, + "eval time": 16.986704689999897, + "tokens / sec": 4949.20203825146, + "mem allocated avg": 6908628027.392, + "mem reserved avg": 12188169273.344, + "elapsed time": 958.0240945160003 + }, + { + "step": 2500, + "valid accuracy": 0.42, + "train loss": 0.6179436918497085, + "train samples": 10000, + "train time": 42.63891591101674, + "eval time": 17.232789900999705, + "tokens / sec": 4830.493355643306, + "mem allocated avg": 6893492830.208, + "mem reserved avg": 11953867063.296, + "elapsed time": 1065.2266578140006 + }, + { + "step": 2750, + "valid accuracy": 0.42, + "train loss": 0.6097300077676773, + "train samples": 11000, + "train time": 43.157022238001446, + "eval time": 17.135427543998958, + "tokens / sec": 4909.537058222485, + "mem allocated avg": 6904392247.296, + "mem reserved avg": 12124977889.28, + "elapsed time": 1173.5244531360004 + }, + { + "step": 3000, + "valid accuracy": 0.42, + "train loss": 0.600518134355545, + "train samples": 12000, + "train time": 42.90499155000907, + "eval time": 17.038416949999373, + "tokens / sec": 4864.958422301702, + "mem allocated avg": 6898886381.568, + "mem reserved avg": 12038994657.28, + "elapsed time": 1281.100714346001 + }, + { + "step": 3250, + "valid accuracy": 0.54, + "train loss": 
0.6095727566480637, + "train samples": 13000, + "train time": 42.991201876006016, + "eval time": 17.145920277998812, + "tokens / sec": 4905.678157318666, + "mem allocated avg": 6900920473.6, + "mem reserved avg": 12070426771.456, + "elapsed time": 1389.080374264 + }, + { + "step": 3500, + "valid accuracy": 0.54, + "train loss": 0.59402192902565, + "train samples": 14000, + "train time": 43.139979139998104, + "eval time": 10.18719298600081, + "tokens / sec": 4862.079309758545, + "mem allocated avg": 6899826102.272, + "mem reserved avg": 12054404530.176, + "elapsed time": 1490.7450829120007 + }, + { + "step": 3750, + "valid accuracy": 0.58, + "train loss": 0.5927710949182511, + "train samples": 15000, + "train time": 43.49427866901169, + "eval time": 10.884315328999946, + "tokens / sec": 4982.333461582249, + "mem allocated avg": 6910839183.36, + "mem reserved avg": 12223619530.752, + "elapsed time": 1593.6702795590008 + }, + { + "step": 4000, + "valid accuracy": 0.52, + "train loss": 0.6036465883255004, + "train samples": 16000, + "train time": 42.54699739801072, + "eval time": 10.508950370000093, + "tokens / sec": 4803.464697829781, + "mem allocated avg": 6892073494.528, + "mem reserved avg": 11931788247.04, + "elapsed time": 1694.1543825910012 + }, + { + "step": 4250, + "valid accuracy": 0.5, + "train loss": 0.5904108211994171, + "train samples": 17000, + "train time": 42.904117188016244, + "eval time": 10.362485865000053, + "tokens / sec": 4927.009663749569, + "mem allocated avg": 6902539771.904, + "mem reserved avg": 12087044603.904, + "elapsed time": 1795.3652429800004 + }, + { + "step": 4500, + "valid accuracy": 0.56, + "train loss": 0.5975252593755722, + "train samples": 18000, + "train time": 42.7045542899923, + "eval time": 9.970661539999128, + "tokens / sec": 4866.413043179837, + "mem allocated avg": 6897064284.16, + "mem reserved avg": 12006883065.856, + "elapsed time": 1895.7771126360003 + }, + { + "step": 4750, + "valid accuracy": 0.54, + "train loss": 
0.588557964682579, + "train samples": 19000, + "train time": 42.698231221012975, + "eval time": 10.72399718899942, + "tokens / sec": 4916.8078863342525, + "mem allocated avg": 6900484192.256, + "mem reserved avg": 12052575813.632, + "elapsed time": 1997.1282366079995 + }, + { + "step": 5000, + "valid accuracy": 0.56, + "train loss": 0.5946548076868057, + "train samples": 20000, + "train time": 42.98944765599845, + "eval time": 10.321189939999385, + "tokens / sec": 4844.909887343902, + "mem allocated avg": 6896923324.416, + "mem reserved avg": 12004861411.328, + "elapsed time": 2098.129397994 + }, + { + "step": 5000, + "test accuracy": 0.5056861258529188, + "train loss": 0.5946548076868057, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.16.1.dev0", + "peft-commit-hash": "25e5c6b25c4589eb2683484ede1ba3d985d8a760", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1031-aws", + "version": "#33-Ubuntu SMP Fri Jun 20 18:11:07 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 
(Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/results/prefixtuning--llama-3.2-3B-lr_0.001.json b/peft/method_comparison/MetaMathQA/results/prefixtuning--llama-3.2-3B-lr_0.001.json new 
file mode 100644 index 0000000000000000000000000000000000000000..9c1717d39a8e3e171f29ff94bad443c9e912c558 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/prefixtuning--llama-3.2-3B-lr_0.001.json @@ -0,0 +1,345 @@ +{ + "run_info": { + "created_at": "2025-06-19T20:20:55+00:00", + "total_time": 1959.214138561998, + "experiment_name": "prefixtuning/llama-3.2-3B-lr_0.001", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.001 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "PREFIX_TUNING", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "num_virtual_tokens": 200, + "token_dim": 1024, + "num_transformer_submodules": 1, + "num_attention_heads": 8, + "num_layers": 28, + "encoder_hidden_size": 3072, + "prefix_projection": false + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 11766684083, + "accelerator_memory_max": 20912799744, + "accelerator_memory_reserved_99th": 16945051074, + "train_time": 1661.6597991429953, + "file_size": 45875328, + "num_trainable_params": 11468800, + "num_total_params": 3224218624, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 7.371294965744019, + "train samples": 1000, + "train time": 27.91846932898261, + "eval time": 15.451216622001084, + "tokens / sec": 7583.474491569318, + "mem allocated avg": 
7053410574.336, + "mem reserved avg": 11800925962.24, + "elapsed time": 86.14553656399949 + }, + { + "step": 500, + "valid accuracy": 0.0, + "train loss": 3.853111123085022, + "train samples": 2000, + "train time": 27.30431010902612, + "eval time": 15.427179872000124, + "tokens / sec": 7617.661796598262, + "mem allocated avg": 7047124914.176, + "mem reserved avg": 11721854943.232, + "elapsed time": 164.76258564100135 + }, + { + "step": 750, + "valid accuracy": 0.0, + "train loss": 1.7293416724205017, + "train samples": 3000, + "train time": 28.03611285903753, + "eval time": 15.425274275999982, + "tokens / sec": 7647.31548478152, + "mem allocated avg": 7057104787.456, + "mem reserved avg": 11848237711.36, + "elapsed time": 244.72437485599949 + }, + { + "step": 1000, + "valid accuracy": 0.0, + "train loss": 1.1541715533733368, + "train samples": 4000, + "train time": 27.01217528603229, + "eval time": 15.417352960001153, + "tokens / sec": 7712.670223479868, + "mem allocated avg": 7050079920.128, + "mem reserved avg": 11745879916.544, + "elapsed time": 322.8701755410002 + }, + { + "step": 1250, + "valid accuracy": 0.08, + "train loss": 1.01127068066597, + "train samples": 5000, + "train time": 27.13179545197636, + "eval time": 15.418993674997182, + "tokens / sec": 7686.111314273877, + "mem allocated avg": 7048705087.488, + "mem reserved avg": 11725462044.672, + "elapsed time": 401.46412933300235 + }, + { + "step": 1500, + "valid accuracy": 0.08, + "train loss": 0.9543052833080292, + "train samples": 6000, + "train time": 27.5597544680204, + "eval time": 15.42255902100078, + "tokens / sec": 7595.532109798078, + "mem allocated avg": 7050476988.416, + "mem reserved avg": 11746567782.4, + "elapsed time": 480.1674746890021 + }, + { + "step": 1750, + "valid accuracy": 0.18, + "train loss": 0.9019801757335663, + "train samples": 7000, + "train time": 27.391848403010954, + "eval time": 15.41637391599943, + "tokens / sec": 7642.967240465137, + "mem allocated avg": 
7051827261.44, + "mem reserved avg": 11769468682.24, + "elapsed time": 559.677030461 + }, + { + "step": 2000, + "valid accuracy": 0.12, + "train loss": 0.8851136872768403, + "train samples": 8000, + "train time": 27.14071328902719, + "eval time": 15.419172215999424, + "tokens / sec": 7652.562325396589, + "mem allocated avg": 7048325701.632, + "mem reserved avg": 11717526421.504, + "elapsed time": 638.8433813260017 + }, + { + "step": 2250, + "valid accuracy": 0.1, + "train loss": 0.8607708604335785, + "train samples": 9000, + "train time": 28.18215358697489, + "eval time": 15.430102889000409, + "tokens / sec": 7627.096323090928, + "mem allocated avg": 7058774517.76, + "mem reserved avg": 11887655780.352, + "elapsed time": 719.1557081280007 + }, + { + "step": 2500, + "valid accuracy": 0.16, + "train loss": 0.8404088478088378, + "train samples": 10000, + "train time": 26.82789152296391, + "eval time": 15.41505262499777, + "tokens / sec": 7677.3457885685175, + "mem allocated avg": 7045414729.728, + "mem reserved avg": 11679693799.424, + "elapsed time": 797.9182705759995 + }, + { + "step": 2750, + "valid accuracy": 0.14, + "train loss": 0.8259119842052459, + "train samples": 11000, + "train time": 27.303442178006662, + "eval time": 15.412094721999892, + "tokens / sec": 7760.230326221408, + "mem allocated avg": 7055038418.944, + "mem reserved avg": 11819196350.464, + "elapsed time": 877.4897030700013 + }, + { + "step": 3000, + "valid accuracy": 0.22, + "train loss": 0.8099327564239502, + "train samples": 12000, + "train time": 27.035110770961182, + "eval time": 12.827202022002894, + "tokens / sec": 7720.737738726083, + "mem allocated avg": 7049757696.0, + "mem reserved avg": 11756390842.368, + "elapsed time": 953.558885925002 + }, + { + "step": 3250, + "valid accuracy": 0.22, + "train loss": 0.8175602672100067, + "train samples": 13000, + "train time": 27.43706444997588, + "eval time": 15.41863539300175, + "tokens / sec": 7686.718831911532, + "mem allocated avg": 
7051605612.544, + "mem reserved avg": 11776833880.064, + "elapsed time": 1033.2767371779992 + }, + { + "step": 3500, + "valid accuracy": 0.18, + "train loss": 0.7965063021183014, + "train samples": 14000, + "train time": 27.750962379970588, + "eval time": 15.41753050600164, + "tokens / sec": 7558.296434122523, + "mem allocated avg": 7051713462.272, + "mem reserved avg": 11763227557.888, + "elapsed time": 1113.7006878970024 + }, + { + "step": 3750, + "valid accuracy": 0.26, + "train loss": 0.788856605052948, + "train samples": 15000, + "train time": 27.955327479998232, + "eval time": 11.66996129099789, + "tokens / sec": 7751.760381095479, + "mem allocated avg": 7061477945.344, + "mem reserved avg": 11919800926.208, + "elapsed time": 1190.2235273900005 + }, + { + "step": 4000, + "valid accuracy": 0.26, + "train loss": 0.8037499711513519, + "train samples": 16000, + "train time": 26.957003097031702, + "eval time": 15.42233503099851, + "tokens / sec": 7581.443651742726, + "mem allocated avg": 7042861262.848, + "mem reserved avg": 11658604838.912, + "elapsed time": 1268.9300010120023 + }, + { + "step": 4250, + "valid accuracy": 0.28, + "train loss": 0.7835113587379455, + "train samples": 17000, + "train time": 27.92120910200174, + "eval time": 10.70234186000016, + "tokens / sec": 7570.911389537389, + "mem allocated avg": 7053768085.504, + "mem reserved avg": 11783242776.576, + "elapsed time": 1344.1117449459998 + }, + { + "step": 4500, + "valid accuracy": 0.28, + "train loss": 0.7824292014837265, + "train samples": 18000, + "train time": 26.99022845998479, + "eval time": 12.42347607800184, + "tokens / sec": 7699.749570779183, + "mem allocated avg": 7048212195.328, + "mem reserved avg": 11725470433.28, + "elapsed time": 1419.3379556770014 + }, + { + "step": 4750, + "valid accuracy": 0.28, + "train loss": 0.7803363995552063, + "train samples": 19000, + "train time": 27.08754148402295, + "eval time": 15.42501401300251, + "tokens / sec": 7750.389607112494, + "mem allocated 
avg": 7051630567.424, + "mem reserved avg": 11771876212.736, + "elapsed time": 1498.5280245210015 + }, + { + "step": 5000, + "valid accuracy": 0.26, + "train loss": 0.7887116296291351, + "train samples": 20000, + "train time": 26.98893836600837, + "eval time": 15.411349758000142, + "tokens / sec": 7717.235749529201, + "mem allocated avg": 7048690728.96, + "mem reserved avg": 11715764813.824, + "elapsed time": 1577.725424725002 + }, + { + "step": 5000, + "test accuracy": 0.1470811220621683, + "train loss": 0.7887116296291351, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/prompt_tuning--llama-3.2-3B-default.json b/peft/method_comparison/MetaMathQA/results/prompt_tuning--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..013c9ebf413306dd841d2bf777f0c7084b56e13d 
--- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/prompt_tuning--llama-3.2-3B-default.json @@ -0,0 +1,348 @@ +{ + "run_info": { + "created_at": "2025-06-20T08:46:44+00:00", + "total_time": 2700.1305744579877, + "experiment_name": "prompt_tuning/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "PROMPT_TUNING", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "num_virtual_tokens": 200, + "token_dim": 3072, + "num_transformer_submodules": 1, + "num_attention_heads": 24, + "num_layers": 28, + "prompt_tuning_init": "RANDOM", + "prompt_tuning_init_text": null, + "tokenizer_name_or_path": null, + "tokenizer_kwargs": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 15297773830, + "accelerator_memory_max": 24379392000, + "accelerator_memory_reserved_99th": 20669781770, + "train_time": 2379.557773831024, + "file_size": 2457728, + "num_trainable_params": 614400, + "num_total_params": 3213364224, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 3.462425223350525, + "train samples": 1000, + "train time": 46.206722402057494, + "eval time": 15.901069569998072, + "tokens / sec": 4581.9956273412845, + "mem allocated avg": 7082871494.656, + "mem 
reserved avg": 15331489742.848, + "elapsed time": 119.40567356300016 + }, + { + "step": 500, + "valid accuracy": 0.0, + "train loss": 2.259350722312927, + "train samples": 2000, + "train time": 45.66361523300293, + "eval time": 15.856271529002697, + "tokens / sec": 4554.939396249854, + "mem allocated avg": 7075523266.56, + "mem reserved avg": 15240674672.64, + "elapsed time": 232.12755202699918 + }, + { + "step": 750, + "valid accuracy": 0.0, + "train loss": 1.758247773170471, + "train samples": 3000, + "train time": 46.58154148896574, + "eval time": 15.854417883005226, + "tokens / sec": 4602.70298377282, + "mem allocated avg": 7085465481.216, + "mem reserved avg": 15376771448.832, + "elapsed time": 346.0752758900053 + }, + { + "step": 1000, + "valid accuracy": 0.0, + "train loss": 1.6028480381965637, + "train samples": 4000, + "train time": 45.41573346107907, + "eval time": 15.861826895998092, + "tokens / sec": 4587.30893729906, + "mem allocated avg": 7077486481.408, + "mem reserved avg": 15288170971.136, + "elapsed time": 458.6240012299968 + }, + { + "step": 1250, + "valid accuracy": 0.0, + "train loss": 1.5049157681465148, + "train samples": 5000, + "train time": 46.04039786210342, + "eval time": 15.877354786993237, + "tokens / sec": 4529.456948321703, + "mem allocated avg": 7076584331.264, + "mem reserved avg": 15265983102.976, + "elapsed time": 571.9228152269934 + }, + { + "step": 1500, + "valid accuracy": 0.0, + "train loss": 1.4375499501228333, + "train samples": 6000, + "train time": 45.70124057796784, + "eval time": 15.84707298700232, + "tokens / sec": 4580.4227052190045, + "mem allocated avg": 7078481408.0, + "mem reserved avg": 15279463596.032, + "elapsed time": 684.8850296739984 + }, + { + "step": 1750, + "valid accuracy": 0.0, + "train loss": 1.3827230257987977, + "train samples": 7000, + "train time": 44.976750778907444, + "eval time": 15.845691901995451, + "tokens / sec": 4654.7382008346485, + "mem allocated avg": 7079360505.856, + "mem reserved 
avg": 15298052751.36, + "elapsed time": 796.8428356289951 + }, + { + "step": 2000, + "valid accuracy": 0.0, + "train loss": 1.3338124525547028, + "train samples": 8000, + "train time": 45.10262611102371, + "eval time": 15.857041016992298, + "tokens / sec": 4604.964675199615, + "mem allocated avg": 7075931449.344, + "mem reserved avg": 15257242173.44, + "elapsed time": 908.9726742479979 + }, + { + "step": 2250, + "valid accuracy": 0.0, + "train loss": 1.2829065501689911, + "train samples": 9000, + "train time": 46.84363810600189, + "eval time": 15.872781344005489, + "tokens / sec": 4588.627371631486, + "mem allocated avg": 7087554078.72, + "mem reserved avg": 15416986435.584, + "elapsed time": 1023.331907868007 + }, + { + "step": 2500, + "valid accuracy": 0.0, + "train loss": 1.2462495183944702, + "train samples": 10000, + "train time": 45.55510413390584, + "eval time": 15.84976143699896, + "tokens / sec": 4521.271631705095, + "mem allocated avg": 7072915062.784, + "mem reserved avg": 15202909159.424, + "elapsed time": 1136.1328145180014 + }, + { + "step": 2750, + "valid accuracy": 0.0, + "train loss": 1.2045790712833404, + "train samples": 11000, + "train time": 45.34144312601711, + "eval time": 15.8525270359969, + "tokens / sec": 4673.009621928461, + "mem allocated avg": 7083153442.816, + "mem reserved avg": 15344005545.984, + "elapsed time": 1248.7101804669946 + }, + { + "step": 3000, + "valid accuracy": 0.0, + "train loss": 1.1678078708648683, + "train samples": 12000, + "train time": 45.599694666831056, + "eval time": 15.870247816987103, + "tokens / sec": 4577.464860786221, + "mem allocated avg": 7077996111.872, + "mem reserved avg": 15283892781.056, + "elapsed time": 1361.5449211609957 + }, + { + "step": 3250, + "valid accuracy": 0.04, + "train loss": 1.1313301923274994, + "train samples": 13000, + "train time": 45.95094640579191, + "eval time": 15.868188906999421, + "tokens / sec": 4589.698722144641, + "mem allocated avg": 7079686449.152, + "mem reserved 
avg": 15301248811.008, + "elapsed time": 1474.734694629995 + }, + { + "step": 3500, + "valid accuracy": 0.06, + "train loss": 1.1092858843803406, + "train samples": 14000, + "train time": 45.96525488591578, + "eval time": 15.86030059499899, + "tokens / sec": 4563.229346178814, + "mem allocated avg": 7078805225.472, + "mem reserved avg": 15302347718.656, + "elapsed time": 1588.1363447299955 + }, + { + "step": 3750, + "valid accuracy": 0.06, + "train loss": 1.079538120508194, + "train samples": 15000, + "train time": 46.46510764303093, + "eval time": 15.86466599200503, + "tokens / sec": 4663.779145091515, + "mem allocated avg": 7089610215.424, + "mem reserved avg": 15446287843.328, + "elapsed time": 1702.2553167559963 + }, + { + "step": 4000, + "valid accuracy": 0.04, + "train loss": 1.0899075508117675, + "train samples": 16000, + "train time": 45.08557640206709, + "eval time": 15.860410296008922, + "tokens / sec": 4533.001822521445, + "mem allocated avg": 7071494891.52, + "mem reserved avg": 15189319614.464, + "elapsed time": 1814.3939928110049 + }, + { + "step": 4250, + "valid accuracy": 0.04, + "train loss": 1.0607522547245025, + "train samples": 17000, + "train time": 46.2303190480452, + "eval time": 15.875090683999588, + "tokens / sec": 4572.518735601033, + "mem allocated avg": 7082239875.072, + "mem reserved avg": 15329283538.944, + "elapsed time": 1928.1608909490024 + }, + { + "step": 4500, + "valid accuracy": 0.04, + "train loss": 1.068591582775116, + "train samples": 18000, + "train time": 45.96484722109744, + "eval time": 15.854171614992083, + "tokens / sec": 4521.237697155087, + "mem allocated avg": 7076175783.936, + "mem reserved avg": 15251420479.488, + "elapsed time": 2041.5032397750037 + }, + { + "step": 4750, + "valid accuracy": 0.06, + "train loss": 1.0587167317867279, + "train samples": 19000, + "train time": 45.48911916205543, + "eval time": 15.858397545001935, + "tokens / sec": 4615.147619194169, + "mem allocated avg": 7079419088.896, + "mem 
reserved avg": 15298539290.624, + "elapsed time": 2154.3035376479966 + }, + { + "step": 5000, + "valid accuracy": 0.02, + "train loss": 1.0654937489032745, + "train samples": 20000, + "train time": 45.758550852071494, + "eval time": 15.85034008299408, + "tokens / sec": 4551.7175723796145, + "mem allocated avg": 7075618770.944, + "mem reserved avg": 15251386925.056, + "elapsed time": 2267.4055672899995 + }, + { + "step": 5000, + "test accuracy": 0.050037907505686124, + "train loss": 1.0654937489032745, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/prompt_tuning--llama-3.2-3B-lr_0.001.json b/peft/method_comparison/MetaMathQA/results/prompt_tuning--llama-3.2-3B-lr_0.001.json new file mode 100644 index 0000000000000000000000000000000000000000..2ce456649c41cd57278667f453ef1b9e9e8406f5 
--- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/prompt_tuning--llama-3.2-3B-lr_0.001.json @@ -0,0 +1,347 @@ +{ + "run_info": { + "created_at": "2025-06-20T08:01:25+00:00", + "total_time": 2714.5956150429993, + "experiment_name": "prompt_tuning/llama-3.2-3B-lr_0.001", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.001 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "PROMPT_TUNING", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "num_virtual_tokens": 200, + "token_dim": 3072, + "num_transformer_submodules": 1, + "num_attention_heads": 24, + "num_layers": 28, + "prompt_tuning_init": "RANDOM", + "prompt_tuning_init_text": null, + "tokenizer_name_or_path": null, + "tokenizer_kwargs": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 15297364466, + "accelerator_memory_max": 24408752128, + "accelerator_memory_reserved_99th": 20650676715, + "train_time": 2394.4007484640024, + "file_size": 2457728, + "num_trainable_params": 614400, + "num_total_params": 3213364224, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 2.454602773666382, + "train samples": 1000, + "train time": 46.58359175696387, + "eval time": 15.906975480989786, + "tokens / sec": 4544.926486231061, + "mem allocated avg": 7082736850.944, + "mem reserved avg": 
15330147565.568, + "elapsed time": 120.51601758999459 + }, + { + "step": 500, + "valid accuracy": 0.02, + "train loss": 1.4034885478019714, + "train samples": 2000, + "train time": 45.99672341402038, + "eval time": 15.859127072995761, + "tokens / sec": 4521.952534049426, + "mem allocated avg": 7075398952.96, + "mem reserved avg": 15237637996.544, + "elapsed time": 234.56530582100095 + }, + { + "step": 750, + "valid accuracy": 0.1, + "train loss": 1.051814435005188, + "train samples": 3000, + "train time": 45.34941398198134, + "eval time": 15.839738530004979, + "tokens / sec": 4727.756792738001, + "mem allocated avg": 7085216630.784, + "mem reserved avg": 15378130403.328, + "elapsed time": 347.9996997119888 + }, + { + "step": 1000, + "valid accuracy": 0.2, + "train loss": 0.9425526282787323, + "train samples": 4000, + "train time": 44.85872337181354, + "eval time": 15.849193180998554, + "tokens / sec": 4644.269482954245, + "mem allocated avg": 7077280739.328, + "mem reserved avg": 15280109518.848, + "elapsed time": 460.8599872249906 + }, + { + "step": 1250, + "valid accuracy": 0.2, + "train loss": 0.9085307500362396, + "train samples": 5000, + "train time": 45.535731699026655, + "eval time": 15.864107311004773, + "tokens / sec": 4579.656287909338, + "mem allocated avg": 7076838449.152, + "mem reserved avg": 15263508463.616, + "elapsed time": 574.5614464429964 + }, + { + "step": 1500, + "valid accuracy": 0.18, + "train loss": 0.8753413548469543, + "train samples": 6000, + "train time": 45.47140344994841, + "eval time": 15.851111587006017, + "tokens / sec": 4603.5746451155, + "mem allocated avg": 7078501443.584, + "mem reserved avg": 15280914825.216, + "elapsed time": 688.3081236659928 + }, + { + "step": 1750, + "valid accuracy": 0.18, + "train loss": 0.8501973593235016, + "train samples": 7000, + "train time": 45.876367467062664, + "eval time": 15.86328411300201, + "tokens / sec": 4563.460700115549, + "mem allocated avg": 7079126001.664, + "mem reserved avg": 
15302154780.672, + "elapsed time": 802.3839824919996 + }, + { + "step": 2000, + "valid accuracy": 0.3, + "train loss": 0.8353641645908356, + "train samples": 8000, + "train time": 45.395122604924836, + "eval time": 15.847279680005158, + "tokens / sec": 4575.293293237354, + "mem allocated avg": 7075813670.912, + "mem reserved avg": 15257200230.4, + "elapsed time": 915.8055839799927 + }, + { + "step": 2250, + "valid accuracy": 0.26, + "train loss": 0.8205823216438294, + "train samples": 9000, + "train time": 46.531550297062495, + "eval time": 15.857669960998464, + "tokens / sec": 4619.403364550472, + "mem allocated avg": 7087054014.464, + "mem reserved avg": 15417707855.872, + "elapsed time": 1030.8605109579948 + }, + { + "step": 2500, + "valid accuracy": 0.24, + "train loss": 0.8074139108657837, + "train samples": 10000, + "train time": 45.232053409854416, + "eval time": 15.864067172005889, + "tokens / sec": 4553.562893413265, + "mem allocated avg": 7073174814.72, + "mem reserved avg": 15210467295.232, + "elapsed time": 1144.3065934619954 + }, + { + "step": 2750, + "valid accuracy": 0.22, + "train loss": 0.800323983669281, + "train samples": 11000, + "train time": 46.27672885800712, + "eval time": 15.85089660200174, + "tokens / sec": 4578.564760921707, + "mem allocated avg": 7083499849.728, + "mem reserved avg": 15345020567.552, + "elapsed time": 1258.9190711479896 + }, + { + "step": 3000, + "valid accuracy": 0.28, + "train loss": 0.7878623747825623, + "train samples": 12000, + "train time": 45.57083585388318, + "eval time": 15.872650785997394, + "tokens / sec": 4580.3636490071885, + "mem allocated avg": 7078042595.328, + "mem reserved avg": 15285402730.496, + "elapsed time": 1372.7267461329902 + }, + { + "step": 3250, + "valid accuracy": 0.3, + "train loss": 0.7943042907714843, + "train samples": 13000, + "train time": 45.666222987070796, + "eval time": 15.852009978989372, + "tokens / sec": 4618.314942746877, + "mem allocated avg": 7079504875.52, + "mem reserved 
avg": 15299428483.072, + "elapsed time": 1486.5100108069892 + }, + { + "step": 3500, + "valid accuracy": 0.28, + "train loss": 0.780832305431366, + "train samples": 14000, + "train time": 45.84015418085619, + "eval time": 15.86955204399419, + "tokens / sec": 4575.6826901685245, + "mem allocated avg": 7078824071.168, + "mem reserved avg": 15300871323.648, + "elapsed time": 1600.7413567879994 + }, + { + "step": 3750, + "valid accuracy": 0.32, + "train loss": 0.7758122501373291, + "train samples": 15000, + "train time": 46.99727132692351, + "eval time": 15.8490629579901, + "tokens / sec": 4610.969826153641, + "mem allocated avg": 7089586788.352, + "mem reserved avg": 15444173914.112, + "elapsed time": 1716.2785189549904 + }, + { + "step": 4000, + "valid accuracy": 0.36, + "train loss": 0.7912874612808227, + "train samples": 16000, + "train time": 45.15887627698248, + "eval time": 15.855249352011015, + "tokens / sec": 4525.644056031772, + "mem allocated avg": 7071318118.4, + "mem reserved avg": 15188732411.904, + "elapsed time": 1829.5188424160006 + }, + { + "step": 4250, + "valid accuracy": 0.36, + "train loss": 0.7664959132671356, + "train samples": 17000, + "train time": 46.26589757904003, + "eval time": 15.853440922001028, + "tokens / sec": 4569.002463182864, + "mem allocated avg": 7081992153.088, + "mem reserved avg": 15327354159.104, + "elapsed time": 1944.2481972599926 + }, + { + "step": 4500, + "valid accuracy": 0.34, + "train loss": 0.7785169410705567, + "train samples": 18000, + "train time": 45.61058669183694, + "eval time": 15.866839458991308, + "tokens / sec": 4556.354457882774, + "mem allocated avg": 7075963725.824, + "mem reserved avg": 15250623561.728, + "elapsed time": 2058.0909812989994 + }, + { + "step": 4750, + "valid accuracy": 0.32, + "train loss": 0.7709811532497406, + "train samples": 19000, + "train time": 45.832340708962874, + "eval time": 15.847010081997723, + "tokens / sec": 4580.586475674911, + "mem allocated avg": 7079141249.024, + "mem 
reserved avg": 15295871713.28, + "elapsed time": 2172.3217773149954 + }, + { + "step": 5000, + "valid accuracy": 0.3, + "train loss": 0.7790318930149078, + "train samples": 20000, + "train time": 44.844002045996604, + "eval time": 15.846091532002902, + "tokens / sec": 4644.545323728393, + "mem allocated avg": 7075675734.016, + "mem reserved avg": 15251831521.28, + "elapsed time": 2285.3788618499966 + }, + { + "step": 5000, + "test accuracy": 0.25246398786959817, + "train loss": 0.7790318930149078, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/ptuning--llama-3.2-3B-default.json b/peft/method_comparison/MetaMathQA/results/ptuning--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..5ad6db218199541d2a426111594958057bc70943 --- /dev/null 
+++ b/peft/method_comparison/MetaMathQA/results/ptuning--llama-3.2-3B-default.json @@ -0,0 +1,348 @@ +{ + "run_info": { + "created_at": "2025-06-19T19:48:53+00:00", + "total_time": 1918.2703526590012, + "experiment_name": "ptuning/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "P_TUNING", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "num_virtual_tokens": 20, + "token_dim": 3072, + "num_transformer_submodules": 1, + "num_attention_heads": 24, + "num_layers": 28, + "encoder_reparameterization_type": "MLP", + "encoder_hidden_size": 3072, + "encoder_num_layers": 2, + "encoder_dropout": 0.0 + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 11867101593, + "accelerator_memory_max": 20937965568, + "accelerator_memory_reserved_99th": 17215688540, + "train_time": 1707.340225783013, + "file_size": 245880, + "num_trainable_params": 28382208, + "num_total_params": 3241132032, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.06, + "train loss": 0.9461167964935303, + "train samples": 1000, + "train time": 29.476242057011405, + "eval time": 11.075081511000462, + "tokens / sec": 7182.699870305862, + "mem allocated avg": 7263395393.536, + "mem reserved avg": 11910330187.776, + 
"elapsed time": 89.09710205499869 + }, + { + "step": 500, + "valid accuracy": 0.3, + "train loss": 0.7913461194038391, + "train samples": 2000, + "train time": 28.956617519994325, + "eval time": 11.047425028998987, + "tokens / sec": 7182.986751003671, + "mem allocated avg": 7255670497.28, + "mem reserved avg": 11810254094.336, + "elapsed time": 171.9022758780011 + }, + { + "step": 750, + "valid accuracy": 0.26, + "train loss": 0.7562740923166275, + "train samples": 3000, + "train time": 29.73533859500094, + "eval time": 11.056799476999004, + "tokens / sec": 7210.309689765724, + "mem allocated avg": 7266187038.72, + "mem reserved avg": 11954009669.632, + "elapsed time": 255.8485612350014 + }, + { + "step": 1000, + "valid accuracy": 0.3, + "train loss": 0.7289484927654266, + "train samples": 4000, + "train time": 29.176458327034197, + "eval time": 11.069810884997423, + "tokens / sec": 7140.551387861937, + "mem allocated avg": 7258589235.2, + "mem reserved avg": 11838347542.528, + "elapsed time": 338.5030210529985 + }, + { + "step": 1250, + "valid accuracy": 0.4, + "train loss": 0.7231850942373276, + "train samples": 5000, + "train time": 29.15449026899296, + "eval time": 11.055301014999714, + "tokens / sec": 7152.860436794844, + "mem allocated avg": 7257714087.936, + "mem reserved avg": 11824925769.728, + "elapsed time": 421.85765765199903 + }, + { + "step": 1500, + "valid accuracy": 0.38, + "train loss": 0.711922277212143, + "train samples": 6000, + "train time": 29.099172437985544, + "eval time": 11.07098460600173, + "tokens / sec": 7193.709733364892, + "mem allocated avg": 7259322730.496, + "mem reserved avg": 11860233420.8, + "elapsed time": 504.97678817400083 + }, + { + "step": 1750, + "valid accuracy": 0.44, + "train loss": 0.7051182547807694, + "train samples": 7000, + "train time": 29.301267419017677, + "eval time": 11.044947161997698, + "tokens / sec": 7144.912778213831, + "mem allocated avg": 7260392302.592, + "mem reserved avg": 11872371736.576, + "elapsed 
time": 588.3443257949984 + }, + { + "step": 2000, + "valid accuracy": 0.38, + "train loss": 0.7055468891859055, + "train samples": 8000, + "train time": 29.128185330951965, + "eval time": 11.045154800998716, + "tokens / sec": 7130.413296955362, + "mem allocated avg": 7257253203.968, + "mem reserved avg": 11821100564.48, + "elapsed time": 671.2971968860002 + }, + { + "step": 2250, + "valid accuracy": 0.3, + "train loss": 0.699348534822464, + "train samples": 9000, + "train time": 29.44214156106318, + "eval time": 11.039785496999684, + "tokens / sec": 7300.691750095574, + "mem allocated avg": 7268387997.696, + "mem reserved avg": 11993788448.768, + "elapsed time": 755.1838785660002 + }, + { + "step": 2500, + "valid accuracy": 0.4, + "train loss": 0.6970288401842117, + "train samples": 10000, + "train time": 28.56064905500898, + "eval time": 11.062792377000733, + "tokens / sec": 7211.565801718971, + "mem allocated avg": 7253500915.712, + "mem reserved avg": 11774535401.472, + "elapsed time": 837.4507786270005 + }, + { + "step": 2750, + "valid accuracy": 0.38, + "train loss": 0.6885807738304138, + "train samples": 11000, + "train time": 29.626391561985656, + "eval time": 11.040969151999889, + "tokens / sec": 7151.765329121947, + "mem allocated avg": 7264164755.456, + "mem reserved avg": 11929330384.896, + "elapsed time": 921.4017121549987 + }, + { + "step": 3000, + "valid accuracy": 0.32, + "train loss": 0.6827223267555237, + "train samples": 12000, + "train time": 29.296160228008375, + "eval time": 11.056816091997462, + "tokens / sec": 7124.85862909926, + "mem allocated avg": 7259324233.728, + "mem reserved avg": 11842046918.656, + "elapsed time": 1004.5840267519998 + }, + { + "step": 3250, + "valid accuracy": 0.5, + "train loss": 0.6894591153860092, + "train samples": 13000, + "train time": 29.611147850035195, + "eval time": 11.049655115999485, + "tokens / sec": 7122.351388338677, + "mem allocated avg": 7259635709.952, + "mem reserved avg": 11876809310.208, + 
"elapsed time": 1088.4846693049985 + }, + { + "step": 3500, + "valid accuracy": 0.42, + "train loss": 0.6757243422269821, + "train samples": 14000, + "train time": 28.982272775025194, + "eval time": 8.037888349997957, + "tokens / sec": 7237.182591861713, + "mem allocated avg": 7260029884.416, + "mem reserved avg": 11864100569.088, + "elapsed time": 1168.5907526180017 + }, + { + "step": 3750, + "valid accuracy": 0.44, + "train loss": 0.6726652181148529, + "train samples": 15000, + "train time": 29.461453213014465, + "eval time": 11.036738884999068, + "tokens / sec": 7355.475591552708, + "mem allocated avg": 7270358327.296, + "mem reserved avg": 12018115411.968, + "elapsed time": 1252.6760096750004 + }, + { + "step": 4000, + "valid accuracy": 0.44, + "train loss": 0.6872537672519684, + "train samples": 16000, + "train time": 28.49340438899526, + "eval time": 11.04012111100019, + "tokens / sec": 7172.642384527876, + "mem allocated avg": 7252451676.16, + "mem reserved avg": 11753454829.568, + "elapsed time": 1334.9961819890013 + }, + { + "step": 4250, + "valid accuracy": 0.46, + "train loss": 0.6691881531476974, + "train samples": 17000, + "train time": 29.36704957404436, + "eval time": 11.048986494999554, + "tokens / sec": 7198.169481309866, + "mem allocated avg": 7262467567.616, + "mem reserved avg": 11896405098.496, + "elapsed time": 1418.9507249929993 + }, + { + "step": 4500, + "valid accuracy": 0.5, + "train loss": 0.6769082483053207, + "train samples": 18000, + "train time": 29.086171291994106, + "eval time": 8.132250926999404, + "tokens / sec": 7144.907382746569, + "mem allocated avg": 7257195100.16, + "mem reserved avg": 11816553938.944, + "elapsed time": 1499.0322536989988 + }, + { + "step": 4750, + "valid accuracy": 0.46, + "train loss": 0.6686601461172104, + "train samples": 19000, + "train time": 29.45103387799827, + "eval time": 7.564945229998557, + "tokens / sec": 7128.408492200246, + "mem allocated avg": 7260019183.616, + "mem reserved avg": 
11863848910.848, + "elapsed time": 1579.1494789060016 + }, + { + "step": 5000, + "valid accuracy": 0.48, + "train loss": 0.6739867876768112, + "train samples": 20000, + "train time": 29.24236888399173, + "eval time": 6.952750485001161, + "tokens / sec": 7122.541980995923, + "mem allocated avg": 7256318291.968, + "mem reserved avg": 11821469663.232, + "elapsed time": 1658.0220765080012 + }, + { + "step": 5000, + "test accuracy": 0.3707354056103108, + "train loss": 0.6739867876768112, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/randlora--llama-3.2-3B-default.json b/peft/method_comparison/MetaMathQA/results/randlora--llama-3.2-3B-default.json new file mode 100644 index 0000000000000000000000000000000000000000..1025019a88057969667507acf0070f182c663d90 --- 
/dev/null +++ b/peft/method_comparison/MetaMathQA/results/randlora--llama-3.2-3B-default.json @@ -0,0 +1,356 @@ +{ + "run_info": { + "created_at": "2025-06-20T07:20:24+00:00", + "total_time": 2457.3893872150074, + "experiment_name": "randlora/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "RANDLORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 32, + "target_modules": [ + "v_proj", + "q_proj" + ], + "projection_prng_key": 0, + "save_projection": true, + "sparse": false, + "very_sparse": false, + "randlora_dropout": 0.0, + "fan_in_fan_out": false, + "randlora_alpha": 640, + "bias": "none", + "modules_to_save": null, + "init_weights": true, + "layers_to_transform": null, + "layers_pattern": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 12743670025, + "accelerator_memory_max": 22798139392, + "accelerator_memory_reserved_99th": 18436063232, + "train_time": 2213.072415724004, + "file_size": 2211281240, + "num_trainable_params": 9289728, + "num_total_params": 3222039552, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.38, + "train loss": 0.9159075767993927, + "train samples": 1000, + "train time": 50.62416129904159, + "eval time": 13.32173753400275, + 
"tokens / sec": 4182.172989481373, + "mem allocated avg": 6983776778.24, + "mem reserved avg": 12791771561.984, + "elapsed time": 114.85611474100733 + }, + { + "step": 500, + "valid accuracy": 0.34, + "train loss": 0.7009325810670852, + "train samples": 2000, + "train time": 49.47734279213182, + "eval time": 13.318595108998124, + "tokens / sec": 4203.843380875268, + "mem allocated avg": 6975756310.528, + "mem reserved avg": 12690437177.344, + "elapsed time": 222.717683150011 + }, + { + "step": 750, + "valid accuracy": 0.38, + "train loss": 0.6809726172685623, + "train samples": 3000, + "train time": 50.701564677088754, + "eval time": 6.592474952994962, + "tokens / sec": 4228.6860645325305, + "mem allocated avg": 6985956540.416, + "mem reserved avg": 12840031223.808, + "elapsed time": 325.2694208340108 + }, + { + "step": 1000, + "valid accuracy": 0.32, + "train loss": 0.6661903276443482, + "train samples": 4000, + "train time": 49.452677299879724, + "eval time": 13.326040301006287, + "tokens / sec": 4212.835611238114, + "mem allocated avg": 6977344550.912, + "mem reserved avg": 12711484194.816, + "elapsed time": 432.82023598300293 + }, + { + "step": 1250, + "valid accuracy": 0.44, + "train loss": 0.665697453379631, + "train samples": 5000, + "train time": 49.56871296803001, + "eval time": 6.698036557994783, + "tokens / sec": 4207.0489127789, + "mem allocated avg": 6977509738.496, + "mem reserved avg": 12708397187.072, + "elapsed time": 534.2725243740133 + }, + { + "step": 1500, + "valid accuracy": 0.44, + "train loss": 0.658678293466568, + "train samples": 6000, + "train time": 49.71162069692218, + "eval time": 13.42559558400535, + "tokens / sec": 4210.906767176883, + "mem allocated avg": 6978434217.984, + "mem reserved avg": 12733680451.584, + "elapsed time": 642.8949007330084 + }, + { + "step": 1750, + "valid accuracy": 0.44, + "train loss": 0.6513392345905304, + "train samples": 7000, + "train time": 49.957065908936784, + "eval time": 8.692238900999655, + "tokens 
/ sec": 4190.698476600257, + "mem allocated avg": 6980155148.288, + "mem reserved avg": 12746875731.968, + "elapsed time": 746.9297674360132 + }, + { + "step": 2000, + "valid accuracy": 0.36, + "train loss": 0.6511732361316681, + "train samples": 8000, + "train time": 49.75638979690848, + "eval time": 13.350251003997982, + "tokens / sec": 4174.257835983607, + "mem allocated avg": 6976487055.36, + "mem reserved avg": 12692744044.544, + "elapsed time": 855.1831161730079 + }, + { + "step": 2250, + "valid accuracy": 0.38, + "train loss": 0.6382467728853226, + "train samples": 9000, + "train time": 51.20128064297023, + "eval time": 13.277926524999202, + "tokens / sec": 4198.098119827237, + "mem allocated avg": 6988260448.256, + "mem reserved avg": 12868644765.696, + "elapsed time": 965.4057929810078 + }, + { + "step": 2500, + "valid accuracy": 0.42, + "train loss": 0.6324679807424546, + "train samples": 10000, + "train time": 47.79617171884456, + "eval time": 13.268003197008511, + "tokens / sec": 4309.278182603765, + "mem allocated avg": 6973276801.024, + "mem reserved avg": 12640810172.416, + "elapsed time": 1071.8066454930085 + }, + { + "step": 2750, + "valid accuracy": 0.42, + "train loss": 0.6214727911949157, + "train samples": 11000, + "train time": 49.63376283789694, + "eval time": 8.245235234993743, + "tokens / sec": 4268.888512281446, + "mem allocated avg": 6983764305.92, + "mem reserved avg": 12802987130.88, + "elapsed time": 1175.2128759590123 + }, + { + "step": 3000, + "valid accuracy": 0.46, + "train loss": 0.6079807863235474, + "train samples": 12000, + "train time": 49.776777152961586, + "eval time": 13.29031453501375, + "tokens / sec": 4193.340990289104, + "mem allocated avg": 6978680711.168, + "mem reserved avg": 12727900700.672, + "elapsed time": 1283.6379908250092 + }, + { + "step": 3250, + "valid accuracy": 0.5, + "train loss": 0.6133705099821091, + "train samples": 13000, + "train time": 50.014745363077964, + "eval time": 7.092912267995416, + "tokens 
/ sec": 4216.77644200688, + "mem allocated avg": 6980747913.216, + "mem reserved avg": 12754257707.008, + "elapsed time": 1386.1836155580095 + }, + { + "step": 3500, + "valid accuracy": 0.52, + "train loss": 0.5912622555494308, + "train samples": 14000, + "train time": 49.560089439110016, + "eval time": 13.321606318990234, + "tokens / sec": 4232.236107194697, + "mem allocated avg": 6979099045.888, + "mem reserved avg": 12738579398.656, + "elapsed time": 1494.848658177012 + }, + { + "step": 3750, + "valid accuracy": 0.48, + "train loss": 0.5849999967813492, + "train samples": 15000, + "train time": 51.10861245104752, + "eval time": 13.350408840997261, + "tokens / sec": 4240.048586870968, + "mem allocated avg": 6990205292.544, + "mem reserved avg": 12906016014.336, + "elapsed time": 1605.1250539940083 + }, + { + "step": 4000, + "valid accuracy": 0.52, + "train loss": 0.5914600425958634, + "train samples": 16000, + "train time": 48.92153307204717, + "eval time": 13.309209176004515, + "tokens / sec": 4177.567364845621, + "mem allocated avg": 6971749750.784, + "mem reserved avg": 12621868695.552, + "elapsed time": 1712.7276146870136 + }, + { + "step": 4250, + "valid accuracy": 0.54, + "train loss": 0.575433883190155, + "train samples": 17000, + "train time": 50.056106529867975, + "eval time": 13.322275185011677, + "tokens / sec": 4223.041196259767, + "mem allocated avg": 6981706383.36, + "mem reserved avg": 12772226105.344, + "elapsed time": 1821.8377219670074 + }, + { + "step": 4500, + "valid accuracy": 0.48, + "train loss": 0.5807004086971282, + "train samples": 18000, + "train time": 49.559131018977496, + "eval time": 13.371259606996318, + "tokens / sec": 4193.334219690434, + "mem allocated avg": 6976847835.136, + "mem reserved avg": 12694061056.0, + "elapsed time": 1929.8977656790084 + }, + { + "step": 4750, + "valid accuracy": 0.52, + "train loss": 0.5704656873941422, + "train samples": 19000, + "train time": 49.80019182183605, + "eval time": 13.346957685993402, + 
"tokens / sec": 4215.62633234572, + "mem allocated avg": 6979789905.92, + "mem reserved avg": 12742303940.608, + "elapsed time": 2038.7162974260136 + }, + { + "step": 5000, + "valid accuracy": 0.52, + "train loss": 0.5784689987897873, + "train samples": 20000, + "train time": 49.38921916205436, + "eval time": 13.307282750000013, + "tokens / sec": 4217.114656471855, + "mem allocated avg": 6976297842.688, + "mem reserved avg": 12688323248.128, + "elapsed time": 2146.6737093550037 + }, + { + "step": 5000, + "test accuracy": 0.5072024260803639, + "train loss": 0.5784689987897873, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/shira--llama-3.2-3B-lr_0.0003-random_seed_42.json b/peft/method_comparison/MetaMathQA/results/shira--llama-3.2-3B-lr_0.0003-random_seed_42.json new file mode 100644 index 
0000000000000000000000000000000000000000..1263cd479fe1109dc2001db4d0431c99cfc5b36c --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/shira--llama-3.2-3B-lr_0.0003-random_seed_42.json @@ -0,0 +1,348 @@ +{ + "run_info": { + "created_at": "2025-07-31T14:52:50+00:00", + "total_time": 2084.7194732099997, + "experiment_name": "shira/llama-3.2-3B-lr_0.0003-random_seed_42", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0003 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "SHIRA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 32, + "mask_type": "random", + "random_seed": 42, + "target_modules": [ + "v_proj", + "q_proj" + ], + "fan_in_fan_out": false, + "init_weights": true, + "modules_to_save": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 12240924809, + "accelerator_memory_max": 21743271936, + "accelerator_memory_reserved_99th": 17637383864, + "train_time": 1867.0518525470034, + "file_size": 110115520, + "num_trainable_params": 9175040, + "num_total_params": 3221924864, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.38, + "train loss": 0.9357188057899475, + "train samples": 1000, + "train time": 31.692333374005102, + "eval time": 12.883808001000943, + "tokens / sec": 6680.448470028325, + "mem allocated avg": 6994551982.08, + "mem reserved avg": 
12283740684.288, + "elapsed time": 96.08096698400004 + }, + { + "step": 500, + "valid accuracy": 0.36, + "train loss": 0.7029980063438416, + "train samples": 2000, + "train time": 30.742395647013836, + "eval time": 12.860161713999332, + "tokens / sec": 6765.73818085656, + "mem allocated avg": 6987138160.64, + "mem reserved avg": 12187783397.376, + "elapsed time": 185.3151257690006 + }, + { + "step": 750, + "valid accuracy": 0.46, + "train loss": 0.6642508600950241, + "train samples": 3000, + "train time": 31.300478177990954, + "eval time": 10.545964175000336, + "tokens / sec": 6849.767558846972, + "mem allocated avg": 6997475934.208, + "mem reserved avg": 12327495663.616, + "elapsed time": 273.0391829120017 + }, + { + "step": 1000, + "valid accuracy": 0.36, + "train loss": 0.6439496507644653, + "train samples": 4000, + "train time": 31.05783393001184, + "eval time": 12.813238828999602, + "tokens / sec": 6708.00160981866, + "mem allocated avg": 6989375735.808, + "mem reserved avg": 12216002674.688, + "elapsed time": 362.4678097830001 + }, + { + "step": 1250, + "valid accuracy": 0.32, + "train loss": 0.6386180140972137, + "train samples": 5000, + "train time": 31.162159604989938, + "eval time": 9.70755834100055, + "tokens / sec": 6692.026568229476, + "mem allocated avg": 6988624240.64, + "mem reserved avg": 12208931078.144, + "elapsed time": 449.31565678900006 + }, + { + "step": 1500, + "valid accuracy": 0.5, + "train loss": 0.6296385749578476, + "train samples": 6000, + "train time": 31.37457448001078, + "eval time": 12.791106022999884, + "tokens / sec": 6671.994870667266, + "mem allocated avg": 6990819698.688, + "mem reserved avg": 12231488045.056, + "elapsed time": 539.3741775020007 + }, + { + "step": 1750, + "valid accuracy": 0.52, + "train loss": 0.6209055181741715, + "train samples": 7000, + "train time": 31.12063506498089, + "eval time": 11.802694226998938, + "tokens / sec": 6727.208476397092, + "mem allocated avg": 6991199850.496, + "mem reserved avg": 
12244775600.128, + "elapsed time": 628.1649310410012 + }, + { + "step": 2000, + "valid accuracy": 0.42, + "train loss": 0.622630435705185, + "train samples": 8000, + "train time": 30.90304242698403, + "eval time": 12.802878865999446, + "tokens / sec": 6720.8916562416925, + "mem allocated avg": 6988364103.68, + "mem reserved avg": 12193303101.44, + "elapsed time": 717.7866772650013 + }, + { + "step": 2250, + "valid accuracy": 0.5, + "train loss": 0.6117782632112503, + "train samples": 9000, + "train time": 31.835284770990256, + "eval time": 12.896296490000168, + "tokens / sec": 6751.8792919945945, + "mem allocated avg": 6999611363.328, + "mem reserved avg": 12359498203.136, + "elapsed time": 808.9164720260014 + }, + { + "step": 2500, + "valid accuracy": 0.48, + "train loss": 0.6087703567743301, + "train samples": 10000, + "train time": 31.106087331994786, + "eval time": 8.24222338599975, + "tokens / sec": 6621.437077627842, + "mem allocated avg": 6984549638.144, + "mem reserved avg": 12142711406.592, + "elapsed time": 894.0944889150014 + }, + { + "step": 2750, + "valid accuracy": 0.54, + "train loss": 0.5988683942556381, + "train samples": 11000, + "train time": 31.549549333021787, + "eval time": 10.239751285998864, + "tokens / sec": 6715.817007827485, + "mem allocated avg": 6995372679.168, + "mem reserved avg": 12298949230.592, + "elapsed time": 981.9011422450003 + }, + { + "step": 3000, + "valid accuracy": 0.5, + "train loss": 0.5887085427045822, + "train samples": 12000, + "train time": 30.952114122006606, + "eval time": 6.788169478999407, + "tokens / sec": 6743.6750580986845, + "mem allocated avg": 6990271899.648, + "mem reserved avg": 12226782035.968, + "elapsed time": 1065.3513825200007 + }, + { + "step": 3250, + "valid accuracy": 0.5, + "train loss": 0.5973232421875, + "train samples": 13000, + "train time": 31.107180875995255, + "eval time": 8.763070525999865, + "tokens / sec": 6779.81720171717, + "mem allocated avg": 6991821465.6, + "mem reserved avg": 
12247829053.44, + "elapsed time": 1151.3527770540004 + }, + { + "step": 3500, + "valid accuracy": 0.58, + "train loss": 0.5808243087530136, + "train samples": 14000, + "train time": 31.351620633020502, + "eval time": 12.858672252999895, + "tokens / sec": 6690.244260581693, + "mem allocated avg": 6991037339.648, + "mem reserved avg": 12237720780.8, + "elapsed time": 1241.8340592570003 + }, + { + "step": 3750, + "valid accuracy": 0.54, + "train loss": 0.5781804740428924, + "train samples": 15000, + "train time": 31.949568995005393, + "eval time": 10.286192837000272, + "tokens / sec": 6782.658008121384, + "mem allocated avg": 7002176088.064, + "mem reserved avg": 12393589506.048, + "elapsed time": 1330.699673422001 + }, + { + "step": 4000, + "valid accuracy": 0.52, + "train loss": 0.5873791750669479, + "train samples": 16000, + "train time": 30.883606043998952, + "eval time": 12.840774923999561, + "tokens / sec": 6617.523863917831, + "mem allocated avg": 6983247890.432, + "mem reserved avg": 12128861814.784, + "elapsed time": 1420.291728133001 + }, + { + "step": 4250, + "valid accuracy": 0.54, + "train loss": 0.5750357346534729, + "train samples": 17000, + "train time": 31.41448626901547, + "eval time": 7.603731496999899, + "tokens / sec": 6729.029346199935, + "mem allocated avg": 6993511036.928, + "mem reserved avg": 12266485317.632, + "elapsed time": 1505.506280988 + }, + { + "step": 4500, + "valid accuracy": 0.54, + "train loss": 0.5816998761892319, + "train samples": 18000, + "train time": 30.81339496200053, + "eval time": 6.927671567998914, + "tokens / sec": 6744.404511618528, + "mem allocated avg": 6988376672.256, + "mem reserved avg": 12194603335.68, + "elapsed time": 1588.958452021001 + }, + { + "step": 4750, + "valid accuracy": 0.58, + "train loss": 0.5723758825063705, + "train samples": 19000, + "train time": 31.16517290099182, + "eval time": 7.508142915999997, + "tokens / sec": 6736.333556272963, + "mem allocated avg": 6991473983.488, + "mem reserved avg": 
12240195420.16, + "elapsed time": 1673.9547475400013 + }, + { + "step": 5000, + "valid accuracy": 0.54, + "train loss": 0.5789329997301101, + "train samples": 20000, + "train time": 31.123193277984683, + "eval time": 7.151714429001004, + "tokens / sec": 6692.115366816459, + "mem allocated avg": 6987954006.016, + "mem reserved avg": 12187749842.944, + "elapsed time": 1758.2295099830008 + }, + { + "step": 5000, + "test accuracy": 0.5072024260803639, + "train loss": 0.5789329997301101, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.16.1.dev0", + "peft-commit-hash": "25e5c6b25c4589eb2683484ede1ba3d985d8a760", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1031-aws", + "version": "#33-Ubuntu SMP Fri Jun 20 18:11:07 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/results/trainable_tokens--llama-3.2-3B-sos+eos.json b/peft/method_comparison/MetaMathQA/results/trainable_tokens--llama-3.2-3B-sos+eos.json new file mode 100644 index 
0000000000000000000000000000000000000000..eff36db4db861519316e19d0696797c8eb04da97 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/trainable_tokens--llama-3.2-3B-sos+eos.json @@ -0,0 +1,344 @@ +{ + "run_info": { + "created_at": "2025-07-31T11:51:03+00:00", + "total_time": 1813.5205606600002, + "experiment_name": "trainable_tokens/llama-3.2-3B-sos+eos", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.2 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": "CAUSAL_LM", + "peft_type": "TRAINABLE_TOKENS", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "token_indices": [ + 128000, + 128001 + ], + "target_modules": "model.embed_tokens", + "init_weights": true + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 12730137942, + "accelerator_memory_max": 20956839936, + "accelerator_memory_reserved_99th": 16957675929, + "train_time": 1571.9034050299992, + "file_size": 49424, + "num_trainable_params": 6144, + "num_total_params": 3212755968, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.2, + "train loss": 0.9600473783016205, + "train samples": 1000, + "train time": 28.046887916000458, + "eval time": 11.940005464000023, + "tokens / sec": 7548.751955443032, + "mem allocated avg": 6772620728.32, + "mem reserved avg": 12816987717.632, + "elapsed time": 79.72835956 + }, + { + "step": 500, + 
"valid accuracy": 0.1, + "train loss": 0.9827968046665192, + "train samples": 2000, + "train time": 27.568676805000052, + "eval time": 11.946740159, + "tokens / sec": 7544.613093736749, + "mem allocated avg": 6765130303.488, + "mem reserved avg": 12640885669.888, + "elapsed time": 153.151956972 + }, + { + "step": 750, + "valid accuracy": 0.26, + "train loss": 1.078141197681427, + "train samples": 3000, + "train time": 28.346493393999594, + "eval time": 11.913883551000026, + "tokens / sec": 7563.581040517151, + "mem allocated avg": 6775356669.952, + "mem reserved avg": 12829595795.456, + "elapsed time": 227.42617082600003 + }, + { + "step": 1000, + "valid accuracy": 0.18, + "train loss": 0.9337297527790069, + "train samples": 4000, + "train time": 27.80469767599925, + "eval time": 11.935028867000028, + "tokens / sec": 7492.834571613906, + "mem allocated avg": 6767140114.432, + "mem reserved avg": 12714495705.088, + "elapsed time": 301.119469732 + }, + { + "step": 1250, + "valid accuracy": 0.06, + "train loss": 1.1530333435535431, + "train samples": 5000, + "train time": 27.832769999000448, + "eval time": 11.899406999999997, + "tokens / sec": 7492.534879118722, + "mem allocated avg": 6766889218.048, + "mem reserved avg": 12706635579.392, + "elapsed time": 374.949399381 + }, + { + "step": 1500, + "valid accuracy": 0.16, + "train loss": 0.9257006254196167, + "train samples": 6000, + "train time": 27.848633210000685, + "eval time": 11.963534283000058, + "tokens / sec": 7516.7423270463205, + "mem allocated avg": 6768338049.024, + "mem reserved avg": 12713531015.168, + "elapsed time": 448.64371043399996 + }, + { + "step": 1750, + "valid accuracy": 0.04, + "train loss": 0.9480950233936309, + "train samples": 7000, + "train time": 28.10500999800047, + "eval time": 11.907140037999966, + "tokens / sec": 7449.027771735166, + "mem allocated avg": 6769886017.536, + "mem reserved avg": 12724176158.72, + "elapsed time": 523.0294692360001 + }, + { + "step": 2000, + "valid 
accuracy": 0.22, + "train loss": 1.099897492647171, + "train samples": 8000, + "train time": 27.50086192600054, + "eval time": 11.908490246000042, + "tokens / sec": 7552.345106814087, + "mem allocated avg": 6767001008.128, + "mem reserved avg": 12671663472.64, + "elapsed time": 596.5166793630001 + }, + { + "step": 2250, + "valid accuracy": 0.2, + "train loss": 1.0596771531105043, + "train samples": 9000, + "train time": 28.679387931000633, + "eval time": 11.905819370000017, + "tokens / sec": 7494.860089662325, + "mem allocated avg": 6777668958.208, + "mem reserved avg": 12765641048.064, + "elapsed time": 671.4013165209999 + }, + { + "step": 2500, + "valid accuracy": 0.28, + "train loss": 0.8838931715488434, + "train samples": 10000, + "train time": 27.525735084001553, + "eval time": 11.918955123999922, + "tokens / sec": 7482.70661515273, + "mem allocated avg": 6762954442.752, + "mem reserved avg": 12636850749.44, + "elapsed time": 744.8136404429999 + }, + { + "step": 2750, + "valid accuracy": 0.28, + "train loss": 0.8453443908691406, + "train samples": 11000, + "train time": 28.56140573299922, + "eval time": 11.910845225000003, + "tokens / sec": 7418.437382975074, + "mem allocated avg": 6773394008.064, + "mem reserved avg": 12781445185.536, + "elapsed time": 819.4217107059999 + }, + { + "step": 3000, + "valid accuracy": 0.28, + "train loss": 0.8956673395633697, + "train samples": 12000, + "train time": 27.520784680998986, + "eval time": 11.922119511000119, + "tokens / sec": 7584.485777548084, + "mem allocated avg": 6768260833.28, + "mem reserved avg": 12729855246.336, + "elapsed time": 892.9293128969998 + }, + { + "step": 3250, + "valid accuracy": 0.28, + "train loss": 0.8384639675617218, + "train samples": 13000, + "train time": 27.425537425002176, + "eval time": 11.90852308500007, + "tokens / sec": 7689.9495799026545, + "mem allocated avg": 6769824622.592, + "mem reserved avg": 12732145336.32, + "elapsed time": 966.4361498369999 + }, + { + "step": 3500, + "valid 
accuracy": 0.28, + "train loss": 0.8318528242111206, + "train samples": 14000, + "train time": 27.599476771002628, + "eval time": 11.926854094000191, + "tokens / sec": 7599.781754572018, + "mem allocated avg": 6769281437.696, + "mem reserved avg": 12748083691.52, + "elapsed time": 1040.349463558 + }, + { + "step": 3750, + "valid accuracy": 0.3, + "train loss": 0.825278183221817, + "train samples": 15000, + "train time": 28.668002616999956, + "eval time": 11.920088525999972, + "tokens / sec": 7559.054702733158, + "mem allocated avg": 6780301985.792, + "mem reserved avg": 12867151593.472, + "elapsed time": 1115.464965257 + }, + { + "step": 4000, + "valid accuracy": 0.34, + "train loss": 0.8279166626930237, + "train samples": 16000, + "train time": 27.677960625997912, + "eval time": 11.956045786999994, + "tokens / sec": 7383.961656771504, + "mem allocated avg": 6761672556.544, + "mem reserved avg": 12607280906.24, + "elapsed time": 1189.311786065 + }, + { + "step": 4250, + "valid accuracy": 0.26, + "train loss": 0.7829471210241318, + "train samples": 17000, + "train time": 27.69402594300027, + "eval time": 11.900985279999986, + "tokens / sec": 7633.018053607661, + "mem allocated avg": 6771771314.176, + "mem reserved avg": 12787006832.64, + "elapsed time": 1263.137991501 + }, + { + "step": 4500, + "valid accuracy": 0.36, + "train loss": 0.788821615934372, + "train samples": 18000, + "train time": 27.622178668999823, + "eval time": 11.941970926000067, + "tokens / sec": 7523.591911062131, + "mem allocated avg": 6766746431.488, + "mem reserved avg": 12679666204.672, + "elapsed time": 1336.699760115 + }, + { + "step": 4750, + "valid accuracy": 0.3, + "train loss": 0.7817396788597107, + "train samples": 19000, + "train time": 28.053782109000622, + "eval time": 11.908781481000005, + "tokens / sec": 7483.447300770342, + "mem allocated avg": 6769035872.256, + "mem reserved avg": 12704303546.368, + "elapsed time": 1411.011856929 + }, + { + "step": 5000, + "valid accuracy": 
0.32, + "train loss": 0.786205664396286, + "train samples": 20000, + "train time": 27.865934093999613, + "eval time": 11.930896392000022, + "tokens / sec": 7474.359169063313, + "mem allocated avg": 6766191118.336, + "mem reserved avg": 12745357393.92, + "elapsed time": 1484.8831304389998 + }, + { + "step": 5000, + "test accuracy": 0.2880970432145565, + "train loss": 0.786205664396286, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.16.1.dev0", + "peft-commit-hash": "25e5c6b25c4589eb2683484ede1ba3d985d8a760", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1031-aws", + "version": "#33-Ubuntu SMP Fri Jun 20 18:11:07 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} \ No newline at end of file diff --git a/peft/method_comparison/MetaMathQA/results/vblora--llama-3.2-3B-default.json b/peft/method_comparison/MetaMathQA/results/vblora--llama-3.2-3B-default.json new file mode 100644 index 
0000000000000000000000000000000000000000..ccf041765e8ee3c837eefb23eadba2249638e909 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/vblora--llama-3.2-3B-default.json @@ -0,0 +1,357 @@ +{ + "run_info": { + "created_at": "2025-06-19T23:49:12+00:00", + "total_time": 2210.184595478997, + "experiment_name": "vblora/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.0001, + "weight_decay": 0.1 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "VBLORA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 4, + "num_vectors": 256, + "vector_length": 256, + "topk": 2, + "target_modules": [ + "v_proj", + "q_proj" + ], + "exclude_modules": null, + "save_only_topk_weights": false, + "vblora_dropout": 0.0, + "fan_in_fan_out": false, + "bias": "none", + "modules_to_save": null, + "init_vector_bank_bound": 0.02, + "init_logits_std": 0.1, + "layers_to_transform": null, + "layers_pattern": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 11735344663, + "accelerator_memory_max": 22181576704, + "accelerator_memory_reserved_99th": 17635223797, + "train_time": 1961.761054087001, + "file_size": 4864912, + "num_trainable_params": 1212416, + "num_total_params": 3213962240, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 
1.308416832447052, + "train samples": 1000, + "train time": 40.12101128498034, + "eval time": 12.847303112997906, + "tokens / sec": 5277.010554299236, + "mem allocated avg": 6798303909.888, + "mem reserved avg": 11786547888.128, + "elapsed time": 101.0704645309961 + }, + { + "step": 500, + "valid accuracy": 0.38, + "train loss": 1.0353211228847503, + "train samples": 2000, + "train time": 39.44148263899842, + "eval time": 12.918682472998626, + "tokens / sec": 5273.508653408011, + "mem allocated avg": 6790843463.68, + "mem reserved avg": 11678368399.36, + "elapsed time": 195.96383863499796 + }, + { + "step": 750, + "valid accuracy": 0.3, + "train loss": 0.8149400608539581, + "train samples": 3000, + "train time": 39.73428049797076, + "eval time": 12.816220013999555, + "tokens / sec": 5395.869695210651, + "mem allocated avg": 6801506754.56, + "mem reserved avg": 11833448595.456, + "elapsed time": 291.3562671039981 + }, + { + "step": 1000, + "valid accuracy": 0.34, + "train loss": 0.766725031375885, + "train samples": 4000, + "train time": 39.66955411599338, + "eval time": 12.954798815000686, + "tokens / sec": 5251.785774824381, + "mem allocated avg": 6791902521.344, + "mem reserved avg": 11700715651.072, + "elapsed time": 386.48246048299916 + }, + { + "step": 1250, + "valid accuracy": 0.4, + "train loss": 0.7548577107191086, + "train samples": 5000, + "train time": 39.58740361595119, + "eval time": 12.921318173001055, + "tokens / sec": 5267.786744063522, + "mem allocated avg": 6792241373.184, + "mem reserved avg": 11698584944.64, + "elapsed time": 481.6297270110008 + }, + { + "step": 1500, + "valid accuracy": 0.42, + "train loss": 0.744083244919777, + "train samples": 6000, + "train time": 39.58100679998461, + "eval time": 7.990361490003124, + "tokens / sec": 5288.672950079719, + "mem allocated avg": 6792975718.4, + "mem reserved avg": 11729908006.912, + "elapsed time": 571.7995823969977 + }, + { + "step": 1750, + "valid accuracy": 0.4, + "train loss": 
0.7353366105556488, + "train samples": 7000, + "train time": 39.89848869200068, + "eval time": 9.013003496002057, + "tokens / sec": 5247.191231129864, + "mem allocated avg": 6795538806.784, + "mem reserved avg": 11737281593.344, + "elapsed time": 663.3418345429964 + }, + { + "step": 2000, + "valid accuracy": 0.36, + "train loss": 0.735884799003601, + "train samples": 8000, + "train time": 39.573877232054656, + "eval time": 13.005171182994673, + "tokens / sec": 5248.3106161699825, + "mem allocated avg": 6792414906.368, + "mem reserved avg": 11683577724.928, + "elapsed time": 758.2818646219966 + }, + { + "step": 2250, + "valid accuracy": 0.34, + "train loss": 0.7294247032403945, + "train samples": 9000, + "train time": 40.16309046502283, + "eval time": 12.827437616993848, + "tokens / sec": 5351.878989172747, + "mem allocated avg": 6803159742.464, + "mem reserved avg": 11872145244.16, + "elapsed time": 854.2901024749954 + }, + { + "step": 2500, + "valid accuracy": 0.36, + "train loss": 0.7273153622150421, + "train samples": 10000, + "train time": 39.40831322706072, + "eval time": 12.578817460002028, + "tokens / sec": 5226.48606686793, + "mem allocated avg": 6788682082.304, + "mem reserved avg": 11624815525.888, + "elapsed time": 948.6046375669976 + }, + { + "step": 2750, + "valid accuracy": 0.3, + "train loss": 0.7221734907627105, + "train samples": 11000, + "train time": 39.99460277392063, + "eval time": 12.831681943003787, + "tokens / sec": 5297.739827488966, + "mem allocated avg": 6798795204.608, + "mem reserved avg": 11800045158.4, + "elapsed time": 1044.2291650330008 + }, + { + "step": 3000, + "valid accuracy": 0.44, + "train loss": 0.7163265677690506, + "train samples": 12000, + "train time": 39.72457089692762, + "eval time": 9.721318816998973, + "tokens / sec": 5254.455750864856, + "mem allocated avg": 6794274019.328, + "mem reserved avg": 11717786468.352, + "elapsed time": 1136.276137433997 + }, + { + "step": 3250, + "valid accuracy": 0.24, + "train loss": 
0.7239821909666061, + "train samples": 13000, + "train time": 39.57894092098286, + "eval time": 12.939295003001462, + "tokens / sec": 5328.616559524724, + "mem allocated avg": 6796102031.36, + "mem reserved avg": 11749923225.6, + "elapsed time": 1231.5789504099957 + }, + { + "step": 3500, + "valid accuracy": 0.3, + "train loss": 0.7123430745601654, + "train samples": 14000, + "train time": 39.774808847985696, + "eval time": 12.81972120499995, + "tokens / sec": 5273.438291096208, + "mem allocated avg": 6794877718.528, + "mem reserved avg": 11727257206.784, + "elapsed time": 1327.2042175199967 + }, + { + "step": 3750, + "valid accuracy": 0.32, + "train loss": 0.7080619329214096, + "train samples": 15000, + "train time": 40.429172475058294, + "eval time": 12.810948685997573, + "tokens / sec": 5360.065188910042, + "mem allocated avg": 6804612114.432, + "mem reserved avg": 11907847159.808, + "elapsed time": 1424.0900874009967 + }, + { + "step": 4000, + "valid accuracy": 0.42, + "train loss": 0.7257569855451584, + "train samples": 16000, + "train time": 39.64596449997771, + "eval time": 12.844434396996803, + "tokens / sec": 5154.950890401844, + "mem allocated avg": 6787030419.456, + "mem reserved avg": 11605689499.648, + "elapsed time": 1519.344677588997 + }, + { + "step": 4250, + "valid accuracy": 0.38, + "train loss": 0.7041294666528701, + "train samples": 17000, + "train time": 39.938396073041076, + "eval time": 12.83599700799823, + "tokens / sec": 5292.876549508964, + "mem allocated avg": 6797280624.64, + "mem reserved avg": 11765333098.496, + "elapsed time": 1614.829351510998 + }, + { + "step": 4500, + "valid accuracy": 0.38, + "train loss": 0.7148806138038635, + "train samples": 18000, + "train time": 39.55479707601626, + "eval time": 12.901207727001747, + "tokens / sec": 5253.926586972907, + "mem allocated avg": 6791958038.528, + "mem reserved avg": 11679299534.848, + "elapsed time": 1710.0793527279966 + }, + { + "step": 4750, + "valid accuracy": 0.32, + "train 
loss": 0.7083848255872727, + "train samples": 19000, + "train time": 39.88160159892141, + "eval time": 12.8585010780007, + "tokens / sec": 5264.056396513368, + "mem allocated avg": 6794248144.896, + "mem reserved avg": 11730923028.48, + "elapsed time": 1806.240128286001 + }, + { + "step": 5000, + "valid accuracy": 0.36, + "train loss": 0.7142883945703506, + "train samples": 20000, + "train time": 39.631631882970396, + "eval time": 12.818420095005422, + "tokens / sec": 5255.398026885119, + "mem allocated avg": 6791235981.312, + "mem reserved avg": 11677395320.832, + "elapsed time": 1901.3912653839943 + }, + { + "step": 5000, + "test accuracy": 0.36997725549658833, + "train loss": 0.7142883945703506, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN 
v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/results/vera--llama-3.2-3B-default.json b/peft/method_comparison/MetaMathQA/results/vera--llama-3.2-3B-default.json new file mode 100644 index 
0000000000000000000000000000000000000000..690c35072932a4de9167712c9a2634f92db6f29d --- /dev/null +++ b/peft/method_comparison/MetaMathQA/results/vera--llama-3.2-3B-default.json @@ -0,0 +1,353 @@ +{ + "run_info": { + "created_at": "2025-06-19T20:53:39+00:00", + "total_time": 2024.6820476150024, + "experiment_name": "vera/llama-3.2-3B-default", + "peft_branch": "main", + "train_config": { + "model_id": "meta-llama/Llama-3.2-3B", + "dtype": "bfloat16", + "max_seq_length": 768, + "batch_size": 4, + "batch_size_eval": 50, + "max_steps": 5000, + "eval_steps": 250, + "compile": false, + "query_template": "Question: {query} Think step by step.\nAnswer:", + "seed": 0, + "grad_norm_clip": 1.0, + "optimizer_type": "AdamW", + "optimizer_kwargs": { + "lr": 0.001 + }, + "lr_scheduler": "cosine", + "use_amp": false, + "autocast_adapter_dtype": true, + "generation_kwargs": { + "max_length": 800, + "max_new_tokens": 300 + }, + "attn_implementation": null + }, + "peft_config": { + "task_type": null, + "peft_type": "VERA", + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-3B", + "revision": null, + "inference_mode": false, + "r": 256, + "target_modules": [ + "v_proj", + "q_proj" + ], + "projection_prng_key": 0, + "save_projection": true, + "vera_dropout": 0.0, + "d_initial": 0.1, + "fan_in_fan_out": false, + "bias": "none", + "modules_to_save": null, + "init_weights": true, + "layers_to_transform": null, + "layers_pattern": null + }, + "error_msg": "" + }, + "train_info": { + "accelerator_memory_reserved_avg": 11489715316, + "accelerator_memory_max": 21596471296, + "accelerator_memory_reserved_99th": 17291123097, + "train_time": 1819.9693055349999, + "file_size": 6821968, + "num_trainable_params": 129024, + "num_total_params": 3212878848, + "status": "success", + "metrics": [ + { + "step": 250, + "valid accuracy": 0.0, + "train loss": 1.3017588877677917, + "train samples": 1000, + "train time": 32.843521857023006, + "eval time": 11.480974874997628, + "tokens 
/ sec": 6446.294064372017, + "mem allocated avg": 6784826523.648, + "mem reserved avg": 11538438029.312, + "elapsed time": 95.45296428899746 + }, + { + "step": 500, + "valid accuracy": 0.28, + "train loss": 1.0202219936847687, + "train samples": 2000, + "train time": 32.35236015598639, + "eval time": 11.4980273259971, + "tokens / sec": 6429.051821788439, + "mem allocated avg": 6777359808.512, + "mem reserved avg": 11429948162.048, + "elapsed time": 183.95939499299857 + }, + { + "step": 750, + "valid accuracy": 0.38, + "train loss": 0.8040032889842987, + "train samples": 3000, + "train time": 32.52055500800634, + "eval time": 11.426841341002728, + "tokens / sec": 6592.784162115804, + "mem allocated avg": 6787965165.568, + "mem reserved avg": 11585061912.576, + "elapsed time": 272.8589564269969 + }, + { + "step": 1000, + "valid accuracy": 0.3, + "train loss": 0.7544035723209381, + "train samples": 4000, + "train time": 32.27830113501477, + "eval time": 11.54098314699877, + "tokens / sec": 6454.3669485133405, + "mem allocated avg": 6779215933.44, + "mem reserved avg": 11460172316.672, + "elapsed time": 361.1500098109973 + }, + { + "step": 1250, + "valid accuracy": 0.44, + "train loss": 0.7379197257757187, + "train samples": 5000, + "train time": 32.060909217962035, + "eval time": 11.406497389998549, + "tokens / sec": 6504.431879404317, + "mem allocated avg": 6779128844.288, + "mem reserved avg": 11454770053.12, + "elapsed time": 449.3482204989996 + }, + { + "step": 1500, + "valid accuracy": 0.4, + "train loss": 0.7252234178781509, + "train samples": 6000, + "train time": 31.98088176901365, + "eval time": 11.480169268001191, + "tokens / sec": 6545.504326988923, + "mem allocated avg": 6780286265.344, + "mem reserved avg": 11479667441.664, + "elapsed time": 537.3097453219998 + }, + { + "step": 1750, + "valid accuracy": 0.4, + "train loss": 0.7148357192277909, + "train samples": 7000, + "train time": 32.29452324002341, + "eval time": 11.44221062500219, + "tokens / sec": 
6482.678144650271, + "mem allocated avg": 6782215264.256, + "mem reserved avg": 11493600919.552, + "elapsed time": 625.780868398997 + }, + { + "step": 2000, + "valid accuracy": 0.4, + "train loss": 0.7139411936998368, + "train samples": 8000, + "train time": 32.33002986999054, + "eval time": 11.472246884000924, + "tokens / sec": 6424.243987253105, + "mem allocated avg": 6778636718.08, + "mem reserved avg": 11439217573.888, + "elapsed time": 714.3076436519987 + }, + { + "step": 2250, + "valid accuracy": 0.38, + "train loss": 0.7067342863082886, + "train samples": 9000, + "train time": 32.69249906902769, + "eval time": 11.424881449998793, + "tokens / sec": 6574.841511692143, + "mem allocated avg": 6789716504.576, + "mem reserved avg": 11617542602.752, + "elapsed time": 803.4051666009982 + }, + { + "step": 2500, + "valid accuracy": 0.4, + "train loss": 0.7048580280542374, + "train samples": 10000, + "train time": 31.796681229010574, + "eval time": 11.401134708998143, + "tokens / sec": 6477.625715607085, + "mem allocated avg": 6775192217.6, + "mem reserved avg": 11386755219.456, + "elapsed time": 890.7853266579987 + }, + { + "step": 2750, + "valid accuracy": 0.36, + "train loss": 0.6994425257444382, + "train samples": 11000, + "train time": 32.589996781029186, + "eval time": 6.453208308001194, + "tokens / sec": 6501.412118068606, + "mem allocated avg": 6785945655.296, + "mem reserved avg": 11552530890.752, + "elapsed time": 974.6122346880002 + }, + { + "step": 3000, + "valid accuracy": 0.4, + "train loss": 0.6912879683971405, + "train samples": 12000, + "train time": 32.34826778500428, + "eval time": 11.457833226999355, + "tokens / sec": 6452.617536966281, + "mem allocated avg": 6780318763.008, + "mem reserved avg": 11474030297.088, + "elapsed time": 1062.897270567999 + }, + { + "step": 3250, + "valid accuracy": 0.4, + "train loss": 0.700449936747551, + "train samples": 13000, + "train time": 32.51472582996939, + "eval time": 8.004199169998174, + "tokens / sec": 
6486.322569744963, + "mem allocated avg": 6782387701.76, + "mem reserved avg": 11501452656.64, + "elapsed time": 1148.3985279560002 + }, + { + "step": 3500, + "valid accuracy": 0.36, + "train loss": 0.6886729755401612, + "train samples": 14000, + "train time": 32.572147220984334, + "eval time": 11.456443364000734, + "tokens / sec": 6439.550901479111, + "mem allocated avg": 6781381988.352, + "mem reserved avg": 11484943876.096, + "elapsed time": 1237.2252680229976 + }, + { + "step": 3750, + "valid accuracy": 0.38, + "train loss": 0.6851948540210724, + "train samples": 15000, + "train time": 32.8770313250061, + "eval time": 8.042231839001033, + "tokens / sec": 6591.318962402083, + "mem allocated avg": 6791807023.104, + "mem reserved avg": 11653781389.312, + "elapsed time": 1323.4750151669978 + }, + { + "step": 4000, + "valid accuracy": 0.36, + "train loss": 0.7032276903390884, + "train samples": 16000, + "train time": 31.65130396198947, + "eval time": 7.9955749260007, + "tokens / sec": 6457.016754994822, + "mem allocated avg": 6773653422.08, + "mem reserved avg": 11367989903.36, + "elapsed time": 1407.2714081800004 + }, + { + "step": 4250, + "valid accuracy": 0.36, + "train loss": 0.684476065993309, + "train samples": 17000, + "train time": 32.02934406197164, + "eval time": 8.007123895000404, + "tokens / sec": 6599.854170943876, + "mem allocated avg": 6784119472.128, + "mem reserved avg": 11519949537.28, + "elapsed time": 1492.0019941529972 + }, + { + "step": 4500, + "valid accuracy": 0.38, + "train loss": 0.6939880999326706, + "train samples": 18000, + "train time": 31.936327281997364, + "eval time": 9.855819755000994, + "tokens / sec": 6507.260467522446, + "mem allocated avg": 6777879162.88, + "mem reserved avg": 11436331892.736, + "elapsed time": 1578.2498042659972 + }, + { + "step": 4750, + "valid accuracy": 0.36, + "train loss": 0.68637368786335, + "train samples": 19000, + "train time": 32.33460194401778, + "eval time": 6.469711448000453, + "tokens / sec": 
6492.704019164238, + "mem allocated avg": 6781104441.344, + "mem reserved avg": 11484004352.0, + "elapsed time": 1662.171022565999 + }, + { + "step": 5000, + "valid accuracy": 0.38, + "train loss": 0.6926896897554398, + "train samples": 20000, + "train time": 32.14674746405217, + "eval time": 8.441190715999255, + "tokens / sec": 6479.038049896257, + "mem allocated avg": 6777818853.376, + "mem reserved avg": 11434117300.224, + "elapsed time": 1747.4833575960001 + }, + { + "step": 5000, + "test accuracy": 0.3684609552691433, + "train loss": 0.6926896897554398, + "train samples": 20000, + "train total tokens": 4198051 + } + ] + }, + "meta_info": { + "model_info": { + "sha": "13afe5124825b4f3751f836b40dafda64c1ed062", + "created_at": "2024-09-18T15:23:48+00:00" + }, + "dataset_info": { + "metamath": { + "sha": "aa4f34d3d2d3231299b5b03d9b3e5a20da45aa18", + "created_at": "2023-09-21T17:22:46+00:00" + }, + "gsm8k": { + "sha": "e53f048856ff4f594e959d75785d2c2d37b678ee", + "created_at": "2022-04-12T10:22:10+00:00" + } + }, + "package_info": { + "transformers-version": "4.52.4", + "transformers-commit-hash": null, + "peft-version": "0.15.2.dev0", + "peft-commit-hash": "5fe7f8f8abe914d313fc3751f2ea92de7718fbaf", + "datasets-version": "3.6.0", + "datasets-commit-hash": null, + "bitsandbytes-version": "0.46.0", + "bitsandbytes-commit-hash": null, + "torch-version": "2.7.1+cu126", + "torch-commit-hash": null + }, + "system_info": { + "system": "Linux", + "release": "6.8.0-1029-aws", + "version": "#31-Ubuntu SMP Wed Apr 23 18:42:41 UTC 2025", + "machine": "x86_64", + "processor": "x86_64", + "accelerator": "NVIDIA L40S" + }, + "pytorch_info": "PyTorch built with:\n - GCC 11.2\n - C++ Version: 201703\n - Intel(R) oneAPI Math Kernel Library Version 2024.2-Product Build 20240605 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v3.7.1 (Git Hash 8d263e693366ef8db40acc569cc7d8edf644556d)\n - OpenMP 201511 (a.k.a. 
OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 12.6\n - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n - CuDNN 90.7.1 (built against CUDA 12.8)\n - Built with CuDNN 90.5.1\n - Magma 2.6.1\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, COMMIT_SHA=e2d141dbde55c2a4370fac5165b0561b6af4798b, CUDA_VERSION=12.6, CUDNN_VERSION=9.5.1, CXX_COMPILER=/opt/rh/gcc-toolset-11/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=1 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=range-loop-construct -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-unknown-pragmas -Wno-unused-parameter -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, TORCH_VERSION=2.7.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, \n" + } +} diff --git a/peft/method_comparison/MetaMathQA/run.py b/peft/method_comparison/MetaMathQA/run.py new file mode 100644 index 0000000000000000000000000000000000000000..0d159220eb881ade49ff4fb8ad5d12d8cd7e9c69 --- /dev/null +++ b/peft/method_comparison/MetaMathQA/run.py @@ -0,0 +1,473 @@ +# 
Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Main entry point to run the experiments. Contains general setup and the proper training code. +""" + +import argparse +import datetime as dt +import gc +import json +import os +import random +import sys +import textwrap +import time +from contextlib import AbstractContextManager, nullcontext +from functools import partial +from typing import Any, Callable, Literal, Optional + +import torch +from torch import nn +from torch.amp import GradScaler, autocast +from tqdm import tqdm +from transformers import GenerationConfig, set_seed +from utils import ( + FILE_NAME_TRAIN_PARAMS, + BucketIterator, + TrainResult, + TrainStatus, + get_accuracy, + get_base_model_info, + get_dataset_info, + get_file_size, + get_model, + get_optimizer_and_scheduler, + get_peft_branch, + get_tokenizer, + get_train_config, + init_accelerator, + log_results, + validate_experiment_path, +) + +from data import get_train_valid_test_datasets +from peft import AdaLoraConfig, PeftConfig +from peft.utils import infer_device, CONFIG_NAME + + +# # suppress all warnings +# warnings.filterwarnings("ignore") # FIXME? 
+ +dtype_to_bytes_linear = {"float32": 4, "float16": 2, "bfloat16": 2, "int8": 1, "int4": 0.5} +# if lr scheduler with warmup is used, the ratio of warmup steps to total steps +BUCKET_FACTOR = 20 # number of batches per bucket, increasing this further has diminishing returns + + +def get_generation_config(*, seq_len, generate_kwargs) -> GenerationConfig: + # filter out None values so that we don't depend on setting correct defaults in the config + generation_kwargs = {k: v for k, v in generate_kwargs.items() if v is not None} + if ("max_length" in generation_kwargs) and ("max_new_tokens" in generation_kwargs): + # transformers does not support setting both max_length and max_new_tokens, but what we want in this case is to + # take the smaller of the two values + new_max_length = min(generation_kwargs["max_new_tokens"] + seq_len, generation_kwargs["max_length"]) + del generation_kwargs["max_new_tokens"] + generation_kwargs["max_length"] = new_max_length + generation_config = GenerationConfig(**generate_kwargs) + return generation_config + + +def evaluate(model, tokenizer, ds, batch_size, generate_kwargs, use_tqdm: bool = False) -> tuple[list[str], list[str]]: + with torch.inference_mode(): + predictions = [] + responses = [] + pbar = range(0, len(ds), batch_size) + if use_tqdm: + pbar = tqdm(pbar) + for j in pbar: + sliced = ds[j : j + batch_size] + responses += sliced.pop("response") + batch = tokenizer.pad(sliced, return_tensors="pt", padding_side="left").to(model.device) + seq_len = batch["input_ids"].shape[1] + generation_config = get_generation_config(seq_len=seq_len, generate_kwargs=generate_kwargs) + outputs = model.generate(**batch, generation_config=generation_config, pad_token_id=tokenizer.eos_token_id) + predictions += tokenizer.batch_decode(outputs, skip_special_tokens=True) + return predictions, responses + + +class DummyGradScaler: + # if no mixed precision is being used + def scale(self, loss): + return loss + + def unscale_(self, optimizer): + pass + 
+ def step(self, optimizer): + optimizer.step() + + def update(self): + pass + + +def train( + *, + model: nn.Module, + max_steps: int, + batch_size: int, + batch_size_eval: int, + tokenizer: Any, + accelerator_memory_init: int, + eval_steps: int, + generation_kwargs: dict[str, Any], + grad_norm_clip: float, + optimizer_type: str, + optimizer_kwargs: dict[str, Any], + query_template: str, + lr_scheduler_arg: Optional[Literal["cosine"]], + use_amp: bool, + is_adalora: bool, +) -> TrainResult: + accelerator_memory_allocated_log = [] + accelerator_memory_reserved_log = [] + losses = [] + durations = [] + metrics = [] + sample = 0 # keep count of the current sample + total_samples = 0 # total number of samples over all epochs + total_tokens = [] # total number of tokens over all epochs + + device_type = infer_device() + torch_accelerator_module = getattr(torch, device_type, torch.cuda) + if use_amp: + grad_scaler: GradScaler | DummyGradScaler = GradScaler(device=device_type) + autocast_ctx: Callable[[], ContextManager[Any]] = partial(autocast, device_type=device_type) + else: + grad_scaler = DummyGradScaler() + autocast_ctx = nullcontext + + optimizer, lr_scheduler = get_optimizer_and_scheduler( + model, + optimizer_type=optimizer_type, + max_steps=max_steps, + lr_scheduler_arg=lr_scheduler_arg, + **optimizer_kwargs, + ) + # print this after getting the optimizer, in case it modifies requires_gard + if hasattr(model, "get_nb_trainable_parameters"): + num_trainable_params, num_params = model.get_nb_trainable_parameters() + else: + num_params = model.num_parameters() + num_trainable_params = num_params + print_verbose( + f"trainable params: {num_trainable_params:,d} || all params: {num_params:,d} || " + f"trainable: {100 * num_trainable_params / num_params:.4f}%" + ) + + status = TrainStatus.FAILED + tic_train = time.perf_counter() + eval_time = 0.0 + error_msg = "" + + ds_train, ds_valid, ds_test = get_train_valid_test_datasets( + tokenizer=tokenizer, 
query_template=query_template, print_fn=print_verbose + ) + # note: bucketing by length is only really worth it for the train dataset, since it's length is big compared to the + # batch size + iterator_train = BucketIterator( + ds_train, + batch_size=batch_size, + bucket_factor=BUCKET_FACTOR, + delete_cols=["response"], + ) + try: + pbar = tqdm(range(1, max_steps + 1)) + for step, batch in zip(pbar, iterator_train): + tic = time.perf_counter() + + # create the batch + tokens_per_sample = [len(i) for i in batch["input_ids"]] + total_tokens.append(sum(tokens_per_sample) + len(tokens_per_sample)) # add EOS token + batch = tokenizer.pad(batch, return_tensors="pt").to(model.device) + actual_batch_size = len(batch["input_ids"]) + total_samples += actual_batch_size + sample += batch_size + if sample >= len(ds_train): # new epoch + sample = 0 + + # add labels, they are automatically shifted by transformers + labels = batch["input_ids"].clone() + # We want to ignore the padding tokens except for the first EOS token; if we don't ignore them, the loss + # will be dominated by padding tokens; if we ignore all, the model will not learn to predict the EOS token. + # TODO: Note that the longest sequence in the batch won't have any PAD/EOS token at the end, this is fine if + # the batch size is > 1 but should still be fixed eventually. 
+ for i, num_tokens in enumerate(tokens_per_sample): + labels[i, num_tokens + 1 :] = -100 + batch["labels"] = labels + num_items_in_batch = batch["attention_mask"].sum().item() + + # train step + optimizer.zero_grad() + with autocast_ctx(): + outputs = model(**batch, num_items_in_batch=num_items_in_batch) + loss = outputs.loss + grad_scaler.scale(loss).backward() + if grad_norm_clip: + grad_scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), grad_norm_clip) + grad_scaler.step(optimizer) + grad_scaler.update() + lr_scheduler.step() + + if is_adalora: + model.base_model.update_and_allocate(step) + + losses.append(loss.item()) + pbar.set_postfix({"loss": loss.item()}) + accelerator_memory_allocated_log.append( + torch_accelerator_module.memory_allocated() - accelerator_memory_init + ) + accelerator_memory_reserved_log.append( + torch_accelerator_module.memory_reserved() - accelerator_memory_init + ) + toc = time.perf_counter() + durations.append(toc - tic) + + # every couple of steps, evaluate; this can be slow due to generation + if step % eval_steps == 0: + tic_eval = time.perf_counter() + loss_avg = sum(losses[-eval_steps:]) / eval_steps + memory_allocated_avg = sum(accelerator_memory_allocated_log[-eval_steps:]) / eval_steps + memory_reserved_avg = sum(accelerator_memory_reserved_log[-eval_steps:]) / eval_steps + token_sum = sum(total_tokens[-eval_steps:]) + dur_train = sum(durations[-eval_steps:]) + tokens_per_sec = token_sum / dur_train + + model.eval() + predictions, responses = evaluate( + model=model, + tokenizer=tokenizer, + ds=ds_valid, + batch_size=batch_size_eval, + generate_kwargs={**generation_kwargs}, + ) + model.train() + + example = random.choice(predictions) + example = textwrap.shorten(example, width=750) + example = textwrap.indent(example, " ") + print_verbose(f"\nExample prediction:\n{example}\n") + accuracy = get_accuracy(predictions=predictions, responses=responses) + num_tokens_generated = sum(sum(mask) for mask in 
tokenizer(predictions)["attention_mask"]) + + toc_eval = time.perf_counter() + dur_eval = toc_eval - tic_eval + eval_time += toc_eval - tic_eval + elapsed = time.perf_counter() - tic_train + + metrics.append( + { + "step": step, + "valid accuracy": accuracy, + "train loss": loss_avg, + "train samples": total_samples, + "train time": dur_train, + "eval time": dur_eval, + "tokens / sec": tokens_per_sec, + "mem allocated avg": memory_allocated_avg, + "mem reserved avg": memory_reserved_avg, + "elapsed time": elapsed, + } + ) + + log_dict = { + "step": f"{step:5d}", + "samples": f"{total_samples:7d}", + "lr": f"{lr_scheduler.get_last_lr()[0]:.2e}", + "loss avg": f"{loss_avg:.4f}", + "valid acc": f"{accuracy:.3f}", + "gen valid tokens": num_tokens_generated, + "train time": f"{dur_train:.1f}s", + "eval time": f"{dur_eval:.1f}s", + "train tokens / sec": f"{tokens_per_sec:.0f}", + "mem allocated": f"{memory_allocated_avg:.0f}", + "mem reserved": f"{memory_reserved_avg:.0f}", + "elapsed time": f"{elapsed // 60:.0f}min {elapsed % 60:.0f}s", + } + print_verbose(json.dumps(log_dict)) + + # # TODO is this needed? 
def main(*, path_experiment: str, experiment_name: str, clean: bool) -> None:
    """Run a single experiment: load the configs, train, measure the checkpoint size, log the results.

    Args:
        path_experiment: Directory of the experiment. A PEFT config is loaded from it if present (otherwise full
            fine-tuning is performed), as well as the experiment-specific training parameters.
        experiment_name: Name under which the results of this run are logged.
        clean: Passed on to ``get_file_size``.

    If the training result has the status ``TrainStatus.FAILED``, nothing is logged and the process exits with
    code 1.
    """
    tic_total = time.perf_counter()
    start_date = dt.datetime.now(tz=dt.timezone.utc).replace(microsecond=0).isoformat()

    # only runs on the 'main' branch of PEFT count as "real" runs, everything else is a test run
    branch = get_peft_branch()
    if branch != "main":
        print_verbose(f"===== This experiment is categorized as a TEST run because the PEFT branch is '{branch}' ======")
    else:
        print_verbose("===== This experiment is categorized as a MAIN run because the PEFT branch is 'main' ======")

    # load configs; a missing PEFT config means that the whole model is trained
    has_peft_config = os.path.exists(os.path.join(path_experiment, CONFIG_NAME))
    peft_config: Optional[PeftConfig] = PeftConfig.from_pretrained(path_experiment) if has_peft_config else None
    if not has_peft_config:
        print_verbose(f"Could not find PEFT config at {path_experiment}, performing FULL FINETUNING")
    train_config = get_train_config(os.path.join(path_experiment, FILE_NAME_TRAIN_PARAMS))
    set_seed(train_config.seed)

    # initialize objects
    accelerator_memory_init = init_accelerator()
    tokenizer = get_tokenizer(model_id=train_config.model_id, max_seq_length=train_config.max_seq_length)

    model_info = get_base_model_info(train_config.model_id)
    datasets_info = {
        "metamath": get_dataset_info("meta-math/MetaMathQA"),
        "gsm8k": get_dataset_info("openai/gsm8k"),
    }
    model = get_model(
        model_id=train_config.model_id,
        dtype=train_config.dtype,
        compile=train_config.compile,
        attn_implementation=train_config.attn_implementation,
        peft_config=peft_config,
        autocast_adapter_dtype=train_config.autocast_adapter_dtype,
    )
    print_verbose(model)

    # train model
    train_result = train(
        model=model,
        max_steps=train_config.max_steps,
        batch_size=train_config.batch_size,
        batch_size_eval=train_config.batch_size_eval,
        tokenizer=tokenizer,
        accelerator_memory_init=accelerator_memory_init,
        eval_steps=train_config.eval_steps,
        generation_kwargs=train_config.generation_kwargs,
        grad_norm_clip=train_config.grad_norm_clip,
        optimizer_type=train_config.optimizer_type,
        optimizer_kwargs=train_config.optimizer_kwargs,
        query_template=train_config.query_template,
        lr_scheduler_arg=train_config.lr_scheduler,
        use_amp=train_config.use_amp,
        is_adalora=isinstance(peft_config, AdaLoraConfig),
    )

    if train_result.status == TrainStatus.FAILED:
        print_verbose("Training failed, not logging results")
        sys.exit(1)

    file_size = get_file_size(model, peft_config=peft_config, clean=clean, print_fn=print_verbose)

    # log results: print and save to file
    log_results(
        experiment_name=experiment_name,
        train_result=train_result,
        accelerator_memory_init=accelerator_memory_init,
        time_total=time.perf_counter() - tic_total,
        file_size=file_size,
        model_info=model_info,
        datasets_info=datasets_info,
        start_date=start_date,
        train_config=train_config,
        peft_config=peft_config,
        print_fn=print_verbose,
    )
2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +All utilities not related to data handling. +""" + +import enum +import json +import os +import platform +import subprocess +import tempfile +import warnings +from dataclasses import asdict, dataclass +from decimal import Decimal, DivisionByZero, InvalidOperation +from typing import Any, Callable, Literal, Optional + +import bitsandbytes +import datasets +import huggingface_hub +import numpy as np +import torch +import transformers +from torch import nn +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + get_cosine_schedule_with_warmup, +) + +import peft +from peft import PeftConfig, get_peft_model, prepare_model_for_kbit_training +from peft.optimizers import create_lorafa_optimizer, create_loraplus_optimizer +from peft.utils import infer_device, SAFETENSORS_WEIGHTS_NAME + +device = infer_device() + +if device not in ["cuda", "xpu"]: + raise RuntimeError("CUDA or XPU is not available, currently only CUDA or XPU is supported") + +ACCELERATOR_MEMORY_INIT_THRESHOLD = 500 * 2**20 # 500MB +FILE_NAME_DEFAULT_TRAIN_PARAMS = os.path.join(os.path.dirname(__file__), "default_training_params.json") +FILE_NAME_TRAIN_PARAMS = "training_params.json" # specific params for this experiment +# main results +RESULT_PATH = os.path.join(os.path.dirname(__file__), "results") +# testing results +RESULT_PATH_TEST = os.path.join(os.path.dirname(__file__), 
"temporary_results") +# cancelled results +RESULT_PATH_CANCELLED = os.path.join(os.path.dirname(__file__), "cancelled_results") +hf_api = huggingface_hub.HfApi() +WARMUP_STEP_RATIO = 0.1 + + +@dataclass +class TrainConfig: + """All configuration parameters associated with training the model + + Args: + model_id: The model identifier + dtype: The data type to use for the model + max_seq_length: The maximum sequence length + batch_size: The batch size for training + batch_size_eval: The batch size for eval/test, can be much higher than for training + max_steps: The maximum number of steps to train for + eval_steps: The number of steps between evaluations + compile: Whether to compile the model + query_template: The template for the query + seed: The random seed + grad_norm_clip: The gradient norm clipping value (set to 0 to skip) + optimizer_type: The name of a torch optimizer (e.g. AdamW) or a PEFT method ("lora+", "lora-fa") + optimizer_kwargs: The optimizer keyword arguments (lr etc.) + lr_scheduler: The learning rate scheduler (currently only None or 'cosine' are supported) + use_amp: Whether to use automatic mixed precision + autocast_adapter_dtype: Whether to cast adapter dtype to float32, same argument as in PEFT + generation_kwargs: Arguments passed to transformers GenerationConfig (used in evaluation) + attn_implementation: The attention implementation to use (if any), see transformers docs + """ + + model_id: str + dtype: Literal["float32", "float16", "bfloat16", "int8", "int4"] + max_seq_length: int + batch_size: int + batch_size_eval: int + max_steps: int + eval_steps: int + compile: bool + query_template: str + seed: int + grad_norm_clip: float # set to 0 to skip + optimizer_type: str + optimizer_kwargs: dict[str, Any] + lr_scheduler: Optional[Literal["cosine"]] + use_amp: bool + autocast_adapter_dtype: bool + generation_kwargs: dict[str, Any] + attn_implementation: Optional[str] + + def __post_init__(self) -> None: + if not isinstance(self.model_id, 
def validate_experiment_path(path: str) -> str:
    """Check that ``path`` is a valid experiment directory and return the experiment name.

    The path must consist of exactly three components, the first one being ``experiments``, e.g.
    ``experiments/lora/rank32``. Equivalent spellings such as ``./experiments/lora/rank32`` or a trailing path
    separator are accepted, because the path is normalized before its structure is checked.

    The experiment directory should contain:
    - adapter_config.json
    - optional: training_params.json

    Args:
        path: Path to the experiment directory.

    Returns:
        The experiment name, i.e. the last two path components joined by the path separator (``lora/rank32`` for
        the example above on POSIX).

    Raises:
        FileNotFoundError: If the default training params file or ``path`` itself does not exist.
        ValueError: If the path does not have the expected structure.
    """
    if not os.path.exists(FILE_NAME_DEFAULT_TRAIN_PARAMS):
        raise FileNotFoundError(
            f"Missing default training params file '{FILE_NAME_DEFAULT_TRAIN_PARAMS}' in the ./experiments directory"
        )
    if not os.path.exists(path):
        raise FileNotFoundError(f"Path {path} does not exist")

    # check path structure; normpath strips a leading "./", trailing separators and duplicate separators, so that
    # the documented form "./experiments/lora/rank32" is not rejected because of the extra "." component
    path_parts = os.path.normpath(path).split(os.path.sep)
    if (len(path_parts) != 3) or (path_parts[0] != "experiments"):
        raise ValueError(
            f"Path {path} does not have the correct structure, should be ./experiments//"
        )

    experiment_name = os.path.join(*path_parts[-2:])
    return experiment_name
def get_base_model(
    *,
    model_id: str,
    dtype: Literal["float32", "float16", "bfloat16", "int8", "int4"],
    compile: bool,
    attn_implementation: Optional[str],
) -> nn.Module:
    """Load the base causal language model in the requested precision.

    Args:
        model_id: Identifier passed to ``AutoModelForCausalLM.from_pretrained``.
        dtype: ``"int4"``/``"int8"`` load the model with a bitsandbytes quantization config and prepare it for k-bit
            training; ``"bfloat16"``/``"float16"`` set the corresponding torch dtype; ``"float32"`` uses the
            defaults of ``from_pretrained``.
        compile: Whether to wrap the model with ``torch.compile``.
        attn_implementation: Forwarded to ``from_pretrained`` unchanged.

    Returns:
        The loaded (and possibly prepared and compiled) model.

    Raises:
        ValueError: If ``dtype`` is not one of the supported values.
    """
    is_quantized = dtype in ["int8", "int4"]

    load_kwargs: dict[str, Any] = {
        "pretrained_model_name_or_path": model_id,
        "device_map": device,
        "attn_implementation": attn_implementation,
    }
    if is_quantized:
        if dtype == "int4":
            load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)
        else:
            load_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
    elif dtype in ["bfloat16", "float16"]:
        load_kwargs["torch_dtype"] = torch.bfloat16 if dtype == "bfloat16" else torch.float16
    elif dtype != "float32":
        raise ValueError(f"Invalid dtype: {dtype}")

    model = AutoModelForCausalLM.from_pretrained(**load_kwargs)

    if is_quantized:
        model = prepare_model_for_kbit_training(model)

    return torch.compile(model) if compile else model
class BucketIterator:
    """
    Iterator that yields batches of data from a torch Dataset, grouped in buckets by sequence length

    The iterator will yield batches of size `batch_size`, where the samples in each batch are sorted by sequence length.
    This is done to minimize the amount of padding required for each batch. To avoid sorting the entire dataset and thus
    introducing a bias, the dataset is first split into buckets of size `batch_size * bucket_factor`.

    Every sample is yielded exactly once per pass over the iterator. If the dataset size is not a multiple of the
    bucket size, the last bucket (and possibly the last batch) is smaller.

    Args:
        ds: The torch Dataset to iterate over. It must support ``len`` and slicing; a slice must return a mapping
            from column name to a sequence of values and contain the column ``"input_ids"``.
        batch_size: The batch size
        bucket_factor: The factor by which to multiply the batch size to determine the bucket size
        delete_cols: The columns to delete from the dataset before yielding a batch
    """

    def __init__(self, ds, *, batch_size: int, bucket_factor: int, delete_cols: list[str]) -> None:
        self.ds = ds
        self.batch_size = batch_size
        self.bucket_factor = bucket_factor
        self.delete_cols = set(delete_cols)

        assert self.bucket_factor > 0, "bucket_factor must be greater than 0"

    def _batch_iterator(self, bucket):
        """Yield the batches of a single bucket, longest samples first."""
        tokens_per_sample_bucket = torch.tensor([len(i) for i in bucket["input_ids"]])
        # sort long to short instead to encounter possible OOM errors as early as possible
        order = torch.argsort(tokens_per_sample_bucket, descending=True).tolist()
        cls = type(bucket)  # conserve the type returned by the ds
        bucket = {k: [v[i] for i in order] for k, v in bucket.items() if k not in self.delete_cols}
        # use the length of the ordering so that this also works if "input_ids" is one of the deleted columns
        num_samples = len(order)
        for j in range(0, num_samples, self.batch_size):
            batch = {k: v[j : j + self.batch_size] for k, v in bucket.items()}
            yield cls(batch)

    def __iter__(self):
        bucket_size = self.batch_size * self.bucket_factor
        # The last slice is simply shorter if len(ds) is not a multiple of bucket_size, so the remainder is already
        # covered by this loop. Yielding it a second time would duplicate the last samples in every pass.
        for i in range(0, len(self.ds), bucket_size):
            bucket = self.ds[i : i + bucket_size]
            yield from self._batch_iterator(bucket)
def parse_answer(text: str) -> Optional[str]:
    r"""
    Extract the final answer from a label or a prediction.

    Such a text can look like this:

    Question: If the magnitude of vector v is equal to 4, what is the dot product of vector v with itself?. Think step
    by step
    Answer: The dot product of a vector with itself is equal to the square of its magnitude. So, the dot product of
    vector v with itself is equal to $4^2 = \boxed{16}$.The answer is: 16

    For this example, '16' is returned. If none of the known answer delimiters occurs in the text, None is returned.

    """
    # This implementation is based on sampling meta-llama/Llama-3.1-8B-Instruct. It may not work for other models.
    known_delimiters = (
        # MetaMath:
        "The answer is: ",
        "The answer is ",
        "The final answer is: ",
        "The final answer is ",
        # GSM8K:
        "#### ",
    )
    cleaned = text.strip().rstrip(".!?")

    # the first delimiter (in the order given above) that occurs anywhere in the text wins
    found = next((candidate for candidate in known_delimiters if candidate in cleaned), None)
    if found is None:  # no match
        return None

    # everything after the last occurrence of the delimiter
    _, _, answer = cleaned.rpartition(found)
    # if a new paragraph follows after the final answer, we want to remove it
    first_line = answer.strip().split("\n", 1)[0]
    # note: we can just remove % here since the GSM8K dataset just omits it, i.e. 50% -> 50, no need to divide by 100
    return first_line.strip(" .!?$%")
def get_accuracy(*, predictions: list[str], responses: list[str]) -> float:
    """Compute the fraction of predictions whose parsed answer equals the parsed answer of the response.

    Both texts are run through ``parse_answer``. If the extracted answer can be converted to a number, the numbers
    are compared as floats, otherwise the extracted strings are compared. A prediction from which no answer can be
    extracted counts as wrong.

    Args:
        predictions: The generated texts.
        responses: The reference texts, one per prediction.

    Returns:
        The accuracy as a float between 0 and 1.

    Raises:
        ValueError: If the two lists differ in length, if they are empty, or if no answer can be extracted from one
            of the responses.
    """
    if len(predictions) != len(responses):
        raise ValueError(f"Prediction length mismatch: {len(predictions)} != {len(responses)}")
    if not predictions:
        # an accuracy over zero samples is undefined; fail with a clear message instead of a ZeroDivisionError
        raise ValueError("Cannot compute accuracy without any predictions")

    num_correct = 0
    for prediction, response in zip(predictions, responses):
        parsed_prediction = parse_answer(prediction)
        parsed_response = parse_answer(response)
        if parsed_response is None:
            raise ValueError(f"Error encountered while trying to parse response: {response}")

        decimal_prediction = convert_to_decimal(parsed_prediction)
        decimal_answer = convert_to_decimal(parsed_response)

        # we convert decimals to float so that stuff like this works:
        # float(convert_to_decimal('20/35')) == float(convert_to_decimal('0.5714285714285714'))
        # non-numeric answers fall back to the parsed string; y_pred stays None if nothing could be parsed
        y_pred = float(decimal_prediction) if decimal_prediction is not None else parsed_prediction
        # the fallback depends on the *response* (which is never None at this point), not on the prediction
        y_true = float(decimal_answer) if decimal_answer is not None else parsed_response

        if (y_pred is not None) and (y_true == y_pred):
            num_correct += 1

    accuracy = num_correct / len(predictions)
    return accuracy
def log_to_console(log_data: dict[str, Any], print_fn: Callable[..., None]) -> None:
    """Print a short summary of a run: accelerator memory, timings, and checkpoint size.

    Args:
        log_data: The collected log data; the keys ``accelerator_memory_max``, ``accelerator_memory_reserved_avg``,
            ``accelerator_memory_reserved_99th``, ``train_time`` and ``file_size`` are read from
            ``log_data["train_info"]`` and ``total_time`` is read from ``log_data["run_info"]``.
        print_fn: Called once per summary line with the formatted string.
    """
    mebibyte = 2**20

    # read everything up front so that missing keys are reported before anything is printed
    train_info = log_data["train_info"]
    memory_max = train_info["accelerator_memory_max"]
    memory_reserved_avg = train_info["accelerator_memory_reserved_avg"]
    memory_reserved_99th = train_info["accelerator_memory_reserved_99th"]
    train_seconds = train_info["train_time"]
    total_seconds = log_data["run_info"]["total_time"]
    checkpoint_bytes = train_info["file_size"]

    # memory is reported in whole MB, the checkpoint size with one decimal
    print_fn(f"accelerator memory max: {memory_max // mebibyte}MB")
    print_fn(f"accelerator memory reserved avg: {memory_reserved_avg // mebibyte}MB")
    print_fn(f"accelerator memory reserved 99th percentile: {memory_reserved_99th // mebibyte}MB")
    print_fn(f"train time: {train_seconds}s")
    print_fn(f"total time: {total_seconds:.2f}s")
    print_fn(f"file size of checkpoint: {checkpoint_bytes / mebibyte:.1f}MB")
def log_results(
    *,
    experiment_name: str,
    train_result: TrainResult,
    accelerator_memory_init: int,
    time_total: float,
    file_size: int,
    model_info: Optional[huggingface_hub.ModelInfo],
    datasets_info: dict[str, Optional[huggingface_hub.DatasetInfo]],
    start_date: str,
    train_config: TrainConfig,
    peft_config: Optional[PeftConfig],
    print_fn: Callable[..., None],
) -> None:
    """Collect all information about a run, print a summary, and save everything to a JSON file.

    The directory the log file is written to depends on the outcome of the run: canceled runs go to
    ``RESULT_PATH_CANCELLED``, runs on a PEFT branch other than 'main' go to ``RESULT_PATH_TEST``, successful runs on
    'main' go to ``RESULT_PATH``, and anything else is written to a fresh temporary directory.

    Args:
        experiment_name: Name of the experiment, used for the log and the file name.
        train_result: Result of the training run (status, timings, memory log, metrics, ...).
        accelerator_memory_init: Memory reserved before training; subtracted from the final peak value.
        time_total: Total run time.
        file_size: Size of the checkpoint.
        model_info: Hub info of the base model, or None if it could not be retrieved.
        datasets_info: Hub info per dataset, each may be None if it could not be retrieved.
        start_date: Start of the run; also used as timestamp in the file name of non-main results.
        train_config: The training configuration, logged as a dict.
        peft_config: The PEFT configuration, or None for full fine-tuning.
        print_fn: Function used for status messages.
    """
    # collect results
    device = infer_device()
    torch_accelerator_module = getattr(torch, device, torch.cuda)
    accelerator_memory_final = torch_accelerator_module.max_memory_reserved()
    memory_reserved_log = train_result.accelerator_memory_reserved_log
    if memory_reserved_log:
        accelerator_memory_avg = int(sum(memory_reserved_log) / len(memory_reserved_log))
        accelerator_memory_reserved_99th = int(np.percentile(memory_reserved_log, 99))
    else:
        # The run was aborted before a single step finished (e.g. out of memory on the first batch). We still want
        # the canceled run to be logged instead of crashing with a ZeroDivisionError here.
        accelerator_memory_avg = 0
        accelerator_memory_reserved_99th = 0

    meta_info = get_meta_info()
    if model_info is not None:
        model_sha = model_info.sha
        model_created_at = model_info.created_at.isoformat()
    else:
        model_sha = None
        model_created_at = None

    dataset_info_log = {}
    for key, dataset_info in datasets_info.items():
        if dataset_info is not None:
            dataset_sha = dataset_info.sha
            dataset_created_at = dataset_info.created_at.isoformat()
        else:
            dataset_sha = None
            dataset_created_at = None
        dataset_info_log[key] = {"sha": dataset_sha, "created_at": dataset_created_at}

    peft_branch = get_peft_branch()

    # the status takes precedence over the branch: a canceled run is never stored as a test or main result
    if train_result.status == TrainStatus.CANCELED:
        save_dir = RESULT_PATH_CANCELLED
        print_fn("Experiment run was categorized as canceled")
    elif peft_branch != "main":
        save_dir = RESULT_PATH_TEST
        print_fn(f"Experiment run was categorized as a test run on branch {peft_branch}")
    elif train_result.status == TrainStatus.SUCCESS:
        save_dir = RESULT_PATH
        print_fn("Experiment run was categorized as successful run")
    else:
        save_dir = tempfile.mkdtemp()
        print_fn(f"Experiment could not be categorized, writing results to {save_dir}. Please open an issue on PEFT.")

    if peft_config is None:
        peft_config_dict: Optional[dict[str, Any]] = None
    else:
        peft_config_dict = peft_config.to_dict()
        # sets are not JSON serializable, store them as lists
        for key, value in peft_config_dict.items():
            if isinstance(value, set):
                peft_config_dict[key] = list(value)

    log_data = {
        "run_info": {
            "created_at": start_date,
            "total_time": time_total,
            "experiment_name": experiment_name,
            "peft_branch": peft_branch,
            "train_config": asdict(train_config),
            "peft_config": peft_config_dict,
            "error_msg": train_result.error_msg,
        },
        "train_info": {
            "accelerator_memory_reserved_avg": accelerator_memory_avg,
            "accelerator_memory_max": (accelerator_memory_final - accelerator_memory_init),
            "accelerator_memory_reserved_99th": accelerator_memory_reserved_99th,
            "train_time": train_result.train_time,
            "file_size": file_size,
            "num_trainable_params": train_result.num_trainable_params,
            "num_total_params": train_result.num_total_params,
            "status": train_result.status.value,
            "metrics": train_result.metrics,
        },
        "meta_info": {
            "model_info": {"sha": model_sha, "created_at": model_created_at},
            "dataset_info": dataset_info_log,
            **asdict(meta_info),
        },
    }

    log_to_console(log_data, print_fn=print)  # use normal print to be able to redirect if so desired
    log_to_file(
        log_data=log_data, save_dir=save_dir, experiment_name=experiment_name, timestamp=start_date, print_fn=print_fn
    )
Comparison +sdk: gradio +app_file: app.py +pinned: false +emoji: ⚖️ +--- + +# Comparison of PEFT Methods + +The goal of this project is to provide replicable experiments that produce outcomes allowing us to compare different PEFT methods with one another. This gives you more information to make an informed decision about which methods best fit your use case and what trade-offs to expect. + +Visit our [Gradio Space](https://huggingface.co/spaces/peft-internal-testing/PEFT-method-comparison) to check the results. + +## Community Contributions + +We envision the PEFT method comparison project as an ongoing endeavor with heavy involvement from the community. As maintainers, it is impossible for us to know all the perfect hyperparameters for each method or to predict all the use cases that PEFT users may have. As a consequence, community contributions are very welcome. + +Below, we outline all the ways you can contribute to this project. + +### Creating New Experiments + +Creating a new experiment requires setting up a new PEFT configuration for us to test. This will result in one more data point being added to the total comparison. + +Working on this is especially relevant if: + +1. You are the author of a paper whose method is introduced in PEFT, or worked on the PEFT integration, and know what hyperparameters work best. +2. You have experience with a specific method and want to share your knowledge with the community. + +Of course, you can contribute even without meeting these criteria. Please follow the instructions below. + +#### How to Add New Experiments + +Start by navigating to one of the existing experiment folders, e.g. `peft/method_comparison/MetaMathQA`, if your experiment involves using the [MetaMathQA dataset](https://huggingface.co/datasets/meta-math/MetaMathQA). There, create a new directory inside the `experiments/` folder using a descriptive name. 
For example, if you want to test LoRA with rank 123 using Llama-3.2 3B as the base model, you could name the folder `experiments/lora/llama-3.2-3B-rank123`. + +Inside this directory, you will find a default configuration file called `default_training_params.json`, which contains the default parameters used in the `run.py` training script. Create a new JSON file containing all the parameters you want to modify compared to the defaults, and save it as `training_params.json` in the newly created folder. If you are satisfied with all the default training parameters, you can skip this step. + +Finally, you need to create a PEFT configuration file for the PEFT method you want to add. This should be a JSON file called `adapter_config.json`, placed in the same directory. Below is an example of how this could look: + +```python +from peft import LoraConfig +config = LoraConfig(r=123) +config.save_pretrained("experiments/lora/llama-3.2-3B-rank123/") +``` + +Once you've created the configuration files for your experiment, please [create a PR on PEFT](https://github.com/huggingface/peft/pulls). After it is reviewed and merged, we will run it on our hardware to ensure that the results are comparable. Of course, it is best if you run the experiment at least once on your hardware to verify that the proposed settings work well. + +#### Considerations When Adding New Experiments + +When adding a new experiment, please consider the following points: + +1. Avoid changing too many training parameters at once, as this would make it difficult to compare results with existing ones. For example, if all existing results were created with 5000 training steps but your result uses 10000 steps, it would be unclear whether an improvement in the test score is due to the PEFT method itself or simply due to longer training. Similarly, using a completely different base model, especially if it is significantly more capable, does not contribute to a fair comparison. +2. 
Avoid suggesting configurations that are very close to existing ones. For example, if there is already an experiment with LoRA and rank 123, do not add an experiment with LoRA and rank 124. +3. Experiments for less-tested methods are more valuable than additional experiments for widely tested methods. +4. Do not edit existing experiments; always create new ones. +5. If you find hyperparameters that work especially well with a given method but are not trivial to discover, consider updating the PEFT documentation of that method so that other users can benefit from your findings. + +### Updating the Training Script + +We provide a training script that includes features typically useful for improving training outcomes, such as AMP support, a cosine learning rate schedule, etc. However, there is always room for improvement. For example, at the time of writing, the script does not support gradient accumulation. Therefore, PRs that extend the training script are welcome. + +#### How to Update the Training Script + +Follow the same process as when contributing to PEFT in general (see the [contribution guidelines](https://huggingface.co/docs/peft/developer_guides/contributing)). If the same training script is used across multiple datasets, please ensure that all relevant scripts are updated accordingly. + +#### Considerations When Updating the Training Script + +1. Updates should be backward-compatible. By default, any new features should be disabled to ensure that existing results remain valid. For example, if you add gradient accumulation, ensure it is disabled by default so that new experiments must opt in. +2. Before adding a bug fix that could invalidate existing results, consider whether the trade-off is worthwhile. If we already have many experimental results, rerunning all of them can be expensive. If the bug fix is not critical, it may not be worth invalidating previous results.
However, if you discover a significant bug that could meaningfully impact outcomes, it should be addressed. +3. Avoid unnecessary complexity. While we could add support for DeepSpeed, FSDP, etc., doing so would add significant complexity, exclude users with limited hardware, and is unlikely to alter the relative performance of different PEFT methods. +4. Minimize reliance on specific training frameworks. For example, we deliberately avoid using the `Trainer` class from transformers or PyTorch Lightning. This ensures transparency, making it easier to understand the training process and replicate results over time. If a training framework were used, we would have to pin the version or risk future incompatibilities. + +### Adding a New Dataset + +Adding a new dataset increases the breadth and usefulness of the PEFT method comparison. The goal is not necessarily to outperform benchmarks or replicate paper results, but to fairly compare different PEFT methods in a way that is useful for PEFT users. If this involves replicating an experiment from a paper, that is great, but it is not a requirement. + +#### How to Add a New Dataset + +The easiest way to add support for a new dataset is to copy an existing setup, such as `method_comparison/MetaMathQA`, rename it, and modify `data.py`, as well as any other necessary parts of the code. Ideally, as much existing code as possible should be reused. The general folder structure and experiment logging format should remain consistent. + +After adding the dataset, ensure it functions correctly and produces meaningful results by running at least one experimental setup, such as using LoRA with default settings. + +#### Considerations When Adding a New Dataset + +1. Before beginning, it is best to open an [issue on PEFT](https://github.com/huggingface/peft/issues) to share your plans. This allows for early feedback and prevents wasted effort on impractical ideas. +2. 
The most valuable new datasets are those that test different capabilities than those already present. Bonus points if the task is similar to what users may face in the real world. Task ideas that would be great to add: + - A task involving both language and image modalities. + - An image generation task (like stable diffusion) + - A task involving audio (like whisper) + - A task that requires knowledge preservation (checked, for instance, via an auxiliary test set) + - Learning something completely new (e.g. a new language) + - A reinforcement learning task (e.g. using [trl](https://github.com/huggingface/trl)) +3. Training should be reasonably fast. Running dozens of experiments is impractical if each one takes multiple days and incurs high costs. Ideally, training should take a few hours at most on high-end consumer hardware. +4. The chosen base model should not be too large, to avoid VRAM constraints. Moreover, if the base model is too powerful, there is little room for improvement through further fine-tuning. +5. Test scores should be informative and have a broad range: + - Besides loss, there should ideally be at least one additional metric, such as accuracy. + - Comparisons are not meaningful if all methods score near 0% or near 100%. The dataset should yield a range of scores to facilitate meaningful differentiation between methods. +6. The dataset should be publicly available and have a track record as a useful dataset. The license should permit the intended usage. + +## Result dashboard + +For convenience, we included a [Gradio](https://www.gradio.app/) app that shows the results of the experiments. It allows you to filter down the task and base model and show the experiment results for this selection. Give it a try [here](https://huggingface.co/spaces/peft-internal-testing/PEFT-method-comparison). + +### Local deployment + +This app requires additional packages to be installed. Please install the packages listed in `requirements-app.txt`, e.g.
via: + +```sh +python -m pip install -r requirements-app.txt +``` + +To launch the demo, run: + +```sh +python app.py +``` diff --git a/peft/method_comparison/__init__.py b/peft/method_comparison/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/peft/method_comparison/app.py b/peft/method_comparison/app.py new file mode 100644 index 0000000000000000000000000000000000000000..96444b6c155091ba2eee0da73cc0b7142cf6f4f7 --- /dev/null +++ b/peft/method_comparison/app.py @@ -0,0 +1,379 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Gradio app to show the results"""

import os
import tempfile

import gradio as gr
import plotly.express as px
import plotly.graph_objects as go
from processing import load_df
from sanitizer import parse_and_filter


# For every metric offered in the Pareto plot: is a "lower" or a "higher" value the better one?
metric_preferences = {
    "accelerator_memory_reserved_avg": "lower",
    "accelerator_memory_max": "lower",
    "accelerator_memory_reserved_99th": "lower",
    "total_time": "lower",
    "train_time": "lower",
    "file_size": "lower",
    "test_accuracy": "higher",
    "train_loss": "lower",
    "num_trainable_params": "lower",
}


def get_model_ids(task_name, df):
    """Return the sorted unique ``model_id`` values of all rows whose ``task_name`` matches."""
    filtered = df[df["task_name"] == task_name]
    return sorted(filtered["model_id"].unique())


def filter_data(task_name, model_id, df):
    """Return the rows of ``df`` that match both ``task_name`` and ``model_id``."""
    filtered = df[(df["task_name"] == task_name) & (df["model_id"] == model_id)]
    return filtered


def compute_pareto_frontier(df, metric_x, metric_y):
    """Compute the Pareto frontier for two selected metrics.

    A row is on the frontier if no other row is at least as good on both metrics and strictly better on at least
    one of them. Whether lower or higher is better is looked up in ``metric_preferences``.

    Args:
        df: DataFrame containing the columns ``metric_x`` and ``metric_y``.
        metric_x: Name of the first metric (must be a key of ``metric_preferences``).
        metric_y: Name of the second metric (must be a key of ``metric_preferences``).

    Returns:
        The subset of rows of ``df`` that are not dominated; ``df`` itself if it is empty.
    """
    if df.empty:
        return df

    df = df.copy()
    points = df[[metric_x, metric_y]].values
    selected_indices = []

    def is_dominated_by(a, b):
        """Return True if point ``b`` dominates point ``a`` (as good on both metrics, better on at least one)."""
        if metric_preferences[metric_x] == "higher":
            cond_x = b[0] >= a[0]
            better_x = b[0] > a[0]
        else:
            cond_x = b[0] <= a[0]
            better_x = b[0] < a[0]
        if metric_preferences[metric_y] == "higher":
            cond_y = b[1] >= a[1]
            better_y = b[1] > a[1]
        else:
            cond_y = b[1] <= a[1]
            better_y = b[1] < a[1]
        return cond_x and cond_y and (better_x or better_y)

    for i, point in enumerate(points):
        dominated = any(is_dominated_by(point, other_point) for j, other_point in enumerate(points) if i != j)
        if not dominated:
            selected_indices.append(i)
    pareto_df = df.iloc[selected_indices]
    return pareto_df


def generate_pareto_plot(df, metric_x, metric_y):
    """Build the Plotly figure showing frontier points (colored), dominated points (gray) and the frontier line.

    Returns an empty dict when ``df`` is empty.
    """
    if df.empty:
        return {}

    # Compute Pareto frontier and non-frontier points.
    pareto_df = compute_pareto_frontier(df, metric_x, metric_y)
    non_pareto_df = df.drop(pareto_df.index)

    # Create an empty figure.
    fig = go.Figure()

    # Draw the line connecting Pareto frontier points.
    if not pareto_df.empty:
        # Sort the Pareto frontier points by metric_x for a meaningful connection.
        pareto_sorted = pareto_df.sort_values(by=metric_x)
        line_trace = go.Scatter(
            x=pareto_sorted[metric_x],
            y=pareto_sorted[metric_y],
            mode="lines",
            line={"color": "rgba(0,0,255,0.3)", "width": 4},
            name="Pareto Frontier",
        )
        fig.add_trace(line_trace)

    # Add non-frontier points in gray with semi-transparency.
    if not non_pareto_df.empty:
        non_frontier_trace = go.Scatter(
            x=non_pareto_df[metric_x],
            y=non_pareto_df[metric_y],
            mode="markers",
            marker={"color": "rgba(128,128,128,0.5)", "size": 12},
            hoverinfo="text",
            # "<br>" is the line break understood by Plotly hover labels
            text=non_pareto_df.apply(
                lambda row: f"experiment_name: {row['experiment_name']}<br>"
                f"peft_type: {row['peft_type']}<br>"
                f"{metric_x}: {row[metric_x]}<br>"
                f"{metric_y}: {row[metric_y]}",
                axis=1,
            ),
            showlegend=False,
        )
        fig.add_trace(non_frontier_trace)

    # Add Pareto frontier points with legend
    if not pareto_df.empty:
        pareto_scatter = px.scatter(
            pareto_df,
            x=metric_x,
            y=metric_y,
            color="experiment_name",
            hover_data={"experiment_name": True, "peft_type": True, metric_x: True, metric_y: True},
        )
        for trace in pareto_scatter.data:
            trace.marker = {"size": 12}
            fig.add_trace(trace)

    # Update layout with axes labels.
    fig.update_layout(
        title=f"Pareto Frontier for {metric_x} vs {metric_y}",
        template="seaborn",
        height=700,
        autosize=True,
        xaxis_title=metric_x,
        yaxis_title=metric_y,
    )

    return fig


def compute_pareto_summary(filtered, pareto_df, metric_x, metric_y):
    """Return a text summary (min/max/mean of both metrics and point counts) for the summary box."""
    if filtered.empty:
        return "No data available."

    stats = filtered[[metric_x, metric_y]].agg(["min", "max", "mean"]).to_string()
    total_points = len(filtered)
    pareto_points = len(pareto_df)
    excluded_points = total_points - pareto_points
    summary_text = (
        f"{stats}\n\n"
        f"Total points: {total_points}\n"
        f"Pareto frontier points: {pareto_points}\n"
        f"Excluded points: {excluded_points}"
    )
    return summary_text


def export_csv(df):
    """Write ``df`` to a temporary CSV file and return its path, or None if ``df`` is empty."""
    if df.empty:
        return None
    csv_data = df.to_csv(index=False)
    # delete=False: the file has to outlive this function so that it can be served for download
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode="w", encoding="utf-8") as tmp:
        tmp.write(csv_data)
        tmp_path = tmp.name
    return tmp_path


def format_df(df):
    """Return a pandas Styler that renders numbers with 3 decimals and a thousands separator."""
    return df.style.format(precision=3, thousands=",", decimal=".")


def build_app(df):
    """Assemble the Gradio Blocks app (task/model selection, results table, filter, Pareto plot, CSV export)."""
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# PEFT method comparison")
        gr.Markdown(
            "Find more information [on the PEFT GitHub repo](https://github.com/huggingface/peft/tree/main/method_comparison)"
        )

        # Hidden state to store the current filter query.
        filter_state = gr.State("")

        gr.Markdown("## Choose the task and base model")
        task_names = sorted(df["task_name"].unique())
        with gr.Row():
            task_dropdown = gr.Dropdown(
                label="Select Task",
                choices=task_names,
                value=task_names[0],
            )
            model_dropdown = gr.Dropdown(label="Select Model ID", choices=get_model_ids(task_names[0], df))

        # Make dataframe columns all equal in width so that they are good enough for numbers but don't
        # get hugely extended by columns like `train_config`.
        column_widths = ["150px" for _ in df.columns]
        column2index = dict(zip(df.columns, range(len(df.columns))))
        column_widths[column2index['experiment_name']] = '300px'

        data_table = gr.DataFrame(
            label="Results",
            value=format_df(df),
            interactive=False,
            max_chars=100,
            wrap=False,
            column_widths=column_widths,
        )

        with gr.Row():
            filter_textbox = gr.Textbox(
                label="Filter DataFrame",
                placeholder="Enter filter (e.g.: peft_type=='LORA')",
                interactive=True,
            )
            apply_filter_button = gr.Button("Apply Filter")
            reset_filter_button = gr.Button("Reset Filter")

        gr.Markdown("## Pareto plot")
        gr.Markdown(
            "Select 2 criteria to plot the Pareto frontier. This will show the best PEFT methods along this axis and "
            "the trade-offs with the other axis. The PEFT methods that Pareto-dominate are shown in colors. All other "
            "methods are inferior with regard to these two metrics. Hover over a point to show details."
        )

        with gr.Row():
            x_default = (
                "accelerator_memory_max"
                if "accelerator_memory_max" in metric_preferences
                else list(metric_preferences.keys())[0]
            )
            y_default = (
                "test_accuracy" if "test_accuracy" in metric_preferences else list(metric_preferences.keys())[1]
            )
            metric_x_dropdown = gr.Dropdown(
                label="1st metric for Pareto plot",
                choices=list(metric_preferences.keys()),
                value=x_default,
            )
            metric_y_dropdown = gr.Dropdown(
                label="2nd metric for Pareto plot",
                choices=list(metric_preferences.keys()),
                value=y_default,
            )

        pareto_plot = gr.Plot(label="Pareto Frontier Plot")
        summary_box = gr.Textbox(label="Summary Statistics", lines=6)
        csv_output = gr.File(label="Export Filtered Data as CSV")

        def update_on_task(task_name, current_filter):
            new_models = get_model_ids(task_name, df)
            filtered = filter_data(task_name, new_models[0] if new_models else "", df)
            if current_filter.strip():
                try:
                    mask = parse_and_filter(filtered, current_filter)
                    df_queried = filtered[mask]
                    # only narrow the table if the filter leaves something to show for the new task
                    if not df_queried.empty:
                        filtered = df_queried
                except Exception:
                    # invalid filter query
                    pass
            return gr.update(choices=new_models, value=new_models[0] if new_models else None), format_df(filtered)

        task_dropdown.change(
            fn=update_on_task, inputs=[task_dropdown, filter_state], outputs=[model_dropdown, data_table]
        )

        def update_on_model(task_name, model_id, current_filter):
            filtered = filter_data(task_name, model_id, df)
            if current_filter.strip():
                try:
                    mask = parse_and_filter(filtered, current_filter)
                    filtered = filtered[mask]
                except Exception:
                    # invalid filter query: show the unfiltered selection
                    pass
            return format_df(filtered)

        model_dropdown.change(
            fn=update_on_model, inputs=[task_dropdown, model_dropdown, filter_state], outputs=data_table
        )

        def update_pareto_plot_and_summary(task_name, model_id, metric_x, metric_y, current_filter):
            filtered = filter_data(task_name, model_id, df)
            if current_filter.strip():
                try:
                    mask = parse_and_filter(filtered, current_filter)
                    filtered = filtered[mask]
                except Exception as e:
                    return generate_pareto_plot(filtered, metric_x, metric_y), f"Filter error: {e}"

            pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y)
            fig = generate_pareto_plot(filtered, metric_x, metric_y)
            summary = compute_pareto_summary(filtered, pareto_df, metric_x, metric_y)
            return fig, summary

        for comp in [model_dropdown, metric_x_dropdown, metric_y_dropdown]:
            comp.change(
                fn=update_pareto_plot_and_summary,
                inputs=[task_dropdown, model_dropdown, metric_x_dropdown, metric_y_dropdown, filter_state],
                outputs=[pareto_plot, summary_box],
            )

        def apply_filter(filter_query, task_name, model_id, metric_x, metric_y):
            filtered = filter_data(task_name, model_id, df)
            if filter_query.strip():
                try:
                    mask = parse_and_filter(filtered, filter_query)
                    filtered = filtered[mask]
                except Exception as e:
                    # Update the table, plot, and summary even if there is a filter error. The table gets the same
                    # formatted view as on every other code path.
                    return (
                        filter_query,
                        format_df(filtered),
                        generate_pareto_plot(filtered, metric_x, metric_y),
                        f"Filter error: {e}",
                    )

            pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y)
            fig = generate_pareto_plot(filtered, metric_x, metric_y)
            summary = compute_pareto_summary(filtered, pareto_df, metric_x, metric_y)
            return filter_query, format_df(filtered), fig, summary

        apply_filter_button.click(
            fn=apply_filter,
            inputs=[filter_textbox, task_dropdown, model_dropdown, metric_x_dropdown, metric_y_dropdown],
            outputs=[filter_state, data_table, pareto_plot, summary_box],
        )

        def reset_filter(task_name, model_id, metric_x, metric_y):
            filtered = filter_data(task_name, model_id, df)
            pareto_df = compute_pareto_frontier(filtered, metric_x, metric_y)
            fig = generate_pareto_plot(filtered, metric_x, metric_y)
            summary = compute_pareto_summary(filtered, pareto_df, metric_x, metric_y)
            # Return empty strings to clear the filter state and textbox.
            return "", "", format_df(filtered), fig, summary

        reset_filter_button.click(
            fn=reset_filter,
            inputs=[task_dropdown, model_dropdown, metric_x_dropdown, metric_y_dropdown],
            outputs=[filter_state, filter_textbox, data_table, pareto_plot, summary_box],
        )

        gr.Markdown("## Export data")
        # Export button for CSV download.
        # NOTE(review): the export only applies the task/model selection, not the text filter - confirm intended.
        export_button = gr.Button("Export Filtered Data")
        export_button.click(
            fn=lambda task, model: export_csv(filter_data(task, model, df)),
            inputs=[task_dropdown, model_dropdown],
            outputs=csv_output,
        )

        demo.load(
            fn=update_pareto_plot_and_summary,
            inputs=[task_dropdown, model_dropdown, metric_x_dropdown, metric_y_dropdown, filter_state],
            outputs=[pareto_plot, summary_box],
        )

    return demo


path = os.path.join(os.path.dirname(__file__), "MetaMathQA", "results")
df = load_df(path, task_name="MetaMathQA")
demo = build_app(df)
demo.launch()
"""Data processing used for analyzing and presenting the results"""

import json
import os

import pandas as pd


def preprocess(rows, task_name: str, print_fn=print):
    """Flatten raw experiment logs into one dict per successful run.

    Args:
        rows: Iterable of parsed result JSONs, each with the keys ``run_info``, ``train_info`` and ``meta_info``.
        task_name: Name of the task, stored in every resulting entry.
        print_fn: Callable used to report how many entries were skipped.

    Returns:
        A list of flat dicts; entries whose train status is not ``"success"`` are skipped.
    """
    results = []
    skipped = 0
    for row in rows:
        run_info = row["run_info"]
        train_info = row["train_info"]
        meta_info = row["meta_info"]
        if train_info["status"] != "success":
            skipped += 1
            continue

        # runs without a PEFT config are full fine-tuning runs
        if run_info["peft_config"]:
            peft_type = run_info["peft_config"]["peft_type"]
        else:
            peft_type = "full-finetuning"

        # only the last logged metrics entry is reported
        train_metrics = train_info["metrics"][-1]
        package_info = meta_info["package_info"]

        # extract the fields that make most sense
        dct = {
            "task_name": task_name,
            "experiment_name": run_info["experiment_name"],
            "model_id": run_info["train_config"]["model_id"],
            "train_config": run_info["train_config"],
            "peft_type": peft_type,
            "peft_config": run_info["peft_config"],
            "accelerator_memory_reserved_avg": train_info["accelerator_memory_reserved_avg"],
            "accelerator_memory_max": train_info["accelerator_memory_max"],
            "accelerator_memory_reserved_99th": train_info["accelerator_memory_reserved_99th"],
            "total_time": run_info["total_time"],
            "train_time": train_info["train_time"],
            "file_size": train_info["file_size"],
            "num_trainable_params": train_info["num_trainable_params"],
            "test_accuracy": train_metrics["test accuracy"],
            "train_loss": train_metrics["train loss"],
            "train_samples": train_metrics["train samples"],
            "train_total_tokens": train_metrics["train total tokens"],
            "peft_version": package_info["peft-version"],
            "peft_branch": run_info["peft_branch"],
            "transformers_version": package_info["transformers-version"],
            "datasets_version": package_info["datasets-version"],
            "torch_version": package_info["torch-version"],
            "bitsandbytes_version": package_info["bitsandbytes-version"],
            "package_info": package_info,
            "system_info": meta_info["system_info"],
            "created_at": run_info["created_at"],
        }
        results.append(dct)

    if skipped:
        print_fn(f"Skipped {skipped} of {len(rows)} entries because the train status != success")

    return results


def load_jsons(path):
    """Load every ``*.json`` file directly inside ``path`` and return the parsed objects.

    Files are read in sorted name order so that the result does not depend on the file system's listing order.
    """
    results = []
    for fn in sorted(os.listdir(path)):
        if fn.endswith(".json"):
            with open(os.path.join(path, fn), encoding="utf-8") as f:
                results.append(json.load(f))
    return results


def load_df(path, task_name, print_fn=print):
    """Load all result JSONs from ``path`` into a typed DataFrame with the most relevant columns first.

    Only the most recent run (by ``created_at``) is kept per experiment name, model id and PEFT type. If there is
    no successful run, an empty DataFrame with the expected columns is returned.
    """
    jsons = load_jsons(path)
    preprocessed = preprocess(jsons, task_name=task_name, print_fn=print_fn)
    dtype_dict = {
        "task_name": "string",
        "experiment_name": "string",
        "model_id": "string",
        "train_config": "string",
        "peft_type": "string",
        "peft_config": "string",
        "accelerator_memory_reserved_avg": int,
        "accelerator_memory_max": int,
        "accelerator_memory_reserved_99th": int,
        "total_time": float,
        "train_time": float,
        "file_size": int,
        "test_accuracy": float,
        "train_loss": float,
        "train_samples": int,
        "train_total_tokens": int,
        "num_trainable_params": int,
        "peft_version": "string",
        "peft_branch": "string",
        "transformers_version": "string",
        "datasets_version": "string",
        "torch_version": "string",
        "bitsandbytes_version": "string",
        "package_info": "string",
        "system_info": "string",
        "created_at": "string",
    }
    # passing the columns explicitly keeps `astype` working when there are no rows at all
    df = pd.DataFrame(preprocessed, columns=list(dtype_dict))
    df = df.astype(dtype_dict)
    df["created_at"] = pd.to_datetime(df["created_at"])
    # round training time to nearest second
    df["train_time"] = df["train_time"].round().astype(int)
    df["total_time"] = df["total_time"].round().astype(int)

    # reorder columns for better viewing, pinned_columns arg in Gradio seems not to work correctly
    important_columns = [
        "experiment_name",
        "peft_type",
        "total_time",
        "train_time",
        "test_accuracy",
        "train_loss",
        "accelerator_memory_max",
        "accelerator_memory_reserved_99th",
        "accelerator_memory_reserved_avg",
        "num_trainable_params",
        "file_size",
        "created_at",
        "task_name",
    ]
    other_columns = [col for col in df if col not in important_columns]
    df = df[important_columns + other_columns]

    # we want to keep only the most recent run for each experiment; `created_at` must not be part of the
    # duplicate key, otherwise runs with different timestamps are never considered duplicates
    columns = ["experiment_name", "model_id", "peft_type"]
    df = df.sort_values("created_at").drop_duplicates(columns, keep="last")
    return df
import ast

import pandas as pd


# Comparison operators allowed in a filter expression, mapped to the pandas operation that builds the mask.
_COMPARISON_OPS = {
    ast.Gt: lambda series, value: series > value,
    ast.GtE: lambda series, value: series >= value,
    ast.Lt: lambda series, value: series < value,
    ast.LtE: lambda series, value: series <= value,
    ast.Eq: lambda series, value: series == value,
    ast.NotEq: lambda series, value: series != value,
    ast.In: lambda series, value: series.isin(value),
    ast.NotIn: lambda series, value: ~series.isin(value),
}


def _evaluate_comparison(df, node):
    """Evaluate a single ``<column> <op> <literal>`` comparison node into a boolean mask."""
    if not isinstance(node.left, ast.Name):
        raise ValueError("Left side of comparison must be a column name.")
    col = node.left.id
    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in DataFrame.")

    if len(node.ops) > 1:
        raise ValueError("Chained comparisons like '10 < price < 100' are not supported.")

    op_type = type(node.ops[0])
    if op_type not in _COMPARISON_OPS:
        raise ValueError(f"Unsupported operator '{op_type.__name__}'.")

    try:
        # literal_eval only accepts literals, so names, calls, attribute access etc. are rejected here
        value = ast.literal_eval(node.comparators[0])
    except (ValueError, TypeError) as exc:
        raise ValueError("Right side of comparison must be a literal (number, string, list).") from exc

    # `isin` raises a TypeError for scalars; report it as an invalid filter instead
    if op_type in (ast.In, ast.NotIn) and not isinstance(value, (list, tuple, set, dict)):
        raise ValueError("Right side of 'in' / 'not in' must be a list, tuple or set.")

    return _COMPARISON_OPS[op_type](df[col], value)


def _evaluate_node(df, node):
    """
    Recursively evaluates an AST node to generate a pandas boolean mask.

    Supported nodes are comparisons of a column with a literal, ``and`` / ``or`` (also spelled ``&`` / ``|``) and
    ``not``.

    Raises:
        ValueError: If the node is any other kind of expression or uses an unsupported operator.
    """
    # Base Case: A simple comparison like 'price > 100'
    if isinstance(node, ast.Compare):
        return _evaluate_comparison(df, node)

    # Recursive Step: "Bitwise" operation & and | (the same as boolean operations)
    if isinstance(node, ast.BinOp):
        if isinstance(node.op, ast.BitOr):
            return _evaluate_node(df, node.left) | _evaluate_node(df, node.right)
        if isinstance(node.op, ast.BitAnd):
            return _evaluate_node(df, node.left) & _evaluate_node(df, node.right)
        # any other binary operator (+, -, ...) must not silently evaluate to None
        raise ValueError(f"Unsupported operator '{type(node.op).__name__}'.")

    # Recursive Step: A boolean operation like '... and ...' or '... or ...'
    if isinstance(node, ast.BoolOp):
        is_and = isinstance(node.op, ast.And)
        result = _evaluate_node(df, node.values[0])
        for value_node in node.values[1:]:
            mask = _evaluate_node(df, value_node)
            result = (result & mask) if is_and else (result | mask)
        return result

    if isinstance(node, ast.UnaryOp):
        if not isinstance(node.op, ast.Not):
            raise ValueError("Only supported unary op is negation.")
        return ~_evaluate_node(df, node.operand)

    # If the node is not a comparison or boolean op, it's an unsupported expression type
    raise ValueError(f"Unsupported expression type: {type(node).__name__}")


def parse_and_filter(df, filter_str):
    """
    Filters a pandas DataFrame using a string expression parsed by AST.
    This is done to avoid the security vulnerabilities that `DataFrame.query`
    brings (arbitrary code execution).

    Args:
        df (pd.DataFrame): The DataFrame to filter.
        filter_str (str): A string representing a filter expression.
            e.g., "price > 100 and stock < 50"
            Supported operators: >, >=, <, <=, ==, !=, in, not in, and, or, not, &, |.
            An empty or whitespace-only string selects every row.

    Returns:
        pd.Series: A boolean Series representing the filter mask.

    Raises:
        ValueError: If the expression cannot be parsed or uses an unsupported construct.
    """
    if not filter_str or not filter_str.strip():
        return pd.Series(True, index=df.index, dtype=bool)

    try:
        # 'eval' mode ensures the source is a single expression. Surrounding whitespace is removed first because
        # leading whitespace would be reported as an indentation error.
        tree = ast.parse(filter_str.strip(), mode="eval")
    except (SyntaxError, ValueError) as e:
        raise ValueError(f"Invalid filter syntax: {e}") from e

    # The recursive evaluation starts here
    return _evaluate_node(df, tree.body)
import pandas as pd
import pytest

from .sanitizer import parse_and_filter


@pytest.fixture
def df_products():
    """Small product table used by all filter tests."""
    data = {
        'product_id': [101, 102, 103, 104, 105, 106],
        'category': ['Electronics', 'Books', 'Electronics', 'Home Goods', 'Books', 'Electronics'],
        'price': [799.99, 19.99, 49.50, 120.00, 24.99, 150.00],
        'stock': [15, 300, 50, 25, 150, 0]
    }
    return pd.DataFrame(data)


def test_exploit_fails(df_products):
    # `match` checks the message of the raised exception. An assert placed inside the `raises` block after the
    # raising call would never be executed.
    with pytest.raises(ValueError, match="Invalid filter syntax"):
        parse_and_filter(df_products, """price < 50 and @os.system("/bin/echo password")""")


@pytest.mark.parametrize('expression,ids', [
    ("price < 50", [102, 103, 105]),
    ("product_id in [101, 102]", [101, 102]),
    ("price < 50 and category == 'Electronics'", [103]),
    ("stock < 100 or category == 'Home Goods'", [101, 103, 104, 106]),
    ("(price > 100 and stock < 20) or category == 'Books'", [101, 102, 105, 106]),
    ("not (price > 50 or stock > 100)", [103]),
    ("not price > 50", [102, 103, 105]),
    ("(price < 50) & (category == 'Electronics')", [103]),
    ("(stock < 100) | (category == 'Home Goods')", [101, 103, 104, 106]),
])
def test_operations(df_products, expression, ids):
    mask = parse_and_filter(df_products, expression)
    assert sorted(df_products[mask].product_id) == sorted(ids)
b/peft/method_comparison/text_generation_benchmark/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9f727fbf7276fa42501d65fca364b6201d1e0c57 --- /dev/null +++ b/peft/method_comparison/text_generation_benchmark/README.md @@ -0,0 +1,179 @@ +## Base Model Inference Caching + +The benchmarking suite uses a separate script, `run_base.py`, to measure base model inference times and save results for reuse. This should be run once per model configuration to avoid redundant computations and ensure consistent baseline metrics for all PEFT experiments. + +**Usage:** +```bash +python run_base.py +``` +This will cache the base model inference results for the specified configuration. Subsequent runs of `run.py` will automatically load these cached results. + +# PEFT Benchmarking Suite + +This directory contains a comprehensive benchmarking framework for Parameter-Efficient Fine-Tuning (PEFT) methods. For the task of text generation, the suite measures inference performance, memory usage, and other key metrics across different PEFT configurations. + +## Overview + +The benchmarking suite provides: +- **Inference time measurement** across different prompt categories +- **Memory usage during inference** (RAM and GPU) +- **Parameter efficiency metrics** (trainable vs total parameters) +- **Time per token analysis** for fair comparison across different generation lengths +- **Structured result logging** with detailed metadata + +## Architecture + +The suite follows a clean separation between: +1. **Default benchmark configuration** - shared settings for consistent comparison +2. **Individual adapter configurations** - PEFT-specific parameters for each experiment + +This ensures that all experiments are comparable while allowing flexibility in adapter parameters. 
+ +## Quick Start + +### Running a Single Experiment + +```bash +# From the text_generation_benchmark directory +python run.py experiments/lora/lora_r8 --verbose +``` + +## Configuration Structure + +The benchmarking suite uses a hierarchical configuration system: + +1. **Default benchmark parameters** (`default_benchmark_params.json`) - Base configuration shared by all experiments +2. **Experiment-specific overrides** (`benchmark_params.json` in each experiment) - Optional overrides for specific experiments +3. **Adapter configuration** (`adapter_config.json` in each experiment) - PEFT method parameters + +This structure ensures consistent comparison while allowing flexibility where needed. + +### Default Configuration (`default_benchmark_params.json`) + +Contains shared benchmark settings that apply to all experiments. Here are the key configuration fields: + +- `model_id`: The Hugging Face model ID to use as the base model (e.g., "facebook/opt-350m") +- `dtype`: Model precision ("float16", "float32", or "bfloat16") +- `seed`: Random seed for reproducibility +- `max_new_tokens`: Maximum number of tokens to generate during inference +- `num_inference_runs`: Number of inference runs per prompt for statistical reliability +- `use_4bit`: Whether to use 4-bit quantization (bool) +- `use_8bit`: Whether to use 8-bit quantization (bool) + +Each experiment can override these settings by providing its own `benchmark_params.json` file. + +### Experiment Structure + +Each experiment directory should contain: + +1. `adapter_config.json`: PEFT adapter configuration. For details on available parameters and their meanings, refer to the [PEFT documentation](https://huggingface.co/docs/peft/main/en/developer_guides/adapters). + +2. (Optional) `benchmark_params.json`: Override specific benchmark parameters for this experiment. 
+ +Example directory structure: +``` +experiments/ +└── lora/ + ├── lora_r8/ # LoRA rank 8 experiment + │ ├── adapter_config.json # PEFT adapter configuration + │ └── benchmark_params.json # Optional benchmark overrides + └── lora_r16/ # LoRA rank 16 experiment + └── adapter_config.json +``` + +### Experiment-Specific Overrides Example + +If an experiment needs different benchmark settings, create `benchmark_params.json`: +```json +{ + "_comment": "Override settings for this specific experiment", + "max_new_tokens": 50, + "num_inference_runs": 15, + "num_prompt_samples": 2 +} +``` + +These parameters will override the defaults from `default_benchmark_params.json`. However, the defaults should generally not be changed to keep the results from the individual experiments comparable. + +### Create a New Experiment Adapter Configuration + +To create a new experiment, follow these steps: + +1. **Create the experiment directory** + ```bash + mkdir -p experiments/lora/lora_r8 + ``` + +2. **Generate the adapter configuration programmatically** + Use the PEFT library to create and save your adapter config: + + ```python + from peft import LoraConfig + + config = LoraConfig( + lora_alpha=16, + lora_dropout=0.1, + r=8, + target_modules=["q_proj", "v_proj"], + task_type="CAUSAL_LM" + ) + config.save_pretrained("experiments/lora/lora_r8") + ``` + + This will create an `adapter_config.json` in your experiment directory. Adjust parameters as needed for your experiment. + +3. **(Optional) Add benchmark overrides** + If you need to override default benchmark settings, create a `benchmark_params.json` in the same directory. + +4. 
**Run the benchmark** + ```bash + python run.py experiments/lora/lora_r8 --verbose + ``` + +## Prompt Categories + +The benchmark automatically runs across all prompt categories for consistent comparison: +- **short** - Brief prompts (1-2 sentences) +- **medium** - Moderate length prompts (paragraph-level) +- **long** - Extended prompts (multiple paragraphs) + +Results are tracked separately for each category, allowing analysis of how different PEFT methods perform across varying input lengths. + +## Results Structure + +Results are saved in a structured JSON format with three main sections: + +### `run_info` +- Execution metadata (timestamp, duration, status) +- Hardware information (GPU type, CUDA version, etc.) +- Error information (if applicable) +- PEFT and benchmark configurations + +### `generation_info` +- Memory usage logs at different stages +- Per-category metrics (inference time, time per token, etc.) +- Overall aggregated metrics +- Individual sample results for detailed analysis + +### `meta_info` +- Model information (ID, PEFT method) +- Parameter counts (adapter, total, ratio) +- Model size information (base model, adapter) +- System and package information + +## Key Metrics + +### Inference Performance +- **Inference Time**: Total time for generation per category +- **Time Per Token**: Normalized time accounting for different generation lengths +- **Inference Overhead**: Percentage increase compared to base model + +### Memory Usage +- **Peak GPU Memory**: Maximum GPU memory during benchmark +- **Peak RAM Memory**: Maximum RAM usage +- **Memory Logs**: Detailed tracking at each stage + +### Parameter Efficiency +- **Adapter Parameters**: Number of parameters in the PEFT adapter +- **Parameter Ratio**: Percentage of total model parameters that are in the adapter +- **Adapter Size**: Memory footprint of the adapter in MB diff --git a/peft/method_comparison/text_generation_benchmark/cancelled_results/.gitkeep 
b/peft/method_comparison/text_generation_benchmark/cancelled_results/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/peft/method_comparison/text_generation_benchmark/configs/prompts.json b/peft/method_comparison/text_generation_benchmark/configs/prompts.json new file mode 100644 index 0000000000000000000000000000000000000000..7768b420a0ff49511be9e689e659fb2c97022207 --- /dev/null +++ b/peft/method_comparison/text_generation_benchmark/configs/prompts.json @@ -0,0 +1,23 @@ +{ + "short": [ + "Explain quantum computing in one paragraph.", + "Write a haiku about machine learning.", + "What's the difference between supervised and unsupervised learning?", + "Define parameter-efficient fine-tuning in one sentence.", + "List three applications of natural language processing." + ], + "medium": [ + "Explain the concept of low-rank adaptation (LoRA) for large language models. Include its benefits and limitations.", + "Compare and contrast prompt tuning and prefix tuning approaches for adapting large language models.", + "What are the key differences between full fine-tuning and parameter-efficient methods? Explain with examples.", + "Describe the process of quantization for neural networks and how it affects model size and inference speed.", + "Explain how sparse expert models like Mixture of Experts work and their advantages over dense models." + ], + "long": [ + "Analyze the evolution of parameter-efficient fine-tuning methods from 2020 to present. Include a detailed comparison of at least five different approaches, their theoretical foundations, and practical implications for deploying large language models.", + "Provide a comprehensive tutorial on implementing LoRA for a transformer-based language model. 
Include code examples, hyperparameter selection guidance, and best practices for training and deployment.", + "Compare the computational efficiency, parameter count, and performance characteristics of different PEFT methods (LoRA, Prefix Tuning, Prompt Tuning, IA3, AdaLoRA) across various downstream tasks. Include a discussion of when each method is most appropriate.", + "Explain the mathematical foundations of various parameter-efficient fine-tuning techniques. Discuss how each technique modifies the original neural network architecture and the optimization challenges involved.", + "Discuss the ethical implications of parameter-efficient fine-tuning methods in democratizing access to large language models. Include considerations about computational resources, environmental impact, and accessibility for researchers in resource-constrained settings." + ] +} \ No newline at end of file diff --git a/peft/method_comparison/text_generation_benchmark/data.py b/peft/method_comparison/text_generation_benchmark/data.py new file mode 100644 index 0000000000000000000000000000000000000000..ce3343f1ca63a74908395d8789da6d58491efc4f --- /dev/null +++ b/peft/method_comparison/text_generation_benchmark/data.py @@ -0,0 +1,119 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Data handling utilities for PEFT benchmarking. 
DEFAULT_PROMPTS_PATH = os.path.join(os.path.dirname(__file__), "configs", "prompts.json")


def _get_config_value(config, name: str, default):
    """
    Read the setting ``name`` from ``config`` and fall back to ``default``.

    ``config`` may be a plain ``dict`` (e.g. the output of ``BenchmarkConfig.to_dict()``) or an object exposing the
    setting as an attribute. A missing setting and an explicit ``None`` both yield ``default``.
    """
    if isinstance(config, dict):
        value = config.get(name)
    else:
        value = getattr(config, name, None)
    return default if value is None else value


def load_test_prompts(config: "BenchmarkConfig | dict") -> dict[str, list[str]]:
    """
    Load prompts from JSON file.

    Args:
        config: Configuration (dict or attribute-style object) that may name a ``prompts_file``. When it does not,
            ``DEFAULT_PROMPTS_PATH`` is used.

    Returns:
        dictionary with prompts by category
    """
    # ``getattr`` alone never finds keys of a dict, so a dict config used to be silently ignored here.
    prompts_file = _get_config_value(config, "prompts_file", DEFAULT_PROMPTS_PATH)

    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
    with open(prompts_file, encoding="utf-8") as f:
        return json.load(f)


def truncate_prompt_for_model(
    prompt: str,
    tokenizer: "PreTrainedTokenizer",
    max_length: Optional[int] = None,
    reserve_output_tokens: int = 50,
) -> str:
    """
    Truncate a prompt to fit within the model's context window.

    Args:
        prompt: Input prompt
        tokenizer: Model tokenizer
        max_length: Maximum sequence length (if None, uses ``tokenizer.model_max_length``, or 2048 when the
            tokenizer has no such attribute)
        reserve_output_tokens: Number of tokens to reserve for response

    Returns:
        The prompt unchanged when it already fits, otherwise the decoded first
        ``max_length - reserve_output_tokens`` tokens.

    Raises:
        ValueError: If ``reserve_output_tokens`` leaves no room for the prompt.
    """
    if max_length is None:
        max_length = getattr(tokenizer, "model_max_length", 2048)

    max_prompt_length = max_length - reserve_output_tokens
    if max_prompt_length <= 0:
        # A negative bound would turn the slice below into "drop tokens from the end" instead of "keep the first
        # N tokens", which silently yields a prompt of the wrong length.
        raise ValueError(
            f"reserve_output_tokens ({reserve_output_tokens}) must be smaller than max_length ({max_length})"
        )

    input_ids = tokenizer.encode(prompt, return_tensors="pt")[0]
    if len(input_ids) <= max_prompt_length:
        return prompt

    return tokenizer.decode(input_ids[:max_prompt_length], skip_special_tokens=True)


def prepare_benchmark_prompts(
    config: "BenchmarkConfig",
    tokenizer: "PreTrainedTokenizer",
    max_input_length: Optional[int] = None,
    seed: int = 42,
) -> dict[str, list[str]]:
    """
    Prepare prompts for benchmarking, ensuring appropriate length and variety.
    Always returns all prompt categories for consistent benchmarking.

    Args:
        config: Benchmark configuration (attribute-style object or plain dict)
        tokenizer: Model tokenizer
        max_input_length: Maximum input length (overrides model default if provided)
        seed: Random seed (unused; kept for backwards compatibility)

    Returns:
        Dictionary with processed prompts by category (all categories included)
    """
    # The reserve is the same for every prompt, so look it up once.
    reserve_output_tokens = _get_config_value(config, "reserve_output_tokens", 50)

    return {
        category: [
            truncate_prompt_for_model(
                prompt,
                tokenizer,
                max_length=max_input_length,
                reserve_output_tokens=reserve_output_tokens,
            )
            for prompt in prompts
        ]
        for category, prompts in load_test_prompts(config).items()
    }
"lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} diff --git a/peft/method_comparison/text_generation_benchmark/results/.gitkeep b/peft/method_comparison/text_generation_benchmark/results/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/peft/method_comparison/text_generation_benchmark/run.py b/peft/method_comparison/text_generation_benchmark/run.py new file mode 100644 index 0000000000000000000000000000000000000000..1cfba8931f29cf8571a89253859781a16e2203a8 --- /dev/null +++ b/peft/method_comparison/text_generation_benchmark/run.py @@ -0,0 +1,358 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Main entry point to run the experiments. Contains general setup and the proper inference code. 
def _accelerator_available() -> bool:
    """
    Return True if a CUDA or XPU device is available.

    The ``hasattr`` guard keeps this from raising on torch builds that do not expose a ``torch.xpu`` module.
    """
    return torch.cuda.is_available() or (hasattr(torch, "xpu") and torch.xpu.is_available())


def _empty_accelerator_cache() -> None:
    """Empty the CUDA cache, or the XPU cache when CUDA is unavailable."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        torch.xpu.empty_cache()


def _resolve_torch_dtype(dtype_name: str):
    """Map a dtype name from the benchmark config to the torch dtype; raise ``ValueError`` for unknown names."""
    supported = {
        "float32": torch.float32,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
    }
    if dtype_name not in supported:
        raise ValueError(f"Unsupported dtype: {dtype_name}")
    return supported[dtype_name]


def _mean(values):
    """Arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)


def _is_within_directory(path: str, root: str) -> bool:
    """
    Return True if absolute ``path`` is ``root`` itself or lies below it.

    A plain ``str.startswith`` test would also accept siblings that merely share the prefix (``/a/bc`` for root
    ``/a/b``); comparing path components via ``os.path.commonpath`` does not.
    """
    try:
        return os.path.commonpath([root, path]) == root
    except ValueError:
        # commonpath raises for paths that cannot be compared, e.g. different drives on Windows.
        return False


def load_base_results(model_id: str) -> Optional[dict]:
    """
    Load base model results if they exist.

    The file is looked up as ``base_results/base_<model>.json`` next to this script, where ``<model>`` is
    ``model_id`` with ``/`` and ``-`` replaced by ``_``.

    Returns:
        The parsed JSON content, or None when no such file exists.
    """
    base_results_dir = os.path.join(os.path.dirname(__file__), "base_results")
    model_name = model_id.replace("/", "_").replace("-", "_")
    filepath = os.path.join(base_results_dir, f"base_{model_name}.json")

    try:
        with open(filepath, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None


def measure_inference_time(model, tokenizer, prompts, max_new_tokens, num_runs, print_fn, category_generation_params):
    """
    Measure inference time for each prompt category.

    Args:
        model: Object providing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer providing ``pad_token_id``.
        prompts: Mapping of category name to a list of prompt strings.
        max_new_tokens: Number of tokens to generate when the category has no override.
        num_runs: Number of timed ``generate`` calls per prompt (at least 1).
        print_fn: Callable used for progress output.
        category_generation_params: Optional mapping ``{category: {"max_new_tokens": int}}`` of per-category
            overrides; ``None`` means no overrides.

    Returns:
        Dict with the keys ``inference_times``, ``time_per_token``, ``generated_tokens`` (per-category averages
        over prompts) and ``individual_samples`` (per-prompt averages including every individual run). Categories
        without prompts are omitted from all four.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1 (the averages would otherwise divide by zero).
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")
    category_generation_params = category_generation_params or {}

    inference_times = {}
    time_per_token = {}
    generated_tokens = {}
    individual_samples = {}

    for category, category_prompts in prompts.items():
        print_fn(f"\nMeasuring inference time for {category} prompts...")
        # The token budget only depends on the category, not on the prompt.
        cat_max_new_tokens = category_generation_params.get(category, {}).get("max_new_tokens", max_new_tokens)
        category_samples = []

        for prompt in category_prompts:
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            prompt_length = len(inputs["input_ids"][0])
            runs = []

            for _ in range(num_runs):
                start_time = time.perf_counter()
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=cat_max_new_tokens,
                    # min == max forces a fixed generation length so that runs are comparable
                    min_new_tokens=cat_max_new_tokens,
                    pad_token_id=tokenizer.pad_token_id,
                )
                inference_time = time.perf_counter() - start_time

                num_tokens = len(outputs[0]) - prompt_length
                runs.append(
                    {
                        "inference_time": inference_time,
                        "generated_tokens": num_tokens,
                        "time_per_token": inference_time / num_tokens if num_tokens > 0 else 0,
                    }
                )

            category_samples.append(
                {
                    "inference_time": _mean([run["inference_time"] for run in runs]),
                    "generated_tokens": _mean([run["generated_tokens"] for run in runs]),
                    "time_per_token": _mean([run["time_per_token"] for run in runs]),
                    "individual_runs": runs,
                }
            )

        if not category_samples:
            continue

        inference_times[category] = _mean([sample["inference_time"] for sample in category_samples])
        generated_tokens[category] = _mean([sample["generated_tokens"] for sample in category_samples])
        time_per_token[category] = _mean([sample["time_per_token"] for sample in category_samples])
        individual_samples[category] = category_samples

    return {
        "inference_times": inference_times,
        "time_per_token": time_per_token,
        "generated_tokens": generated_tokens,
        "individual_samples": individual_samples,
    }


def run_benchmark(
    benchmark_config: "BenchmarkConfig", experiment_name: str, experiment_path: str, print_fn=print
) -> "BenchmarkResult":
    """
    Run benchmarks for the specified PEFT method configuration.

    Loads the cached base model results, the base model and the PEFT config found in ``experiment_path``, measures
    inference times of the PEFT model and records them, together with parameter counts and memory logs, on the
    returned result. Exceptions raised along the way are not propagated: they set the status to ``FAILED`` and are
    stored as the error message of the run.

    Args:
        benchmark_config: Benchmark settings (model id, dtype, seed, quantization flags, generation settings).
        experiment_name: Name under which the result is recorded.
        experiment_path: Directory containing the PEFT adapter config.
        print_fn: Callable used for progress output.

    Returns:
        The populated ``BenchmarkResult``.
    """
    result = BenchmarkResult(
        experiment_name=experiment_name,
        status=BenchmarkStatus.RUNNING,
        model_id=benchmark_config.model_id,
    )

    result.save()

    start_time = time.perf_counter()
    error_message: Optional[str] = None
    peft_config = None

    try:
        print_fn("Initializing accelerator...")
        init_accelerator()
        set_seed(benchmark_config.seed)

        # Fail fast: the overhead is computed relative to the cached baseline, so check for it before paying for
        # the model load.
        base_results = load_base_results(benchmark_config.model_id)
        if not base_results:
            raise FileNotFoundError(
                "No cached base results found. Please run `python run_base.py` first to generate base model results."
            )
        print_fn("Using cached base model results...")
        base_inference_times = base_results["inference_results"]

        print_fn(f"Loading base model: {benchmark_config.model_id}")
        tokenizer = AutoTokenizer.from_pretrained(benchmark_config.model_id)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        torch_dtype = _resolve_torch_dtype(benchmark_config.dtype)
        model_kwargs = {
            "device_map": "auto" if _accelerator_available() else None,
            "torch_dtype": torch_dtype,
        }

        if benchmark_config.use_8bit:
            model_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True
            )
        elif benchmark_config.use_4bit:
            model_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch_dtype,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
            )

        base_model = AutoModelForCausalLM.from_pretrained(benchmark_config.model_id, **model_kwargs)

        print_fn("Preparing benchmark prompts...")
        prompts = prepare_benchmark_prompts(
            config=benchmark_config,
            tokenizer=tokenizer,
            max_input_length=None,
            seed=benchmark_config.seed,
        )

        try:
            print_fn(f"Loading PEFT config from {experiment_path}")
            peft_config = PeftConfig.from_pretrained(experiment_path)
            print_fn(f"Loaded PEFT config: {peft_config.peft_type}, with parameters: {vars(peft_config)}")
            model = get_peft_model(base_model, peft_config)
        except Exception as exc:
            error_msg = f"Error loading PEFT config: {str(exc)}"
            print_fn(error_msg)
            # Re-raise: continuing would only fail a few lines later on the undefined ``model`` and would record
            # that misleading error instead of the real cause.
            raise RuntimeError(error_msg) from exc
        finally:
            # Drop the local reference to the base model and release cached accelerator memory.
            del base_model
            gc.collect()
            _empty_accelerator_cache()

        ram, accelerator_allocated, accelerator_reserved = get_memory_usage()
        result.add_memory_log("peft_model_loaded", ram, accelerator_allocated, accelerator_reserved)

        # Calculate PEFT model metrics
        trainable_params = model.get_nb_trainable_parameters()[0]
        total_params = sum(p.numel() for p in model.parameters())
        base_params = sum(p.numel() for p in model.base_model.parameters())
        dtype_bytes = 2 if benchmark_config.dtype in ["float16", "bfloat16"] else 4
        adapter_size_mb = trainable_params * dtype_bytes / (1024 * 1024)
        base_model_size_mb = base_params * dtype_bytes / (1024 * 1024)
        param_ratio = trainable_params / total_params if total_params > 0 else 0

        result.update_meta_info(
            param_counts={
                "base_params": base_params,
                "trainable_params": trainable_params,
                "total_params": total_params,
                "param_ratio": param_ratio,
            },
            size_info={"base_model_size_mb": base_model_size_mb, "adapter_size_mb": adapter_size_mb},
            package_info={
                "transformers-version": transformers.__version__,
                "peft-version": peft.__version__,
                "bitsandbytes-version": getattr(bitsandbytes, "__version__", None),
            },
        )

        print_fn("Measuring PEFT model inference times...")
        peft_inference_times = measure_inference_time(
            model,
            tokenizer,
            prompts,
            max_new_tokens=benchmark_config.max_new_tokens,
            num_runs=benchmark_config.num_inference_runs,
            print_fn=print_fn,
            category_generation_params=benchmark_config.category_generation_params,
        )

        # Record metrics for every category that was actually measured. Iterating over the measured categories
        # (rather than over the baseline or the raw prompt dict) avoids a bare KeyError when the two disagree.
        base_times = base_inference_times["inference_times"]
        for category, peft_time in peft_inference_times["inference_times"].items():
            if category not in base_times:
                raise ValueError(
                    f"Cached base results contain no timing for prompt category '{category}'. "
                    "Please re-run `python run_base.py --force`."
                )
            base_time = base_times[category]
            category_metrics = {
                "inference_time": peft_time,
                "base_inference_time": base_time,
                "inference_overhead_pct": (peft_time - base_time) / base_time * 100,
                "time_per_token": peft_inference_times["time_per_token"][category],
                "generated_tokens": peft_inference_times["generated_tokens"][category],
            }
            result.add_metrics_for_category(
                category, category_metrics, individual_samples=peft_inference_times["individual_samples"][category]
            )

        # Take the final memory snapshot first so that the peaks below also cover it.
        ram, accelerator_allocated, accelerator_reserved = get_memory_usage()
        result.add_memory_log("benchmark_complete", ram, accelerator_allocated, accelerator_reserved)

        memory_logs = result.generation_info["memory"]["memory_logs"]
        result.update_generation_info(
            memory_data={
                "peak_accelerator_memory_mb": max((log["accelerator_allocated_mb"] for log in memory_logs), default=0),
                "peak_ram_memory_mb": max((log["ram_mb"] for log in memory_logs), default=0),
            }
        )

        result.status = BenchmarkStatus.SUCCESS

    except Exception as exc:
        print_fn(f"Benchmark failed with error: {exc}")
        result.status = BenchmarkStatus.FAILED
        error_message = str(exc)

    end_time = time.perf_counter()

    peft_config_dict = peft_config.to_dict() if peft_config is not None else None
    if peft_config_dict:
        # sets are not JSON serializable, store them as lists
        peft_config_dict = {
            key: list(value) if isinstance(value, set) else value for key, value in peft_config_dict.items()
        }

    result.update_run_info(
        duration=end_time - start_time,
        status=result.status,
        error=error_message,
        peft_config=peft_config_dict,
        benchmark_config=benchmark_config.to_dict(),
    )

    return result


def main() -> int:
    """
    Main entry point for the benchmark runner.

    Returns:
        The process exit code. An experiment path that is outside this directory or does not exist is reported and
        skipped, which also yields 0.
    """
    parser = argparse.ArgumentParser(description="Run PEFT method benchmarks")
    parser.add_argument("experiment_path", help="Path to experiment directory")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose output")
    args = parser.parse_args()

    print_fn = print if args.verbose else (lambda *_args, **_kwargs: None)

    allowed_root = os.path.abspath(os.path.dirname(__file__))
    abs_experiment_path = os.path.abspath(args.experiment_path)
    if not _is_within_directory(abs_experiment_path, allowed_root):
        print(f"Experiment path must be inside {allowed_root}, got: {abs_experiment_path}. Skipping execution.")
        return 0
    if not os.path.exists(abs_experiment_path):
        print(f"Experiment path not found: {abs_experiment_path}. Skipping execution.")
        return 0
    experiment_path = abs_experiment_path

    experiment_name, benchmark_config = validate_experiment_path(experiment_path)

    print_fn(f"Running benchmark for experiment: {experiment_name}")

    result = run_benchmark(
        benchmark_config=benchmark_config,
        experiment_name=experiment_name,
        experiment_path=experiment_path,
        print_fn=print_fn,
    )

    log_results(experiment_name, result, print_fn=print)
    return 0


if __name__ == "__main__":
    sys.exit(main())
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def _base_results_path(model_id: str) -> str:
    """Return the path of the cached base results file for ``model_id`` (``base_results/base_<model>.json``)."""
    base_results_dir = os.path.join(os.path.dirname(__file__), "base_results")
    model_name = model_id.replace("/", "_").replace("-", "_")
    return os.path.join(base_results_dir, f"base_{model_name}.json")


def _accelerator_label() -> str:
    """
    Return the device type to show in the summary.

    ``torch.accelerator.current_accelerator()`` may return ``None`` when no accelerator is present, in which case
    "cpu" is reported. Torch builds without ``torch.accelerator`` keep the previous fallback of "cuda".
    """
    if not hasattr(torch, "accelerator"):
        return "cuda"
    accelerator = torch.accelerator.current_accelerator()
    return accelerator.type if accelerator is not None else "cpu"


def run_base_model_benchmark(benchmark_config: "BenchmarkConfig", print_fn=print) -> dict:
    """
    Run benchmark for base model only and return results.

    Args:
        benchmark_config: Benchmark settings (model id, dtype, seed, quantization flags, generation settings).
        print_fn: Callable used for progress output.

    Returns:
        Dict with the keys ``model_id``, ``benchmark_config``, ``timestamp``, ``inference_results`` (the output of
        ``measure_inference_time``) and ``memory_info`` (memory usage sampled right after the model was loaded).

    Raises:
        ValueError: If ``benchmark_config.dtype`` is not one of "float32", "float16" or "bfloat16".
    """
    print_fn(f"Running base model benchmark for: {benchmark_config.model_id}")

    print_fn("Initializing accelerator...")
    init_accelerator()

    set_seed(benchmark_config.seed)

    print_fn(f"Loading base model: {benchmark_config.model_id}")
    tokenizer = AutoTokenizer.from_pretrained(benchmark_config.model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    supported_dtypes = {
        "float32": torch.float32,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
    }
    if benchmark_config.dtype not in supported_dtypes:
        # Same check as in run.py: a baseline measured with a silently substituted dtype would not be comparable
        # to the PEFT runs.
        raise ValueError(f"Unsupported dtype: {benchmark_config.dtype}")
    torch_dtype = supported_dtypes[benchmark_config.dtype]

    # ``hasattr`` keeps this from raising on torch builds that do not expose a ``torch.xpu`` module.
    has_accelerator = torch.cuda.is_available() or (hasattr(torch, "xpu") and torch.xpu.is_available())
    model_kwargs = {
        "device_map": "auto" if has_accelerator else None,
        "torch_dtype": torch_dtype,
    }

    if benchmark_config.use_8bit:
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True
        )
    elif benchmark_config.use_4bit:
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch_dtype,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

    model = AutoModelForCausalLM.from_pretrained(benchmark_config.model_id, **model_kwargs)

    ram, accelerator_allocated, accelerator_reserved = get_memory_usage()
    print_fn(f"Memory after model load - RAM: {ram:.2f}MB, {model.device.type.upper()}: {accelerator_allocated:.2f}MB")

    print_fn("Preparing benchmark prompts...")
    # Pass the config object itself, as run.py does: the prompt helpers read their settings via attribute access,
    # which finds nothing on the plain dict returned by ``to_dict()``.
    prompts = prepare_benchmark_prompts(
        config=benchmark_config,
        tokenizer=tokenizer,
        max_input_length=None,
        seed=benchmark_config.seed,
    )

    # Measure base model inference for each prompt category
    print_fn("Measuring base model inference times...")
    base_inference_results = measure_inference_time(
        model,
        tokenizer,
        prompts,
        max_new_tokens=benchmark_config.max_new_tokens,
        num_runs=benchmark_config.num_inference_runs,
        print_fn=print_fn,
        category_generation_params=benchmark_config.category_generation_params,
    )

    return {
        "model_id": benchmark_config.model_id,
        "benchmark_config": benchmark_config.to_dict(),
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "inference_results": base_inference_results,
        "memory_info": {
            "ram_mb": ram,
            "accelerator_allocated_mb": accelerator_allocated,
            "accelerator_reserved_mb": accelerator_reserved,
        },
    }


def save_base_results(result: dict, model_id: str) -> str:
    """
    Save base model results with a filename based on the model id.

    The ``base_results`` directory next to this script is created if necessary.

    Returns:
        Path of the written JSON file.
    """
    filepath = _base_results_path(model_id)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)

    return filepath


def main():
    """
    Main entry point for the base model benchmark runner.

    Skips the benchmark when cached results already exist, unless ``--force`` is given. Returns the process exit
    code (always 0).
    """
    parser = argparse.ArgumentParser(description="Run base model benchmarks")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose output")
    parser.add_argument("--force", "-f", action="store_true", help="Force re-run even if results exist")
    args = parser.parse_args()

    print_fn = print if args.verbose else (lambda *_args, **_kwargs: None)

    default_config_path = os.path.join(os.path.dirname(__file__), "default_benchmark_params.json")
    benchmark_config = BenchmarkConfig.from_json(default_config_path)

    filepath = _base_results_path(benchmark_config.model_id)
    if os.path.exists(filepath) and not args.force:
        print(f"Base results already exist at: {filepath}")
        print("Use --force to re-run the benchmark")
        return 0

    # run_base_model_benchmark announces the run itself, so no extra message is printed here.
    result = run_base_model_benchmark(benchmark_config, print_fn=print_fn)

    saved_path = save_base_results(result, benchmark_config.model_id)
    device_type = _accelerator_label()
    print(f"Base model results saved to: {saved_path}")

    print("\nBase Model Benchmark Summary:")
    print(f"Model: {result['model_id']}")
    print(
        f"Memory Usage - RAM: {result['memory_info']['ram_mb']:.2f}MB, {device_type.upper()}: {result['memory_info']['accelerator_allocated_mb']:.2f}MB"
    )

    print("\nInference Times by Category:")
    inference_results = result["inference_results"]
    for category, time_val in inference_results["inference_times"].items():
        time_per_token = inference_results["time_per_token"][category]
        tokens = inference_results["generated_tokens"][category]
        print(f"  {category}: {time_val:.4f}s ({time_per_token:.6f}s/token, {tokens:.1f} tokens)")

    return 0


if __name__ == "__main__":
    sys.exit(main())
index 0000000000000000000000000000000000000000..99533e9ac6ada83c2ccbc1ca356a916b585aa141 --- /dev/null +++ b/peft/method_comparison/text_generation_benchmark/utils.py @@ -0,0 +1,456 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Utilities for PEFT benchmarking. +""" + +import datetime +import json +import os +import platform +import subprocess +from dataclasses import asdict, dataclass, field +from enum import Enum +from typing import Any, Callable, Optional +from peft.utils import infer_device + +import psutil +import torch + + +FILE_NAME_BENCHMARK_PARAMS = "benchmark_params.json" +FILE_NAME_DEFAULT_CONFIG = "default_benchmark_params.json" + +RESULT_PATH = os.path.join(os.path.dirname(__file__), "results") +RESULT_PATH_TEMP = os.path.join(os.path.dirname(__file__), "temporary_results") +RESULT_PATH_CANCELLED = os.path.join(os.path.dirname(__file__), "cancelled_results") + + +class BenchmarkStatus(Enum): + """Status of a benchmark run.""" + + SUCCESS = "success" + FAILED = "failed" + CANCELLED = "cancelled" + RUNNING = "running" + + +@dataclass +class BenchmarkResult: + """Container for benchmark results.""" + + experiment_name: str + status: BenchmarkStatus + + model_id: str + + run_info: dict = field(default_factory=dict) + generation_info: dict = field(default_factory=dict) + meta_info: dict = field(default_factory=dict) + + def __post_init__(self): + """Initialize structured data format.""" + device = 
infer_device() + torch_accelerator_module = getattr(torch, device, torch.cuda) + self.run_info = { + "timestamp": datetime.datetime.now(tz=datetime.timezone.utc).isoformat(), + "duration": 0.0, + "status": self.status.value, + "hardware": { + "num_accelerators": torch_accelerator_module.device_count() if torch_accelerator_module.is_available() else 0, + "accelerator_type": torch_accelerator_module.get_device_name(0) if torch_accelerator_module.is_available() else "N/A", + "cuda_version": torch.version.cuda if torch.cuda.is_available() else "N/A", + "pytorch_version": torch.__version__, + }, + } + + self.meta_info = { + "model_id": self.model_id, + "parameters": { + "base_params": 0, + "trainable_params": 0, + "total_params": 0, + "param_ratio": 0.0, + }, + "model_size": { + "base_model_size_mb": 0.0, + "adapter_size_mb": 0.0, + }, + "package_info": { + "transformers-version": None, + "transformers-commit-hash": None, + "peft-version": None, + "peft-commit-hash": None, + "datasets-version": None, + "datasets-commit-hash": None, + "bitsandbytes-version": None, + "bitsandbytes-commit-hash": None, + "torch-version": torch.__version__, + "torch-commit-hash": None, + }, + "system_info": { + "system": platform.system(), + "release": platform.release(), + "version": platform.version(), + "machine": platform.machine(), + "processor": platform.processor(), + "accelerator": torch_accelerator_module.get_device_name(0) if torch_accelerator_module.is_available() else "N/A", + }, + } + + self.generation_info = { + "memory": { + "peak_accelerator_memory_mb": 0.0, + "peak_ram_memory_mb": 0.0, + "memory_logs": [], + }, + "by_category": {}, + "overall": {}, + } + + def update_meta_info(self, param_counts: dict, size_info: dict, package_info: Optional[dict] = None): + """Update model metadata information.""" + self.meta_info["parameters"].update(param_counts) + self.meta_info["model_size"].update(size_info) + if package_info: + self.meta_info["package_info"].update(package_info) + + 
def update_generation_info(self, memory_data: Optional[dict] = None, performance_metrics: Optional[dict] = None): + """Update generation performance information, primarily for memory and high-level performance.""" + if memory_data: + self.generation_info["memory"].update(memory_data) + if performance_metrics: # For things like overall tokens/sec if calculated + self.generation_info.update(performance_metrics) + + def add_memory_log(self, stage: str, ram_mb: float, accelerator_allocated_mb: float, accelerator_reserved_mb: float): + """Add a memory usage log entry to generation_info.""" + self.generation_info["memory"]["memory_logs"].append( + { + "stage": stage, + "ram_mb": ram_mb, + "accelerator_allocated_mb": accelerator_allocated_mb, + "accelerator_reserved_mb": accelerator_reserved_mb, + } + ) + + def add_metrics_for_category(self, category: str, metrics: dict, individual_samples: list = None): + """Add metrics for a specific prompt category under generation_info.""" + category_data = {"metrics": metrics, "samples": individual_samples if individual_samples is not None else []} + self.generation_info["by_category"][category] = category_data + + def update_run_info( + self, + duration: float, + status: BenchmarkStatus, + error: Optional[str] = None, + peft_config: Optional[dict] = None, + benchmark_config: Optional[dict] = None, + ): + """Update run information.""" + self.run_info["duration"] = duration + self.run_info["status"] = status.value + if error: + self.run_info["error"] = error + if peft_config: + self.run_info["peft_config"] = peft_config + if benchmark_config: + self.run_info["benchmark_config"] = benchmark_config + + def compute_overall_metrics(self): + """Compute overall metrics across all categories within generation_info.""" + if not self.generation_info["by_category"]: + return + + categories = self.generation_info["by_category"] + key_metrics = [ + "inference_time", + "base_inference_time", + "inference_overhead_pct", + "time_per_token", + 
@dataclass
class BenchmarkConfig:
    """Configuration for benchmarking PEFT methods.

    The numeric settings are validated in ``__post_init__``; the same validation is
    applied again whenever values are merged in through ``merge_from_dict``.
    """

    # Identifier of the model to benchmark; must be a string.
    model_id: str

    # Must be >= 0.
    seed: int
    # Must be > 0.
    num_inference_runs: int
    # Must be > 0.
    max_new_tokens: int

    dtype: str = "float16"
    use_4bit: bool = False
    use_8bit: bool = False

    category_generation_params: Optional[dict] = None

    def __post_init__(self) -> None:
        """Validate configuration.

        Raises:
            ValueError: If ``model_id`` is not a string, ``seed`` is negative, or
                ``num_inference_runs`` / ``max_new_tokens`` is not strictly positive.
        """
        if not isinstance(self.model_id, str):
            raise ValueError(f"Invalid model_id: {self.model_id}")

        if self.seed < 0:
            raise ValueError(f"Invalid seed: {self.seed}")

        if self.num_inference_runs <= 0:
            raise ValueError(f"Invalid num_inference_runs: {self.num_inference_runs}")

        if self.max_new_tokens <= 0:
            raise ValueError(f"Invalid max_new_tokens: {self.max_new_tokens}")

    @classmethod
    def from_dict(cls, config_dict: dict) -> "BenchmarkConfig":
        """Create config from dictionary, silently dropping keys that are not config fields."""
        valid_keys = set(cls.__dataclass_fields__.keys())
        filtered_dict = {k: v for k, v in config_dict.items() if k in valid_keys}

        return cls(**filtered_dict)

    @classmethod
    def from_json(cls, json_path: str) -> "BenchmarkConfig":
        """Load config from JSON file."""
        with open(json_path) as f:
            config_dict = json.load(f)
        return cls.from_dict(config_dict)

    def to_dict(self) -> dict[str, Any]:
        """Convert config to dictionary."""
        return asdict(self)

    def save(self, path: str) -> None:
        """Save config to JSON file."""
        with open(path, "w") as f:
            json.dump(self.to_dict(), f, indent=2)

    def merge_from_dict(self, config_dict: dict) -> None:
        """Merge settings from a dictionary into this config object.

        Keys that name a config field override the current value. Every other key is
        ignored, including keys that happen to match a method name such as ``"save"``;
        a plain ``hasattr`` check would let those replace the method with a data value.

        The merged configuration is validated before anything is assigned, so a
        rejected merge leaves this object unchanged.

        Raises:
            ValueError: If the merged configuration fails the ``__post_init__`` checks.
        """
        field_names = self.__dataclass_fields__
        overrides = {key: value for key, value in config_dict.items() if key in field_names}
        if not overrides:
            return

        # Build a throwaway instance first: its __post_init__ runs the validation on
        # the merged values without touching ``self``.
        merged = {name: getattr(self, name) for name in field_names}
        merged.update(overrides)
        type(self)(**merged)

        for key, value in overrides.items():
            setattr(self, key, value)
def get_memory_usage() -> tuple[float, float, float]:
    """Get current memory usage (RAM and accelerator).

    Returns:
        A ``(ram_mb, accelerator_allocated_mb, accelerator_reserved_mb)`` tuple. The RAM
        figure is the resident set size of the current process. The accelerator figures
        come from the first available backend among ``torch.cuda`` and ``torch.xpu`` and
        are ``0.0`` when neither is available.
    """
    bytes_per_mb = 1024 * 1024
    ram_usage_mb = psutil.Process(os.getpid()).memory_info().rss / bytes_per_mb

    # CUDA is probed first, then XPU. ``getattr`` with a default keeps this working on
    # torch builds that do not ship the ``torch.xpu`` namespace at all.
    for backend_name in ("cuda", "xpu"):
        backend = getattr(torch, backend_name, None)
        if backend is not None and backend.is_available():
            accelerator_allocated_mb = backend.memory_allocated() / bytes_per_mb
            accelerator_reserved_mb = backend.memory_reserved() / bytes_per_mb
            return ram_usage_mb, accelerator_allocated_mb, accelerator_reserved_mb

    return ram_usage_mb, 0.0, 0.0
def get_model_size_mb(model: torch.nn.Module, dtype_bytes: int = 4) -> float:
    """Calculate model size in MB.

    The size is ``number of parameters * dtype_bytes``; the actual dtype of the
    parameters is not inspected, so the caller chooses the bytes-per-parameter value.
    """
    return sum(p.numel() * dtype_bytes for p in model.parameters()) / (1024 * 1024)


def get_peft_branch() -> str:
    """Return the name of the git branch that is checked out for this file's directory.

    Returns:
        The output of ``git rev-parse --abbrev-ref HEAD``, or ``"unknown"`` when the
        branch cannot be determined (git is not installed, or the directory is not
        inside a git work tree). Returning a sentinel instead of raising keeps result
        saving from failing after a benchmark has already finished.
    """
    repo_root = os.path.dirname(os.path.abspath(__file__))
    try:
        output = subprocess.check_output(
            ["git", "rev-parse", "--abbrev-ref", "HEAD"],
            cwd=repo_root,
            stderr=subprocess.DEVNULL,
        )
    except (OSError, subprocess.CalledProcessError):
        # OSError covers a missing git executable; CalledProcessError covers a
        # non-zero exit status such as "not a git repository".
        return "unknown"
    return output.decode().strip()
generation_info):") + memory_data = benchmark_result.generation_info.get("memory", {}) + print_fn(f" Peak Accelerator Memory: {memory_data.get('peak_accelerator_memory_mb', 0):.2f} MB") + print_fn(f" Peak RAM Memory: {memory_data.get('peak_ram_memory_mb', 0):.2f} MB") + + print_fn("\nDetailed Metrics (from generation_info.by_category):") + if benchmark_result.generation_info.get("by_category"): + for category, cat_data in benchmark_result.generation_info["by_category"].items(): + print_fn(f" Category: {category}") + metrics = cat_data.get("metrics", {}) + print_fn(f" Inference Time: {metrics.get('inference_time', 0):.4f} seconds") + print_fn(f" Base Inference Time: {metrics.get('base_inference_time', 0):.4f} seconds") + print_fn(f" Inference Overhead: {metrics.get('inference_overhead_pct', 0):.2f}%") + print_fn(f" Time Per Token: {metrics.get('time_per_token', 0):.6f} seconds/token") + print_fn(f" Generated Tokens: {metrics.get('generated_tokens', 0):.1f}") + + samples = cat_data.get("samples", []) + if samples: + print_fn(f" Number of Samples: {len(samples)}") + print_fn( + f" Average Generated Tokens: {sum(s.get('generated_tokens', 0) for s in samples) / len(samples):.1f}" + ) + else: + print_fn(" No per-category metrics available.") + + benchmark_result.compute_overall_metrics() + + print_fn("\nOverall Metrics (from generation_info.overall):") + overall = benchmark_result.generation_info.get("overall") + if overall: + print_fn(f" Inference Time: {overall.get('inference_time', 0):.4f} seconds") + print_fn(f" Base Inference Time: {overall.get('base_inference_time', 0):.4f} seconds") + print_fn(f" Inference Overhead: {overall.get('inference_overhead_pct', 0):.2f}%") + print_fn(f" Time Per Token: {overall.get('time_per_token', 0):.6f} seconds/token") + print_fn(f" Generated Tokens: {overall.get('generated_tokens', 0):.1f}") + else: + print_fn(" No overall metrics computed.") + + print_fn("\nSaved results to:", benchmark_result.save()) + print_fn("=" * 50) diff --git 
a/peft/pyproject.toml b/peft/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..76c7294107b8c59bd4705ea77b1ea674f2ec012a --- /dev/null +++ b/peft/pyproject.toml @@ -0,0 +1,56 @@ +[tool.black] +# Only used by `hf-doc-builder´. +line-length = 119 +target-version = ['py38'] + +[tool.ruff] +target-version = "py39" +line-length = 119 +extend-exclude = ["*.ipynb"] + +[tool.ruff.lint] +preview = true +explicit-preview-rules = true +extend-select = [ + "C", # Complexity + "E", # PEP8 errors + "F", # PEP8 formatting + "I", # Import sorting + "UP", # Pyupgrade upgrades + "W", # PEP8 warnings + "PT009", # Pytest assertions + "RUF022", # Sorting of __all__ +] +ignore = [ + "C901", # Function too complex + "E501", # Line length (handled by ruff-format) + "F841", # unused variable + "UP007", # X | Y style Unions + "C420", # dict.fromkeys + "UP045", # don't force replacing Optional[X] with X | None +] + +[tool.ruff.lint.isort] +lines-after-imports = 2 +known-first-party = ["peft"] + +[tool.pytest] +doctest_optionflags = [ + "NORMALIZE_WHITESPACE", + "ELLIPSIS", + "NUMBER", +] + +[tool.pytest.ini_options] +addopts = "--cov=src/peft --cov-report=term-missing --durations=10" +markers = [ + "single_gpu_tests: tests that run on a single GPU", + "multi_gpu_tests: tests that run on multiple GPUs", + "regression: whether to run regression suite test", + "bitsandbytes: select bitsandbytes integration tests" +] + +filterwarnings = [ + "error::DeprecationWarning:transformers", + "error::FutureWarning:transformers", +] diff --git a/peft/requirements.txt b/peft/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..dca857de3249b60ea3786b49156d14166cd57ac0 --- /dev/null +++ b/peft/requirements.txt @@ -0,0 +1,15 @@ +accelerate +torch +safetensors +bitsandbytes +scipy +peft +transformers +tqdm +packaging +pytest +numpy +pyyaml +datasets +psutil +setuptools \ No newline at end of file diff --git a/peft/scripts/ci_clean_cache.py 
def find_old_revisions(scan_results, max_age_days=30):
    """Collect the commit hashes of cached revisions that have not been accessed recently.

    A repo in ``scan_results.repos`` qualifies when the number of whole days between now
    and its recorded last access time is strictly greater than ``max_age_days``. The
    commit hashes of all revisions of qualifying repos are returned; the list is empty
    when nothing qualifies.
    """
    reference_time = dt.now()
    stale_hashes = []
    for repo in scan_results.repos:
        idle_days = (reference_time - dt.fromtimestamp(repo.last_accessed)).days
        if idle_days <= max_age_days:
            continue
        stale_hashes.extend(revision.commit_hash for revision in repo.revisions)
    return stale_hashes


def delete_old_revisions(scan_results, delete_candidates, do_delete=False):
    """Report what deleting ``delete_candidates`` would free and delete only if ``do_delete`` is set."""
    plan = scan_results.delete_revisions(*delete_candidates)
    print(f"Would free {plan.expected_freed_size_str}")
    print(f"Candidates: {delete_candidates}")

    if not do_delete:
        print("Not deleting, pass the -d flag.")
        return

    print("Deleting now.")
    plan.execute()
age in days items in the cache may have.") + parser.add_argument( + "-d", + "--delete", + action="store_true", + help=( + "Delete mode; Really delete items if there are candidates. Exit code = 0 when we found something to delete, 1 " + "otherwise." + ), + ) + args = parser.parse_args() + + scan_results = scan_cache_dir() + + delete_candidates = find_old_revisions(scan_results, args.max_age) + if not delete_candidates: + print("No delete candidates found, not deleting anything.") + sys.exit(1) + + delete_old_revisions(scan_results, delete_candidates, do_delete=args.delete) diff --git a/peft/scripts/convert-bone-to-miss.py b/peft/scripts/convert-bone-to-miss.py new file mode 100644 index 0000000000000000000000000000000000000000..e709410fa806bb0fa35e6f01160084c9dc2d4652 --- /dev/null +++ b/peft/scripts/convert-bone-to-miss.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Your Organization/Project. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def convert_bone_to_miss(bone_dir: Path, miss_dir: Path) -> None:
    """Write a MiSS version of the Bone checkpoint found in ``bone_dir`` into ``miss_dir``.

    The adapter config is copied with ``peft_type`` set to ``"MISS"``, and every tensor in
    the safetensors weight file is stored under its old key with ``.bone_`` replaced by
    ``.miss_``. ``miss_dir`` is created if it does not exist.
    """
    source_config = bone_dir / CONFIG_NAME
    target_config = miss_dir / CONFIG_NAME
    source_weights = bone_dir / SAFETENSORS_WEIGHTS_NAME
    target_weights = miss_dir / SAFETENSORS_WEIGHTS_NAME

    # The output directory has to exist before the config is written into it.
    if not os.path.exists(miss_dir):
        os.makedirs(miss_dir, exist_ok=True)

    with open(source_config, encoding="utf-8") as handle:
        adapter_config = json.load(handle)
    adapter_config["peft_type"] = "MISS"
    with open(target_config, "w", encoding="utf-8") as handle:
        json.dump(adapter_config, handle, indent=2, ensure_ascii=False)

    with safe_open(source_weights, framework="pt") as checkpoint:
        renamed_tensors = {
            key.replace(".bone_", ".miss_"): checkpoint.get_tensor(key) for key in checkpoint.keys()
        }

    save_file(renamed_tensors, target_weights)

    print(f"Converted checkpoint saved at {target_weights}")


def main() -> None:
    """Parse the two directory arguments and run the conversion."""
    cli = argparse.ArgumentParser(description="Convert Bone checkpoint to MiSS format.")
    cli.add_argument("bone_dir", type=Path, help="Directory containing Bone checkpoint files")
    cli.add_argument("miss_dir", type=Path, help="Directory to save MiSS checkpoint files")
    options = cli.parse_args()

    options.miss_dir.mkdir(parents=True, exist_ok=True)
    convert_bone_to_miss(options.bone_dir, options.miss_dir)


if __name__ == "__main__":
    main()
Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This is a minimal example of launching PEFT with Accelerate. This used to cause issues because PEFT would eagerly +# import bitsandbytes, which initializes CUDA, resulting in: +# > RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the +# > 'spawn' start method +# This script exists to ensure that this issue does not reoccur. + +import torch +from accelerate import notebook_launcher + +import peft +from peft.utils import infer_device + + +def init(): + class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(1, 2) + + def forward(self, x): + return self.linear(x) + + device = infer_device() + model = MyModule().to(device) + peft.get_peft_model(model, peft.LoraConfig(target_modules=["linear"])) + + +def main(): + notebook_launcher(init, (), num_processes=2) + + +if __name__ == "__main__": + main() diff --git a/peft/scripts/log_reports.py b/peft/scripts/log_reports.py new file mode 100644 index 0000000000000000000000000000000000000000..c8191ee8f5a2d79c83b266e4a60d2f280fa9d376 --- /dev/null +++ b/peft/scripts/log_reports.py @@ -0,0 +1,144 @@ +import argparse +import json +import os +from datetime import date +from pathlib import Path + +from tabulate import tabulate + + +MAX_LEN_MESSAGE = 2900 # slack endpoint has a limit of 3001 characters + +parser = argparse.ArgumentParser() +parser.add_argument( + 
"--slack_channel_name", + default="peft-ci-daily", +) + + +def main(slack_channel_name=None): + failed = [] + passed = [] + + group_info = [] + + total_num_failed = 0 + empty_file = False or len(list(Path().glob("*.log"))) == 0 + + total_empty_files = [] + + for log in Path().glob("*.log"): + section_num_failed = 0 + i = 0 + with open(log) as f: + for line in f: + line = json.loads(line) + i += 1 + if line.get("nodeid", "") != "": + test = line["nodeid"] + if line.get("duration", None) is not None: + duration = f"{line['duration']:.4f}" + if line.get("outcome", "") == "failed": + section_num_failed += 1 + failed.append([test, duration, log.name.split("_")[0]]) + total_num_failed += 1 + else: + passed.append([test, duration, log.name.split("_")[0]]) + empty_file = i == 0 + group_info.append([str(log), section_num_failed, failed]) + total_empty_files.append(empty_file) + os.remove(log) + failed = [] + text = ( + "🌞 There were no failures!" + if not any(total_empty_files) + else "Something went wrong there is at least one empty file - please check GH action results." 
+ ) + no_error_payload = { + "type": "section", + "text": { + "type": "plain_text", + "text": text, + "emoji": True, + }, + } + + message = "" + payload = [ + { + "type": "header", + "text": { + "type": "plain_text", + "text": "🤗 Results of the {} PEFT scheduled tests.".format(os.environ.get("TEST_TYPE", "")), + }, + }, + ] + if total_num_failed > 0: + for i, (name, num_failed, failed_tests) in enumerate(group_info): + if num_failed > 0: + if num_failed == 1: + message += f"*{name}: {num_failed} failed test*\n" + else: + message += f"*{name}: {num_failed} failed tests*\n" + failed_table = [] + for test in failed_tests: + failed_table.append(test[0].split("::")) + failed_table = tabulate( + failed_table, + headers=["Test Location", "Test Case", "Test Name"], + showindex="always", + tablefmt="grid", + maxcolwidths=[12, 12, 12], + ) + message += "\n```\n" + failed_table + "\n```" + + if total_empty_files[i]: + message += f"\n*{name}: Warning! Empty file - please check the GitHub action job *\n" + print(f"### {message}") + else: + payload.append(no_error_payload) + + if os.environ.get("TEST_TYPE", "") != "": + from slack_sdk import WebClient + + if len(message) > MAX_LEN_MESSAGE: + print(f"Truncating long message from {len(message)} to {MAX_LEN_MESSAGE}") + message = message[:MAX_LEN_MESSAGE] + "..." 
+ + if len(message) != 0: + md_report = { + "type": "section", + "text": {"type": "mrkdwn", "text": message}, + } + payload.append(md_report) + action_button = { + "type": "section", + "text": {"type": "mrkdwn", "text": "*For more details:*"}, + "accessory": { + "type": "button", + "text": {"type": "plain_text", "text": "Check Action results", "emoji": True}, + "url": f"https://github.com/huggingface/peft/actions/runs/{os.environ['GITHUB_RUN_ID']}", + }, + } + payload.append(action_button) + + date_report = { + "type": "context", + "elements": [ + { + "type": "plain_text", + "text": f"Nightly {os.environ.get('TEST_TYPE')} test results for {date.today()}", + }, + ], + } + payload.append(date_report) + + print(payload) + + client = WebClient(token=os.environ.get("SLACK_API_TOKEN")) + client.chat_postMessage(channel=f"#{slack_channel_name}", text=message, blocks=payload) + + +if __name__ == "__main__": + args = parser.parse_args() + main(args.slack_channel_name) diff --git a/peft/scripts/stale.py b/peft/scripts/stale.py new file mode 100644 index 0000000000000000000000000000000000000000..794ec8451282c69ae9cff18c15329b14816d707a --- /dev/null +++ b/peft/scripts/stale.py @@ -0,0 +1,65 @@ +# Copyright 2023 The HuggingFace Team, the AllenNLP library authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Script to close stale issue. Taken in part from the AllenNLP repository. +https://github.com/allenai/allennlp. 
LABELS_TO_EXEMPT = [
    "good first issue",
    "good second issue",
    "good difficult issue",
    "feature request",
    "new model",
    "wip",
    "PRs welcome to address this",
]

# Issue label names are lower-cased before the lookup, so the exempt names must be
# lower-cased as well; otherwise a mixed-case entry such as
# "PRs welcome to address this" can never match and the exemption is silently ignored.
_EXEMPT_LABELS_LOWERCASE = frozenset(label.lower() for label in LABELS_TO_EXEMPT)


def has_exempt_label(issue) -> bool:
    """Return True if any label of ``issue`` matches ``LABELS_TO_EXEMPT``, ignoring case."""
    return any(label.name.lower() in _EXEMPT_LABELS_LOWERCASE for label in issue.get_labels())


def main():
    """Close or flag stale open issues of the huggingface/peft repository.

    For every open issue that is at least 30 days old and carries no exempt label:

    - it is closed when the most recent comment was written by ``github-actions[bot]``
      and the issue has not been updated for more than 7 days;
    - otherwise a stale notice is posted when the issue has not been updated for more
      than 23 days.

    The GitHub token is read from the ``GITHUB_TOKEN`` environment variable.
    """
    g = Github(os.environ["GITHUB_TOKEN"])
    repo = g.get_repo("huggingface/peft")
    open_issues = repo.get_issues(state="open")

    for issue in open_issues:
        # Most recent comment, or None when the issue has no comments.
        last_comment = max(issue.get_comments(), key=lambda c: c.created_at, default=None)
        now = dt.now(timezone.utc)
        days_since_update = (now - issue.updated_at).days
        days_since_creation = (now - issue.created_at).days

        if (
            (last_comment is not None and last_comment.user.login == "github-actions[bot]")
            and days_since_update > 7
            and days_since_creation >= 30
            and not has_exempt_label(issue)
        ):
            issue.edit(state="closed")
        elif days_since_update > 23 and days_since_creation >= 30 and not has_exempt_label(issue):
            issue.create_comment(
                "This issue has been automatically marked as stale because it has not had "
                "recent activity. If you think this still needs to be addressed "
                "please comment on this thread.\n\n"
            )


if __name__ == "__main__":
    main()
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This script trains a model on a small text dataset and measures the memory consumption, as well as a few other +useful metrics. + +Example: + +Get help: + +```bash +python train_memory.py --help +``` + +Train the google/gemma-2-2b model with a LoRA config json at the indicated location. + +```bash +python train_memory.py "google/gemma-2-2b" --max_seq_length 256 --batch_size 1 --rank 32 --dtype bfloat16 --path_config path/to/adapter_config.json +``` + +Fully fine-tune the model (i.e. without LoRA) by setting the rank to 0: + +```bash +python train_memory.py "google/gemma-2-2b" --rank 0 +``` + +Get an estimate of the size of the hidden states by passing `--monitor_tensors`. This trains just for a single epoch.
# suppress all warnings
warnings.filterwarnings("ignore")


def _detect_device_type():
    """Return the device type string used by this script.

    Falls back to ``"cuda"`` on torch versions without ``torch.accelerator`` (the previous behavior) and to ``"cpu"``
    when ``torch.accelerator.current_accelerator()`` reports no accelerator (it returns ``None`` in that case, which
    previously raised ``AttributeError`` on ``.type`` at import time).
    """
    if not hasattr(torch, "accelerator"):
        return "cuda"
    accelerator = torch.accelerator.current_accelerator()
    return accelerator.type if accelerator is not None else "cpu"


device = _detect_device_type()
# bytes per element for each supported --dtype value (int4 packs two values per byte)
dtype_to_bytes_linear = {"float32": 4, "float16": 2, "bfloat16": 2, "int8": 1, "int4": 0.5}


def init_accelerator():
    """Seed the RNGs and reset the peak-memory statistics of the selected accelerator.

    On CPU only the global torch seed is set, since there are no accelerator statistics to reset.
    """
    torch.manual_seed(0)
    if device == "cpu":
        return

    device_module = getattr(torch, device, torch.cuda)
    device_module.reset_peak_memory_stats()
    device_module.manual_seed_all(0)
    # might not be necessary, but just to be sure
    nn.Linear(1, 1).to(device)


def get_data(tokenizer):
    """Load the ``ybelkada/english_quotes_copy`` dataset and tokenize its ``quote`` column.

    Args:
        tokenizer: tokenizer whose ``model_max_length`` is used as the truncation length.

    Returns:
        The tokenized dataset with the string columns (``quote``, ``author``, ``tags``) removed.
    """

    def tokenize(samples):
        # For some reason, the max sequence length is not honored by the tokenizer, resulting in IndexErrors. Thus,
        # manually ensure that sequences are not too long.
        tokenized = tokenizer(samples["quote"])
        max_length = tokenizer.model_max_length
        tokenized["input_ids"] = [input_ids[:max_length] for input_ids in tokenized["input_ids"]]
        tokenized["attention_mask"] = [mask[:max_length] for mask in tokenized["attention_mask"]]
        return tokenized

    data = load_dataset("ybelkada/english_quotes_copy")
    data = data.map(tokenize, batched=True)
    # We need to manually remove unused columns. This is because we cannot use remove_unused_columns=True in the
    # Trainer, as this leads to errors with torch.compile. We also cannot just leave them in, as they contain
    # strings. Therefore, manually remove all unused columns.
    data = data.remove_columns(["quote", "author", "tags"])
    return data
def train(model_id, rank, dtype, monitor_tensors, max_seq_length, batch_size, max_steps, path_config):
    """Train ``model_id`` for a few steps and print memory, timing and checkpoint-size statistics.

    Args:
        model_id: model name passed to ``AutoTokenizer`` / ``AutoModelForCausalLM``.
        rank: LoRA is applied when ``rank > 0`` (the actual LoRA settings come from ``path_config``); with
            ``rank <= 0`` the plain model is trained.
        dtype: one of ``"float32"``, ``"float16"``, ``"bfloat16"``, ``"int8"``, ``"int4"``.
        monitor_tensors: if true, record the tensors saved for backward, stop after the first step and print a size
            breakdown.
        max_seq_length: value assigned to ``tokenizer.model_max_length``.
        batch_size: number of samples per step.
        max_steps: maximum number of training steps.
        path_config: directory (or path to the config file) of the LoRA config; required when ``rank > 0``.

    Raises:
        ValueError: if ``dtype`` is not one of the supported values.
        RuntimeError: if ``rank > 0`` but ``path_config`` is ``None``.
    """
    init_accelerator()
    device_module = getattr(torch, device, torch.cuda)
    accelerator_memory_init = device_module.max_memory_allocated()
    accelerator_memory_log = []

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.model_max_length = max_seq_length
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token
    data = get_data(tokenizer)

    if dtype == "int4":
        quant_config = BitsAndBytesConfig(load_in_4bit=True)
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, quantization_config=quant_config)
        model = prepare_model_for_kbit_training(model)
    elif dtype == "int8":
        quant_config = BitsAndBytesConfig(load_in_8bit=True)
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, quantization_config=quant_config)
        model = prepare_model_for_kbit_training(model)
    elif dtype == "bfloat16":
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, torch_dtype=torch.bfloat16)
    elif dtype == "float16":
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, torch_dtype=torch.float16)
    elif dtype == "float32":
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device)
    else:
        raise ValueError(f"Invalid dtype: {dtype}")

    if rank > 0:
        if path_config is None:
            raise RuntimeError("LoRA rank > 0 requires a path to a LoRA config")
        # accept both the directory and the full path to the config file
        if path_config.endswith(CONFIG_NAME):
            path_config = path_config.removesuffix(CONFIG_NAME)
        config = LoraConfig.from_pretrained(path_config)
        model = get_peft_model(model, config)
        model.print_trainable_parameters()
    else:
        print("Not using LoRA")

    model.config.use_cache = False
    # tensors saved for the backward pass; only filled when monitor_tensors is set
    storage = []

    def pack(x):
        storage.append(x)
        return len(storage) - 1

    def unpack(x):
        return storage[x]

    train_ctx = partial(torch.autograd.graph.saved_tensors_hooks, pack, unpack) if monitor_tensors else nullcontext

    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    losses = []
    sample = 0
    tic_total = time.perf_counter()
    for i in range(0, max_steps):
        storage.clear()
        tic = time.perf_counter()
        try:
            batch = tokenizer.pad(data["train"][sample : sample + batch_size], return_tensors="pt").to(model.device)
            sample += batch_size

            # add targets
            batch["labels"] = batch["input_ids"].clone()
            optimizer.zero_grad()

            with train_ctx():
                outputs = model(**batch)
                loss = outputs.loss
                loss.backward()
            optimizer.step()
            losses.append(loss.item())
            accelerator_memory_log.append(device_module.memory_allocated() - accelerator_memory_init)
            device_module.empty_cache()
            gc.collect()
            toc = time.perf_counter()
            print(f"step {i:3d} loss {losses[-1]:.6f} time {toc - tic:.2f}s", file=sys.stderr)
        except KeyboardInterrupt:
            print("canceled training")
            break

        if monitor_tensors:
            break

    toc_total = time.perf_counter()

    accelerator_memory_final = device_module.max_memory_allocated()
    # The log is empty when no step completed (max_steps <= 0 or training was interrupted during the first step);
    # report 0 instead of dividing by zero.
    if accelerator_memory_log:
        accelerator_memory_avg = int(sum(accelerator_memory_log) / len(accelerator_memory_log))
    else:
        accelerator_memory_avg = 0
    print(f"{model.device.type} memory avg: {accelerator_memory_avg // 2**20}MB")
    print(f"{model.device.type} memory max: {(accelerator_memory_final - accelerator_memory_init) // 2**20}MB")
    print(f"total time: {toc_total - tic_total:.2f}s")

    with tempfile.TemporaryDirectory() as tmp_dir:
        model.save_pretrained(tmp_dir)
        # NOTE(review): SAFETENSORS_WEIGHTS_NAME comes from peft.utils; presumably it is the adapter file name, in
        # which case this lookup may not find a file when rank == 0 (plain model) -- confirm.
        stat = os.stat(os.path.join(tmp_dir, SAFETENSORS_WEIGHTS_NAME))
    file_size = stat.st_size
    print(f"file size: {file_size / 2**20:.1f}MB")

    if monitor_tensors:
        dtype_counts = Counter(t.dtype for t in storage)
        shape_counts = Counter(t.shape for t in storage)
        param_shape_counts = Counter(p.shape for p in model.parameters())
        param_shape_counts_copy = dict(param_shape_counts).copy()

        # shape counts includes the params, so we need to subtract them; note that they can be transposed
        # this is an approximation
        diff_shape_counts = {}
        for shape, count in shape_counts.items():
            if shape in param_shape_counts_copy:
                diff_count = count - param_shape_counts[shape]
                if diff_count > 0:
                    diff_shape_counts[shape] = diff_count
                    param_shape_counts_copy[shape] = max(0, param_shape_counts_copy[shape] - diff_count)
            elif shape[::-1] in param_shape_counts:
                diff_count = count - param_shape_counts[shape[::-1]]
                if diff_count > 0:
                    diff_shape_counts[shape] = diff_count
                    param_shape_counts_copy[shape[::-1]] = max(0, param_shape_counts_copy[shape[::-1]] - diff_count)
            else:
                diff_shape_counts[shape] = count

        total_size = sum(t.numel() * t.element_size() for t in storage)
        total_size_mb = f"{total_size // 2**20}MB"
        diff_size = 0
        for shape, count in diff_shape_counts.items():
            # torch.Size.numel() gives the element count directly, no need to allocate a tensor of that shape
            diff_size += count * shape.numel() * dtype_to_bytes_linear[dtype]
        param_size = total_size - diff_size

        diff_size_mb = f"{diff_size // 2**20}MB"
        param_size_mb = f"{param_size // 2**20}MB"

        print(f"Dtype counts: {dtype_counts.most_common()}")
        print(f"Total size of tensors: {total_size_mb: >12}")
        print(f"Total size of activations: {diff_size_mb: >12}")
        print(f"Total size of parameters: {param_size_mb: >12}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("model_id", type=str, help="Model name on Hugging Face Hub")
    parser.add_argument("--rank", type=int, default=8, help="Rank of LoRA, 0 => no LoRA, default 8")
    parser.add_argument(
        "--dtype",
        type=str,
        default="float32",
        help="Data type, one of float32, float16, bfloat16, int8, int4, default float32",
    )
    parser.add_argument(
        "--monitor_tensors",
        action="store_true",
        help="Monitor tensor sizes during training for a single training step, off by default",
    )
    parser.add_argument("--max_seq_length", type=int, default=128, help="Maximum sequence length, default 128")
    parser.add_argument("--batch_size", type=int, default=1, help="Batch size, default 1")
    parser.add_argument("--max_steps", type=int, default=50, help="Maximum number of training steps, default 50")
    parser.add_argument("--path_config", type=str, default=None, help="Path to LoRA config")
    args = parser.parse_args()
    train(
        model_id=args.model_id,
        rank=args.rank,
        dtype=args.dtype,
        monitor_tensors=args.monitor_tensors,
        max_seq_length=args.max_seq_length,
        batch_size=args.batch_size,
        max_steps=args.max_steps,
        path_config=args.path_config,
    )
from setuptools import find_packages, setup


VERSION = "0.17.2.dev0"

extras = {}
extras["quality"] = [
    "black",  # doc-builder has an implicit dependency on Black, see huggingface/doc-builder#434
    "hf-doc-builder",
    "ruff~=0.12.8",
]
extras["docs_specific"] = [
    "black",  # doc-builder has an implicit dependency on Black, see huggingface/doc-builder#434
    "hf-doc-builder",
]
extras["dev"] = extras["quality"] + extras["docs_specific"]
extras["test"] = extras["dev"] + [
    "pytest",
    "pytest-cov",
    "pytest-xdist",
    "parameterized",
    "datasets",
    "diffusers",
    "scipy",
    "protobuf",
    "sentencepiece",
]

# Read the README through a context manager so the file handle is closed deterministically instead of being left
# to the garbage collector.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="peft",
    version=VERSION,
    description="Parameter-Efficient Fine-Tuning (PEFT)",
    license_files=["LICENSE"],
    long_description=long_description,
    long_description_content_type="text/markdown",
    keywords="deep learning",
    license="Apache",
    author="The HuggingFace team",
    author_email="benjamin@huggingface.co",
    url="https://github.com/huggingface/peft",
    package_dir={"": "src"},
    packages=find_packages("src"),
    package_data={"peft": ["py.typed", "tuners/boft/fbd/fbd_cuda.cpp", "tuners/boft/fbd/fbd_cuda_kernel.cu"]},
    entry_points={},
    python_requires=">=3.10.0",
    install_requires=[
        "numpy>=1.17",
        "packaging>=20.0",
        "psutil",
        "pyyaml",
        "torch>=1.13.0",
        "transformers",
        "tqdm",
        "accelerate>=0.21.0",
        "safetensors",
        "huggingface_hub>=0.25.0",
    ],
    extras_require=extras,
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
        "Intended Audience :: Education",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: 3.13",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
)

# Release checklist
# 1. Change the version in __init__.py and setup.py to the release version, e.g. from "0.6.1.dev0" to "0.7.0"
# 2. Check if there are any deprecations that need to be addressed for this release by searching for "# TODO" in the code
# 3. Commit these changes with the message: "Release: VERSION", create a PR and merge it.
# 4. Add a tag in git to mark the release: "git tag -a v<VERSION> -m 'Adds tag <VERSION> for pypi' "
#    Push the tag to git:
#      git push --tags origin main
#    It is necessary to work on the original repository, not on a fork.
# 5. Run the following commands in the top-level directory:
#      python setup.py bdist_wheel
#      python setup.py sdist
#    Ensure that you are on the clean and up-to-date main branch (git status --untracked-files=no should not list any
#    files and show the main branch)
# 6. Upload the package to the pypi test server first:
#      twine upload dist/* -r pypitest
# 7. Check that you can install it in a virtualenv by running:
#      pip install -i https://testpypi.python.org/pypi --extra-index-url https://pypi.org/simple peft
# 8. Upload the final version to actual pypi:
#      twine upload dist/* -r pypi
# 9. Add release notes to the tag on https://github.com/huggingface/peft/releases once everything is looking hunky-dory.
#    Check the notes here: https://docs.google.com/document/d/1k-sOIfykuKjWcOIALqjhFKz4amFEp-myeJUJEzNgjoU/edit?usp=sharing
# 10. Update the version in __init__.py, setup.py to the bumped patch version + ".dev0" (e.g. from "0.7.0" to "0.7.1.dev0")
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = "0.17.2.dev0" + +from .auto import ( + MODEL_TYPE_TO_PEFT_MODEL_MAPPING, + AutoPeftModel, + AutoPeftModelForCausalLM, + AutoPeftModelForFeatureExtraction, + AutoPeftModelForQuestionAnswering, + AutoPeftModelForSeq2SeqLM, + AutoPeftModelForSequenceClassification, + AutoPeftModelForTokenClassification, +) +from .config import PeftConfig, PromptLearningConfig +from .mapping import ( + PEFT_TYPE_TO_CONFIG_MAPPING, + PEFT_TYPE_TO_MIXED_MODEL_MAPPING, + PEFT_TYPE_TO_TUNER_MAPPING, + get_peft_config, + inject_adapter_in_model, +) +from .mapping_func import get_peft_model +from .mixed_model import PeftMixedModel +from .peft_model import ( + PeftModel, + PeftModelForCausalLM, + PeftModelForFeatureExtraction, + PeftModelForQuestionAnswering, + PeftModelForSeq2SeqLM, + PeftModelForSequenceClassification, + PeftModelForTokenClassification, + get_layer_status, + get_model_status, +) +from .tuners import ( + AdaLoraConfig, + AdaLoraModel, + AdaptionPromptConfig, + AdaptionPromptModel, + ArrowConfig, + BOFTConfig, + BOFTModel, + BoneConfig, + BoneModel, + C3AConfig, + C3AModel, + CPTConfig, + CPTEmbedding, + EvaConfig, + FourierFTConfig, + FourierFTModel, + HRAConfig, + HRAModel, + IA3Config, + IA3Model, + LNTuningConfig, + LNTuningModel, + LoftQConfig, + LoHaConfig, + LoHaModel, + LoKrConfig, + LoKrModel, + LoraConfig, + LoraModel, + LoraRuntimeConfig, + MissConfig, + MissModel, + MultitaskPromptTuningConfig, + 
# Public API of the package. Kept sorted; every name must appear exactly once (a duplicated "VBLoRAConfig" entry
# was removed).
__all__ = [
    "MODEL_TYPE_TO_PEFT_MODEL_MAPPING",
    "PEFT_TYPE_TO_CONFIG_MAPPING",
    "PEFT_TYPE_TO_MIXED_MODEL_MAPPING",
    "PEFT_TYPE_TO_TUNER_MAPPING",
    "TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING",
    "AdaLoraConfig",
    "AdaLoraModel",
    "AdaptionPromptConfig",
    "AdaptionPromptModel",
    "ArrowConfig",
    "AutoPeftModel",
    "AutoPeftModelForCausalLM",
    "AutoPeftModelForFeatureExtraction",
    "AutoPeftModelForQuestionAnswering",
    "AutoPeftModelForSeq2SeqLM",
    "AutoPeftModelForSequenceClassification",
    "AutoPeftModelForTokenClassification",
    "BOFTConfig",
    "BOFTModel",
    "BoneConfig",
    "BoneModel",
    "C3AConfig",
    "C3AModel",
    "CPTConfig",
    "CPTEmbedding",
    "EvaConfig",
    "FourierFTConfig",
    "FourierFTModel",
    "HRAConfig",
    "HRAModel",
    "IA3Config",
    "IA3Model",
    "LNTuningConfig",
    "LNTuningModel",
    "LoHaConfig",
    "LoHaModel",
    "LoKrConfig",
    "LoKrModel",
    "LoftQConfig",
    "LoraConfig",
    "LoraModel",
    "LoraRuntimeConfig",
    "MissConfig",
    "MissModel",
    "MultitaskPromptTuningConfig",
    "MultitaskPromptTuningInit",
    "OFTConfig",
    "OFTModel",
    "PeftConfig",
    "PeftMixedModel",
    "PeftModel",
    "PeftModelForCausalLM",
    "PeftModelForFeatureExtraction",
    "PeftModelForQuestionAnswering",
    "PeftModelForSeq2SeqLM",
    "PeftModelForSequenceClassification",
    "PeftModelForTokenClassification",
    "PeftType",
    "PeftWarning",
    "PolyConfig",
    "PolyModel",
    "PrefixEncoder",
    "PrefixTuningConfig",
    "PromptEmbedding",
    "PromptEncoder",
    "PromptEncoderConfig",
    "PromptEncoderReparameterizationType",
    "PromptLearningConfig",
    "PromptTuningConfig",
    "PromptTuningInit",
    "RandLoraConfig",
    "RandLoraModel",
    "RoadConfig",
    "RoadModel",
    "ShiraConfig",
    "ShiraModel",
    "TaskType",
    "TrainableTokensConfig",
    "TrainableTokensModel",
    "VBLoRAConfig",
    "VBLoRAModel",
    "VeraConfig",
    "VeraModel",
    "WaveFTConfig",
    "WaveFTModel",
    "XLoraConfig",
    "XLoraModel",
    "bloom_model_postprocess_past_key_value",
    "cast_mixed_precision_params",
    "create_arrow_model",
    "get_eva_state_dict",
    "get_layer_status",
    "get_model_status",
    "get_peft_config",
    "get_peft_model",
    "get_peft_model_state_dict",
    "initialize_lora_eva_weights",
    "inject_adapter_in_model",
    "load_peft_weights",
    "prepare_model_for_kbit_training",
    "replace_lora_weights_loftq",
    "set_peft_model_state_dict",
    "shift_tokens_right",
]
# Maps the `task_type` string stored in a PEFT config to the PeftModel subclass that handles that task.
MODEL_TYPE_TO_PEFT_MODEL_MAPPING: dict[str, type[PeftModel]] = {
    "SEQ_CLS": PeftModelForSequenceClassification,
    "SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM,
    "CAUSAL_LM": PeftModelForCausalLM,
    "TOKEN_CLS": PeftModelForTokenClassification,
    "QUESTION_ANS": PeftModelForQuestionAnswering,
    "FEATURE_EXTRACTION": PeftModelForFeatureExtraction,
}


class _BaseAutoPeftModel:
    """Base class of the `AutoPeftModel*` loaders.

    Subclasses set `_target_class` (the class used to load the base model) and `_target_peft_class` (the PEFT
    wrapper class). Instances are never created; everything goes through the `from_pretrained` classmethod.
    """

    # class whose `from_pretrained` loads the base model; None for the generic `AutoPeftModel`
    _target_class = None
    # PEFT model class whose `from_pretrained` loads the adapter on top of the base model
    _target_peft_class = None

    def __init__(self, *args, **kwargs):
        # Direct instantiation is always an error; only the classmethod constructors are supported.
        # For consistency with transformers: https://github.com/huggingface/transformers/blob/91d7df58b6537d385e90578dac40204cb550f706/src/transformers/models/auto/auto_factory.py#L400
        raise EnvironmentError(  # noqa: UP024
            f"{self.__class__.__name__} is designed to be instantiated "
            f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or "
            f"`{self.__class__.__name__}.from_config(config)` methods."
        )

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        adapter_name: str = "default",
        is_trainable: bool = False,
        config: Optional[PeftConfig] = None,
        revision: Optional[str] = None,
        **kwargs,
    ):
        r"""
        A wrapper around all the preprocessing steps a user needs to perform in order to load a PEFT model. The kwargs
        are passed along to `PeftConfig` that automatically takes care of filtering the kwargs of the Hub methods and
        the config object init.

        Steps performed: load the PEFT config, resolve the base model class (from `_target_class`, or from the
        config's `auto_mapping` when the config has no task type), load the base model, resize its token embeddings
        if a tokenizer with a larger vocabulary is stored next to the adapter, and finally load the adapter through
        `_target_peft_class.from_pretrained`.

        Args:
            pretrained_model_name_or_path: Local directory or Hub repository id of the PEFT adapter.
            adapter_name: Name under which the adapter is loaded.
            is_trainable: Forwarded to `_target_peft_class.from_pretrained`.
            config: Forwarded to `_target_peft_class.from_pretrained`.
            revision: Revision used when loading the PEFT config and when checking for a tokenizer on the Hub.
            **kwargs: Forwarded to the config, base model and PEFT model loaders.

        Raises:
            ValueError: If the generic `AutoPeftModel` is used with a config that has a task type, if the requested
                class does not match the config's task type, or if the base model class cannot be inferred.
        """
        peft_config = PeftConfig.from_pretrained(pretrained_model_name_or_path, revision=revision, **kwargs)
        base_model_path = peft_config.base_model_name_or_path
        base_model_revision = peft_config.revision

        task_type = getattr(peft_config, "task_type", None)

        if cls._target_class is not None:
            target_class = cls._target_class
        elif cls._target_class is None and task_type is not None:
            # this is only in the case where we use `AutoPeftModel`
            raise ValueError(
                "Cannot use `AutoPeftModel` with a task type, please use a specific class for your task type. (e.g. `AutoPeftModelForCausalLM` for `task_type='CAUSAL_LM'`)"
            )

        if task_type is not None:
            # the class the user asked for must match the task type stored in the config
            expected_target_class = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[task_type]
            if cls._target_peft_class.__name__ != expected_target_class.__name__:
                raise ValueError(
                    f"Expected target PEFT class: {expected_target_class.__name__}, but you have asked for: {cls._target_peft_class.__name__}"
                    " make sure that you are loading the correct model for your task type."
                )
        elif task_type is None and getattr(peft_config, "auto_mapping", None) is not None:
            # no task type: import the base model class named in the config's auto_mapping
            # (this overrides `_target_class` if one was set above)
            auto_mapping = getattr(peft_config, "auto_mapping", None)
            base_model_class = auto_mapping["base_model_class"]
            parent_library_name = auto_mapping["parent_library"]

            parent_library = importlib.import_module(parent_library_name)
            target_class = getattr(parent_library, base_model_class)
        else:
            raise ValueError(
                "Cannot infer the auto class from the config, please make sure that you are loading the correct model for your task type."
            )

        base_model = target_class.from_pretrained(base_model_path, revision=base_model_revision, **kwargs)

        # look for a tokenizer config next to the adapter: first on disk, then on the Hub
        tokenizer_exists = False
        if os.path.exists(os.path.join(pretrained_model_name_or_path, TOKENIZER_CONFIG_NAME)):
            tokenizer_exists = True
        else:
            token = kwargs.get("token", None)
            if token is None:
                token = kwargs.get("use_auth_token", None)

            tokenizer_exists = check_file_exists_on_hf_hub(
                repo_id=pretrained_model_name_or_path,
                filename=TOKENIZER_CONFIG_NAME,
                revision=revision,
                repo_type=kwargs.get("repo_type", None),
                token=token,
            )

        if tokenizer_exists and hasattr(base_model, "get_input_embeddings"):
            # NOTE(review): `revision` (and the token) used for the existence check above are not forwarded to the
            # tokenizer load here -- confirm whether that is intended.
            tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path, trust_remote_code=kwargs.get("trust_remote_code", False)
            )
            embedding_size = base_model.get_input_embeddings().weight.shape[0]
            if len(tokenizer) > embedding_size:
                # only resize if the tokenizer has a larger vocab size than there are embeddings
                base_model.resize_token_embeddings(len(tokenizer))

        # NOTE(review): `revision` is a named parameter and therefore not part of `kwargs`, so it is not passed on
        # to the adapter load below -- confirm whether that is intended.
        return cls._target_peft_class.from_pretrained(
            base_model,
            pretrained_model_name_or_path,
            adapter_name=adapter_name,
            is_trainable=is_trainable,
            config=config,
            **kwargs,
        )


class AutoPeftModel(_BaseAutoPeftModel):
    """Generic loader: has no fixed base model class, so the config must provide `auto_mapping`."""

    _target_class = None
    _target_peft_class = PeftModel


class AutoPeftModelForCausalLM(_BaseAutoPeftModel):
    """Loader pairing `AutoModelForCausalLM` with `PeftModelForCausalLM`."""

    _target_class = AutoModelForCausalLM
    _target_peft_class = PeftModelForCausalLM
class AutoPeftModelForSeq2SeqLM(_BaseAutoPeftModel):
    """Loader pairing `AutoModelForSeq2SeqLM` with `PeftModelForSeq2SeqLM`."""

    _target_class = AutoModelForSeq2SeqLM
    _target_peft_class = PeftModelForSeq2SeqLM


class AutoPeftModelForSequenceClassification(_BaseAutoPeftModel):
    """Loader pairing `AutoModelForSequenceClassification` with `PeftModelForSequenceClassification`."""

    _target_class = AutoModelForSequenceClassification
    _target_peft_class = PeftModelForSequenceClassification


class AutoPeftModelForTokenClassification(_BaseAutoPeftModel):
    """Loader pairing `AutoModelForTokenClassification` with `PeftModelForTokenClassification`."""

    _target_class = AutoModelForTokenClassification
    _target_peft_class = PeftModelForTokenClassification


class AutoPeftModelForQuestionAnswering(_BaseAutoPeftModel):
    """Loader pairing `AutoModelForQuestionAnswering` with `PeftModelForQuestionAnswering`."""

    _target_class = AutoModelForQuestionAnswering
    _target_peft_class = PeftModelForQuestionAnswering


class AutoPeftModelForFeatureExtraction(_BaseAutoPeftModel):
    """Loader pairing the plain `AutoModel` with `PeftModelForFeatureExtraction`."""

    _target_class = AutoModel
    _target_peft_class = PeftModelForFeatureExtraction
# we expect at least these keys to be present in a PEFT adapter_config.json
MIN_EXPECTED_CONFIG_KEYS = {"peft_type"}


def _check_and_remove_unused_kwargs(cls, kwargs):
    """Make PEFT configs forward-compatible by removing unused kwargs that were added in later PEFT versions.

    This assumes that removing the unused kwargs will not affect the default behavior.

    Note that ``kwargs`` is modified in place: every key that is not a parameter of ``cls.__init__`` is deleted.

    Returns the filtered kwargs and the set of removed keys.
    """
    # it's not pretty but eh
    signature_parameters = inspect.signature(cls.__init__).parameters
    unexpected_kwargs = set(kwargs.keys()) - set(signature_parameters.keys())
    for key in unexpected_kwargs:
        del kwargs[key]
    return kwargs, unexpected_kwargs


def _is_dev_version(version: str) -> bool:
    """Return True if ``version`` is a dev version, e.g. ``"0.17.2.dev0"``."""
    return packaging.version.Version(version).dev is not None


def _get_commit_hash(pkg_name: str) -> str | None:
    """Return the VCS commit id that ``pkg_name`` was installed from, or ``None`` if it cannot be determined.

    If the package was installed from a specific commit, e.g. with
    ``pip install git+https://github.com/huggingface/peft.git@<commit>``, the installer records it in the
    distribution's ``direct_url.json``. This does not work for other means of installation, like editable installs.
    """
    try:
        dist = importlib.metadata.distribution(pkg_name)
    except importlib.metadata.PackageNotFoundError:
        return None

    # See: https://packaging.python.org/en/latest/specifications/direct-url/
    for path in dist.files or []:
        if path.name == "direct_url.json":
            # JSON is UTF-8; decode explicitly instead of relying on the platform's default encoding
            direct_url = json.loads(dist.locate_file(path).read_text(encoding="utf-8"))
            vcs_info = direct_url.get("vcs_info")
            if vcs_info and "commit_id" in vcs_info:
                return vcs_info["commit_id"]
    return None
@dataclass
class PeftConfigMixin(PushToHubMixin):
    r"""
    This is the base configuration class for PEFT adapter models. It contains all the methods that are common to all
    PEFT adapter models. This class inherits from [`~transformers.utils.PushToHubMixin`] which contains the methods to
    push your model to the Hub. The method `save_pretrained` will save the configuration of your adapter model in a
    directory. The method `from_pretrained` will load the configuration of your adapter model from a directory.

    Args:
        task_type (Union[[`~peft.utils.config.TaskType`], `str`], *optional*): The type of task.
        peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use.
        auto_mapping (`dict`, *optional*): An auto mapping dict to help retrieve the base model class if needed.
        peft_version (`str`, *optional*): PEFT version; filled in automatically when left as `None`.
    """

    task_type: Optional[TaskType] = field(default=None, metadata={"help": "The type of task."})
    peft_type: Optional[PeftType] = field(default=None, metadata={"help": "The type of PEFT model."})
    auto_mapping: Optional[dict] = field(
        default=None, metadata={"help": "An auto mapping dict to help retrieve the base model class if needed."}
    )
    peft_version: Optional[str] = field(default=None, metadata={"help": "PEFT version, leave empty to auto-fill."})

    def __post_init__(self):
        """Validate `task_type` and fill in `peft_version` when it was not given.

        Raises:
            ValueError: If `task_type` is set but is not a member of `TaskType`.
        """
        # check for invalid task type
        if (self.task_type is not None) and (self.task_type not in list(TaskType)):
            raise ValueError(
                f"Invalid task type: '{self.task_type}'. Must be one of the following task types: {', '.join(TaskType)}."
            )
        # record the version of PEFT that created this config
        if self.peft_version is None:
            self.peft_version = self._get_peft_version()
@staticmethod
def _get_peft_version() -> str:
    """Return the current PEFT version, suffixed with ``@<commit hash>`` for dev versions.

    A dev version alone is ambiguous, so the commit hash is appended when it can be determined and ``UNKNOWN``
    otherwise.
    """
    version = __version__
    if not _is_dev_version(version):
        return version

    try:
        git_hash = _get_commit_hash("peft")
        if git_hash is None:
            git_hash = "UNKNOWN"
    except Exception:
        # Broad exception: We never want to break user code just because the git_hash could not be determined
        warnings.warn(
            "A dev version of PEFT is used but there was an error while trying to determine the commit hash. "
            "Please open an issue: https://github.com/huggingface/peft/issues"
        )
        git_hash = "UNKNOWN"
    version = version + f"@{git_hash}"
    return version


def to_dict(self) -> dict:
    r"""
    Returns the configuration for your adapter model as a dictionary.
    """
    return asdict(self)


def save_pretrained(self, save_directory: str, **kwargs) -> None:
    r"""
    This method saves the configuration of your adapter model in a directory.

    Args:
        save_directory (`str`):
            The directory where the configuration will be saved.
        kwargs (additional keyword arguments, *optional*):
            Only `auto_mapping_dict` is read here; when given, it is stored under the `auto_mapping` key of the
            saved config.

    Raises:
        AssertionError: If `save_directory` is an existing file.
    """
    if os.path.isfile(save_directory):
        raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")

    os.makedirs(save_directory, exist_ok=True)
    auto_mapping_dict = kwargs.pop("auto_mapping_dict", None)

    output_dict = self.to_dict()
    # Sets are not JSON serializable, convert them to lists. Sort where possible so that the saved file is
    # deterministic (set iteration order is not); fall back to the plain order for non-comparable elements.
    for key, value in output_dict.items():
        if isinstance(value, set):
            try:
                output_dict[key] = sorted(value)
            except TypeError:
                output_dict[key] = list(value)

    output_path = os.path.join(save_directory, CONFIG_NAME)

    # Add auto mapping details for custom models.
    if auto_mapping_dict is not None:
        output_dict["auto_mapping"] = auto_mapping_dict

    # save it
    with open(output_path, "w", encoding="utf-8") as writer:
        writer.write(json.dumps(output_dict, indent=2, sort_keys=True))
@classmethod
def from_peft_type(cls, **kwargs):
    r"""
    This method loads the configuration of your adapter model from a set of kwargs.

    The appropriate configuration type is determined by the `peft_type` argument. If `peft_type` is not provided,
    the calling class type is instantiated.

    If the config class rejects some kwargs as unexpected (e.g. because the config was saved with a newer PEFT
    version), those kwargs are dropped with a warning and the instantiation is retried.

    Args:
        kwargs (configuration keyword arguments):
            Keyword arguments passed along to the configuration initialization.

    Raises:
        TypeError: If instantiation fails for a reason other than unexpected keyword arguments, or if the required
            keys (`MIN_EXPECTED_CONFIG_KEYS`) are missing after filtering.
    """
    # Avoid circular dependency .. TODO: fix this with a larger refactor
    from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING

    # TODO: this hack is needed to fix the following issue (on commit 702f937):
    # if someone saves a default config and loads it back with `PeftConfig` class it yields to
    # not loading the correct config class.
    #
    # from peft import AdaLoraConfig, PeftConfig
    # peft_config = AdaLoraConfig()
    # print(peft_config)
    # >>> AdaLoraConfig(peft_type=<PeftType.ADALORA: 'ADALORA'>, auto_mapping=None, base_model_name_or_path=None,
    # revision=None, task_type=None, inference_mode=False, r=8, target_modules=None, lora_alpha=8, lora_dropout=0.0, ...
    #
    # peft_config.save_pretrained("./test_config")
    # peft_config = PeftConfig.from_pretrained("./test_config")
    # print(peft_config)
    # >>> PeftConfig(peft_type='ADALORA', auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=None, inference_mode=False)

    if "peft_type" in kwargs:
        peft_type = kwargs["peft_type"]
        config_cls = PEFT_TYPE_TO_CONFIG_MAPPING[peft_type]
    else:
        config_cls = cls

    try:
        config = config_cls(**kwargs)
    except TypeError as exc:
        # Here we potentially handle forward compatibility. Sometimes new keywords are added to configs, which makes
        # new configs incompatible with older PEFT versions. We catch these and remove them to allow the program to
        # continue, but warn the user about it.

        # First check if the error is due to unexpected keyword arguments, we don't want to accidentally catch
        # other TypeErrors.
        if "got an unexpected keyword argument" not in str(exc):
            raise exc

        filtered_kwargs, unexpected_kwargs = _check_and_remove_unused_kwargs(config_cls, kwargs)
        if not MIN_EXPECTED_CONFIG_KEYS.issubset(set(filtered_kwargs.keys())):
            raise TypeError(
                f"The {cls.__name__} config that is trying to be loaded is missing required keys: "
                f"{MIN_EXPECTED_CONFIG_KEYS}."
            )

        warnings.warn(
            f"Unexpected keyword arguments {sorted(unexpected_kwargs)} for class {config_cls.__name__}, these are "
            "ignored. This probably means that you're loading a configuration file that was saved using a "
            "higher version of the library and additional parameters have been introduced since. It is "
            "highly recommended to upgrade the PEFT version before continuing (e.g. by running `pip install "
            "-U peft`)."
        )
        # retry with the filtered kwargs
        config = config_cls.from_peft_type(**filtered_kwargs)
    return config
We catch these and remove them to allow the program to + # continue, but warn the user about it. + + # First check if the error is due to unexpected keyword arguments, we don't want to accidentally catch + # other TypeErrors. + if "got an unexpected keyword argument" not in str(exc): + raise exc + + filtered_kwargs, unexpected_kwargs = _check_and_remove_unused_kwargs(config_cls, kwargs) + if not MIN_EXPECTED_CONFIG_KEYS.issubset(set(filtered_kwargs.keys())): + raise TypeError( + f"The {cls.__name__} config that is trying to be loaded is missing required keys: " + f"{MIN_EXPECTED_CONFIG_KEYS}." + ) + + warnings.warn( + f"Unexpected keyword arguments {sorted(unexpected_kwargs)} for class {config_cls.__name__}, these are " + "ignored. This probably means that you're loading a configuration file that was saved using a " + "higher version of the library and additional parameters have been introduced since. It is " + "highly recommended to upgrade the PEFT version before continuing (e.g. by running `pip install " + "-U peft`)." + ) + config = config_cls.from_peft_type(**filtered_kwargs) + return config + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, subfolder: Optional[str] = None, **kwargs): + r""" + This method loads the configuration of your adapter model from a directory. + + Args: + pretrained_model_name_or_path (`str`): + The directory or the Hub repository id where the configuration is saved. + kwargs (additional keyword arguments, *optional*): + Additional keyword arguments passed along to the child class initialization. 
+ """ + path = ( + os.path.join(pretrained_model_name_or_path, subfolder) + if subfolder is not None + else pretrained_model_name_or_path + ) + + hf_hub_download_kwargs, class_kwargs, _ = cls._split_kwargs(kwargs) + if "user_agent" not in hf_hub_download_kwargs: + hf_hub_download_kwargs["user_agent"] = http_user_agent() + + if os.path.isfile(os.path.join(path, CONFIG_NAME)): + config_file = os.path.join(path, CONFIG_NAME) + else: + try: + config_file = hf_hub_download( + pretrained_model_name_or_path, CONFIG_NAME, subfolder=subfolder, **hf_hub_download_kwargs + ) + except Exception as exc: + raise ValueError(f"Can't find '{CONFIG_NAME}' at '{pretrained_model_name_or_path}'") from exc + + loaded_attributes = cls.from_json_file(config_file) + kwargs = {**class_kwargs, **loaded_attributes} + kwargs = cls.check_kwargs(**kwargs) + return cls.from_peft_type(**kwargs) + + @classmethod + def from_json_file(cls, path_json_file: str, **kwargs): + r""" + Loads a configuration file from a json file. + + Args: + path_json_file (`str`): + The path to the json file. + """ + with open(path_json_file) as file: + json_object = json.load(file) + + # Sanity check that config does not contain a runtime_config + if "runtime_config" in json_object: + warnings.warn( + "The configuration file contains a `runtime_config` key. This is ignored. Runtime configurations are only valid at runtime." 
@classmethod
def _get_peft_type(
    cls,
    model_id: str,
    **hf_hub_download_kwargs,
):
    r"""
    Return the `peft_type` entry stored in the adapter configuration of `model_id`.

    The config file is first looked up locally (below `model_id`, optionally inside `subfolder`). If it is not found
    there, it is downloaded via `hf_hub_download`.

    Args:
        model_id (`str`):
            Local directory or Hub repository id that holds the configuration file.
        hf_hub_download_kwargs (additional keyword arguments, *optional*):
            Keyword arguments passed along to `hf_hub_download`. A `subfolder` entry is also used for the local
            lookup.

    Returns:
        The value stored under the `peft_type` key of the loaded configuration.

    Raises:
        ValueError: If the configuration file is neither available locally nor downloadable. The original error is
            attached as `__cause__`.
    """
    subfolder = hf_hub_download_kwargs.get("subfolder", None)

    path = os.path.join(model_id, subfolder) if subfolder is not None else model_id

    config_file = os.path.join(path, CONFIG_NAME)
    if not os.path.isfile(config_file):
        try:
            config_file = hf_hub_download(
                model_id,
                CONFIG_NAME,
                **hf_hub_download_kwargs,
            )
        except Exception as exc:
            # Chain the original error (same as `from_pretrained`) so the real cause, e.g. a network or auth
            # failure, stays visible in the traceback.
            raise ValueError(f"Can't find '{CONFIG_NAME}' at '{model_id}'") from exc

    loaded_attributes = cls.from_json_file(config_file)
    return loaded_attributes["peft_type"]
@dataclass
class PromptLearningConfig(PeftConfig):
    """
    This is the base configuration class to store the configuration of [`PrefixTuning`], [`PromptEncoder`], or
    [`PromptTuning`].

    All fields declared here default to `None`.

    Args:
        num_virtual_tokens (`int`): The number of virtual tokens to use.
        token_dim (`int`): The hidden embedding dimension of the base transformer model.
        num_transformer_submodules (`int`): The number of transformer submodules in the base transformer model.
        num_attention_heads (`int`): The number of attention heads in the base transformer model.
        num_layers (`int`): The number of layers in the base transformer model.
        modules_to_save (`list[str]`, *optional*):
            List of extra modules to be set as trainable and saved in the final checkpoint.
    """

    num_virtual_tokens: int = field(default=None, metadata={"help": "Number of virtual tokens"})
    token_dim: int = field(
        default=None, metadata={"help": "The hidden embedding dimension of the base transformer model"}
    )
    num_transformer_submodules: Optional[int] = field(
        default=None, metadata={"help": "Number of transformer submodules"}
    )
    num_attention_heads: Optional[int] = field(default=None, metadata={"help": "Number of attention heads"})
    num_layers: Optional[int] = field(default=None, metadata={"help": "Number of transformer layers"})
    modules_to_save: Optional[list[str]] = field(
        default=None,
        metadata={
            "help": "List of extra modules to be set as trainable and saved in the final checkpoint. "
            "For example, in Sequence Classification or Token Classification tasks, "
            "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved. "
            "The module(s) will be fully fine-tuned."
        },
    )

    @property
    def is_prompt_learning(self) -> bool:
        r"""
        Utility method to check if the configuration is for prompt learning.

        Always `True` for this class and its subclasses, unless a subclass overrides it.
        """
        return True
+""" + +from peft.mapping import inject_adapter_in_model +from peft.tuners.tuners_utils import cast_adapter_dtype, delete_adapter, set_adapter, set_requires_grad +from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict + + +__all__ = [ + "cast_adapter_dtype", + "delete_adapter", + "get_peft_model_state_dict", + "inject_adapter_in_model", + "set_adapter", + "set_peft_model_state_dict", + "set_requires_grad", +] diff --git a/peft/src/peft/helpers.py b/peft/src/peft/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..d748c62e696d57034e4e9fd6458b27febbd9c90c --- /dev/null +++ b/peft/src/peft/helpers.py @@ -0,0 +1,251 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def update_forward_signature(model: PeftModel) -> None:
    """
    Updates the forward signature of the PeftModel to include parents class signature.

    The update is only applied when the current `forward` signature consists solely of `*args` and `**kwargs`. The
    new `forward` is bound to this `model` instance only; the class-level `forward` is left untouched so that other
    instances are not affected.

    Args:
        model (`PeftModel`): Peft model to update the forward signature

    Example:

    ```python
    >>> from transformers import WhisperForConditionalGeneration
    >>> from peft import get_peft_model, LoraConfig, update_forward_signature

    >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
    >>> peft_config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1, target_modules=["q_proj", "v_proj"])

    >>> peft_model = get_peft_model(model, peft_config)
    >>> update_forward_signature(peft_model)
    ```
    """
    from types import FunctionType

    # Only update signature when the current forward signature only has *args and **kwargs
    current_signature = inspect.signature(model.forward)
    if not (
        len(current_signature.parameters) == 2
        and "args" in current_signature.parameters
        and "kwargs" in current_signature.parameters
    ):
        return

    original = model.forward.__func__
    if isinstance(original, FunctionType):
        # `copy.deepcopy` treats plain functions as atomic and returns the very same object, so wrapping a
        # "deepcopy" would overwrite the metadata of the class-level `forward` shared by all instances. Build a
        # genuinely new function object that reuses the same code, globals, defaults and closure instead.
        forward = FunctionType(
            original.__code__,
            original.__globals__,
            original.__name__,
            original.__defaults__,
            original.__closure__,
        )
        forward.__kwdefaults__ = dict(original.__kwdefaults__) if original.__kwdefaults__ else None
        forward.__qualname__ = original.__qualname__
        forward.__module__ = original.__module__
        forward.__dict__.update(original.__dict__)
    else:
        # Not a plain Python function: keep the previous copying behaviour.
        forward = deepcopy(original)

    update_wrapper(
        forward, type(model.get_base_model()).forward, assigned=("__doc__", "__name__", "__annotations__")
    )
    model.forward = MethodType(forward, model)
def update_signature(model: PeftModel, method: str = "all") -> None:
    """
    Refresh the `forward` and/or `generate` signature of a PeftModel from its base model class.

    Args:
        model (`PeftModel`): Peft model whose method signature(s) should be updated.
        method (`str`): Which signature to update, one of "forward", "generate" or "all".

    Raises:
        ValueError: If `method` is not one of the supported values.

    Example:
    ```python
    >>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
    >>> from peft import get_peft_model, LoraConfig, TaskType, update_signature

    >>> model_name_or_path = "bigscience/mt0-large"
    >>> tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    >>> model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)

    >>> peft_config = LoraConfig(
    ...     task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
    ... )
    >>> peft_model = get_peft_model(model, peft_config)
    >>> update_signature(peft_model)
    >>> help(peft_model.generate)
    ```
    """
    # Reject unknown selectors up front, before touching the model.
    if method not in ("forward", "generate", "all"):
        raise ValueError(f"method {method} is not supported please choose one of ['forward', 'generate', 'all']")

    # "all" runs both updates, forward first.
    if method in ("forward", "all"):
        update_forward_signature(model)
    if method in ("generate", "all"):
        update_generate_signature(model)
@contextmanager
def disable_input_dtype_casting(model: nn.Module, active: bool = True):
    """
    Context manager that switches off casting of inputs to the weight dtype on PEFT layers.

    Inside the context, `cast_input_dtype_enabled` is set to `False` on every `BaseTunerLayer` found in `model`. The
    previous values are written back when the context exits, also when an exception is raised.

    Parameters:
        model (nn.Module):
            The model containing PEFT modules whose input dtype casting is to be adjusted.
        active (bool):
            Whether the context manager is active (default) or inactive. When inactive, nothing is changed.

    """
    # Background: normally the dtype of weight and input have to match, hence the cast. In certain circumstances
    # this is already taken care of by forward hooks, e.g. with layerwise casting in diffusers. There, the cast done
    # by PEFT gets in the way, which is why it can be turned off.
    if active:
        saved_flags = {}
        for layer_name, layer in model.named_modules():
            if isinstance(layer, BaseTunerLayer):
                saved_flags[layer_name] = layer.cast_input_dtype_enabled
                layer.cast_input_dtype_enabled = False

        try:
            yield
        finally:
            # Walk the model again and restore by module name; only layers that were recorded are touched.
            for layer_name, layer in model.named_modules():
                if isinstance(layer, BaseTunerLayer) and layer_name in saved_flags:
                    layer.cast_input_dtype_enabled = saved_flags[layer_name]
    else:
        yield
@lru_cache
def is_auto_gptq_available() -> bool:
    """
    Check whether a supported version of `auto_gptq` is installed.

    Returns:
        `bool`: `True` if `auto_gptq` is installed in version 0.5.0 or higher, `False` if it is not installed.

    Raises:
        ImportError: If `auto_gptq` is installed but older than the minimum supported version.
    """
    if importlib.util.find_spec("auto_gptq") is None:
        # Explicit `False` instead of falling off the end and returning `None`.
        return False

    AUTOGPTQ_MINIMUM_VERSION = packaging.version.parse("0.5.0")
    version_autogptq = packaging.version.parse(importlib_metadata.version("auto_gptq"))
    if version_autogptq < AUTOGPTQ_MINIMUM_VERSION:
        raise ImportError(
            f"Found an incompatible version of auto-gptq. Found version {version_autogptq}, "
            f"but only versions above {AUTOGPTQ_MINIMUM_VERSION} are supported"
        )
    return True
@lru_cache
def is_torch_tpu_available(check_device=True):
    "Checks if `torch_xla` is installed and potentially if a TPU is in the environment"
    if importlib.util.find_spec("torch_xla") is None:
        return False
    if not check_device:
        # Being installed is enough when the caller does not ask for a device check.
        return True

    # Probing for an `xla_device` raises a RuntimeError when none can be found.
    try:
        import torch_xla.core.xla_model as xm

        _ = xm.xla_device()
    except RuntimeError:
        return False
    return True
@lru_cache
def is_xpu_available(check_device=False):
    """
    Checks if XPU acceleration is available and potentially if a XPU is in the environment

    Args:
        check_device (`bool`, defaults to `False`):
            If `True`, additionally query the XPU device count before reporting availability.

    Returns:
        `bool`: `False` on macOS or when the installed torch has no `xpu` attribute, otherwise the result of
        `torch.xpu.is_available()` (or `False` if the device query raises a `RuntimeError`).
    """
    if platform.system() == "Darwin":
        return False

    # Guard both code paths: torch builds without XPU support have no `torch.xpu` attribute, which would otherwise
    # surface as an AttributeError when `check_device=True`.
    if not hasattr(torch, "xpu"):
        return False

    if check_device:
        try:
            # Will raise a RuntimeError if no XPU is found
            _ = torch.xpu.device_count()
            return torch.xpu.is_available()
        except RuntimeError:
            return False
    return torch.xpu.is_available()
def inject_adapter_in_model(
    peft_config: PeftConfig,
    model: torch.nn.Module,
    adapter_name: str = "default",
    low_cpu_mem_usage: bool = False,
    state_dict: Optional[dict[str, torch.Tensor]] = None,
) -> torch.nn.Module:
    r"""
    Create PEFT layers and inject them into the model in-place.

    Currently the API does not support prompt learning methods and adaption prompt.

    This function is similar to [`get_peft_model`] but it does not return a [`PeftModel`] instance. Instead, it returns
    the original, mutated instance of the passed model.

    Args:
        peft_config (`PeftConfig`):
            Configuration object containing the parameters of the PEFT model.
        model (`torch.nn.Module`):
            The input model where the adapter will be injected.
        adapter_name (`str`, `optional`, defaults to `"default"`):
            The name of the adapter to be injected, if not provided, the default adapter name is used ("default").
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            Create empty adapter weights on meta device. Useful to speed up the loading process.
        state_dict (`dict`, *optional*, defaults to `None`)
            If a `state_dict` is passed here, the adapters will be injected based on the entries of the state_dict.
            This can be useful when the exact `target_modules` of the PEFT method is unknown, for instance because the
            checkpoint was created without meta data. Note that the values from the `state_dict` are not used, only the
            keys are used to determine the correct layers that should be adapted.

    Returns:
        `torch.nn.Module`: The `model` attribute of the instantiated tuner.

    Raises:
        ValueError: If the config is a prompt learning or adaption prompt config, or if no tuner class is registered
            for `peft_config.peft_type`.
    """
    if peft_config.is_prompt_learning or peft_config.is_adaption_prompt:
        # The message names this function (it used to name `create_and_replace`, which callers never invoke here).
        raise ValueError("`inject_adapter_in_model` does not support prompt learning and adaption prompt yet.")

    if peft_config.peft_type not in PEFT_TYPE_TO_TUNER_MAPPING:
        raise ValueError(
            f"`inject_adapter_in_model` does not support {peft_config.peft_type} yet. Please use `get_peft_model`."
        )

    tuner_cls = PEFT_TYPE_TO_TUNER_MAPPING[peft_config.peft_type]

    # Instantiating the tuner class injects the adapter layers into the model's modules.
    peft_model = tuner_cls(
        model, peft_config, adapter_name=adapter_name, low_cpu_mem_usage=low_cpu_mem_usage, state_dict=state_dict
    )

    return peft_model.model
def get_peft_model(
    model: PreTrainedModel,
    peft_config: PeftConfig,
    adapter_name: str = "default",
    mixed: bool = False,
    autocast_adapter_dtype: bool = True,
    revision: Optional[str] = None,
    low_cpu_mem_usage: bool = False,
) -> PeftModel | PeftMixedModel:
    """
    Returns a Peft model object from a model and a config, where the model will be modified in-place.

    Note that `peft_config` is modified as well: its `base_model_name_or_path` is overwritten with the model's
    `name_or_path` (or `None` if the model has none), and its `revision` is overwritten when `revision` is passed.

    Args:
        model ([`transformers.PreTrainedModel`]):
            Model to be wrapped.
        peft_config ([`PeftConfig`]):
            Configuration object containing the parameters of the Peft model.
        adapter_name (`str`, `optional`, defaults to `"default"`):
            The name of the adapter to be injected, if not provided, the default adapter name is used ("default").
        mixed (`bool`, `optional`, defaults to `False`):
            Whether to allow mixing different (compatible) adapter types.
        autocast_adapter_dtype (`bool`, *optional*):
            Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights
            using float16 or bfloat16 to float32, as this is typically required for stable training, and only affect
            select PEFT tuners.
        revision (`str`, `optional`, defaults to `main`):
            The revision of the base model. If this isn't set, the saved peft model will load the `main` revision for
            the base model
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            Create empty adapter weights on meta device. Useful to speed up the loading process. Leave this setting as
            False if you intend on training the model, unless the adapter weights will be replaced by different weights
            before training starts.

    Returns:
        A [`PeftMixedModel`] if `mixed=True`. Otherwise a plain [`PeftModel`] when the config's `task_type` has no
        entry in `MODEL_TYPE_TO_PEFT_MODEL_MAPPING` and the config is not a prompt learning config, else an instance
        of the class registered for that `task_type`.
    """
    model_config = BaseTuner.get_model_config(model)
    # Remember the name stored in the config before overwriting it, so that a rename can be reported below.
    old_name = peft_config.base_model_name_or_path
    new_name = model.__dict__.get("name_or_path", None)
    peft_config.base_model_name_or_path = new_name

    # Especially in notebook environments there could be a case that a user wants to experiment with different
    # configuration values. However, it is likely that there won't be any changes for new configs on an already
    # initialized PEFT model. The best we can do is warn the user about it.
    if any(isinstance(module, BaseTunerLayer) for module in model.modules()):
        warnings.warn(
            "You are trying to modify a model with PEFT for a second time. If you want to reload the model with a "
            "different config, make sure to call `.unload()` before."
        )

    if (old_name is not None) and (old_name != new_name):
        warnings.warn(
            f"The PEFT config's `base_model_name_or_path` was renamed from '{old_name}' to '{new_name}'. "
            "Please ensure that the correct base model is loaded when loading this checkpoint."
        )

    # An explicitly passed revision takes precedence over the one stored in the config.
    if revision is not None:
        if peft_config.revision is not None and peft_config.revision != revision:
            warnings.warn(
                f"peft config has already set base model revision to {peft_config.revision}, overwriting with revision {revision}"
            )
        peft_config.revision = revision

    if (
        (isinstance(peft_config, PEFT_TYPE_TO_CONFIG_MAPPING["LORA"]))
        and (peft_config.init_lora_weights == "eva")
        and not low_cpu_mem_usage
    ):
        warnings.warn(
            "lora with eva initialization used with low_cpu_mem_usage=False. "
            "Setting low_cpu_mem_usage=True can improve the maximum batch size possible for eva initialization."
        )

    # Warn if the adapter name is a substring of the prefix registered for this PEFT type.
    prefix = PEFT_TYPE_TO_PREFIX_MAPPING.get(peft_config.peft_type)
    if prefix and adapter_name in prefix:
        warnings.warn(
            f"Adapter name '{adapter_name}' should not be contained in the prefix '{prefix}'. "
            "This may lead to reinitialization of the adapter weights during loading."
        )

    if mixed:
        # note: PeftMixedModel does not support autocast_adapter_dtype, so don't pass it
        return PeftMixedModel(model, peft_config, adapter_name=adapter_name)

    # We explicitly exclude prompt learning here since prompt learning is specific to the task and needs special
    # handling in the PEFT model's forward method.
    if peft_config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not peft_config.is_prompt_learning:
        return PeftModel(
            model,
            peft_config,
            adapter_name=adapter_name,
            autocast_adapter_dtype=autocast_adapter_dtype,
            low_cpu_mem_usage=low_cpu_mem_usage,
        )

    # Prompt learning configs are first completed with values from the base model's config.
    if peft_config.is_prompt_learning:
        peft_config = _prepare_prompt_learning_config(peft_config, model_config)
    return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](
        model,
        peft_config,
        adapter_name=adapter_name,
        autocast_adapter_dtype=autocast_adapter_dtype,
        low_cpu_mem_usage=low_cpu_mem_usage,
    )
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import os +from contextlib import contextmanager +from typing import Any, Optional, Union + +import torch +from accelerate.hooks import remove_hook_from_submodules +from torch import nn +from transformers.utils import PushToHubMixin + +from peft.utils.constants import DUMMY_MODEL_CONFIG + +from .config import PeftConfig +from .peft_model import PeftModel +from .tuners import MixedModel +from .utils import _set_adapter, _set_trainable + + +def _prepare_model_for_gradient_checkpointing(model: nn.Module) -> None: + r""" + Prepares the model for gradient checkpointing if necessary + """ + # Note: same as PeftModel._prepare_model_for_gradient_checkpointing + if not getattr(model, "is_gradient_checkpointing", True): + return model + + if not ( + getattr(model, "is_loaded_in_8bit", False) + or getattr(model, "is_loaded_in_4bit", False) + or getattr(model, "is_quantized", False) + ): + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + elif hasattr(model, "get_input_embeddings"): + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + +def _check_config_compatible(peft_config: PeftConfig) -> None: + from .tuners.mixed import COMPATIBLE_TUNER_TYPES + + if peft_config.peft_type not in COMPATIBLE_TUNER_TYPES: + raise ValueError( + f"The provided `peft_type` '{peft_config.peft_type.value}' is not compatible with the `PeftMixedModel`. " + f"Compatible types are: {COMPATIBLE_TUNER_TYPES}" + ) + + +class PeftMixedModel(PushToHubMixin, torch.nn.Module): + """ + PeftMixedModel for loading mixing different types of adapters for inference. + + This class does not support loading/saving, and it shouldn't usually be initialized directly. 
Instead, use + `get_peft_model` with the argument `mixed=True`. + + > [!TIP] > Read the [Mixed adapter types](https://huggingface.co/docs/peft/en/developer_guides/mixed_models) guide + to learn > more about using different adapter types. + + Example: + + ```py + >>> base_model = ... # load the base model, e.g. from transformers + >>> peft_model = PeftMixedModel.from_pretrained(base_model, path_to_adapter1, "adapter1").eval() + >>> peft_model.load_adapter(path_to_adapter2, "adapter2") + >>> peft_model.set_adapter(["adapter1", "adapter2"]) # activate both adapters + >>> peft_model(data) # forward pass using both adapters + ``` + + Args: + model (`torch.nn.Module`): + The model to be tuned. + config (`PeftConfig`): + The config of the model to be tuned. The adapter type must be compatible. + adapter_name (`str`, `optional`, defaults to `"default"`): + The name of the first adapter. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the loading process. + """ + + def __init__(self, model: nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> None: + super().__init__() + _check_config_compatible(peft_config) + _prepare_model_for_gradient_checkpointing(model) + self.modules_to_save = None + self.base_model = MixedModel(model, {adapter_name: peft_config}, adapter_name) + self.set_modules_to_save(peft_config, adapter_name) + + self.config = getattr(model, "config", DUMMY_MODEL_CONFIG) + + # the `pretraining_tp` is set for some models to simulate Tensor Parallelism during inference to avoid + # numerical differences, https://github.com/pytorch/pytorch/issues/76232 - to avoid any unexpected + # behavior we disable that in this line. 
+ if hasattr(self.base_model, "config") and hasattr(self.base_model.config, "pretraining_tp"): + self.base_model.config.pretraining_tp = 1 + + @property + def peft_config(self) -> dict[str, PeftConfig]: + return self.base_model.peft_config + + @property + def active_adapter(self) -> str: + return self.base_model.active_adapter + + @property + def active_adapters(self) -> list[str]: + return self.base_model.active_adapters + + def get_nb_trainable_parameters(self): + r""" + Returns the number of trainable parameters and number of all parameters in the model. + """ + # note: same as PeftModel.get_nb_trainable_parameters + trainable_params = 0 + all_param = 0 + for _, param in self.named_parameters(): + num_params = param.numel() + # if using DS Zero 3 and the weights are initialized empty + if num_params == 0 and hasattr(param, "ds_numel"): + num_params = param.ds_numel + + # Due to the design of 4bit linear layers from bitsandbytes + # one needs to multiply the number of parameters by 2 to get + # the correct number of parameters + if param.__class__.__name__ == "Params4bit": + num_params = num_params * 2 + + all_param += num_params + if param.requires_grad: + trainable_params += num_params + + return trainable_params, all_param + + def print_trainable_parameters(self): + """ + Prints the number of trainable parameters in the model. + + Note: print_trainable_parameters() uses get_nb_trainable_parameters() which is different from + num_parameters(only_trainable=True) from huggingface/transformers. get_nb_trainable_parameters() returns + (trainable parameters, all parameters) of the Peft Model which includes modified backbone transformer model. + For techniques like LoRA, the backbone transformer model is modified in place with LoRA modules. However, for + prompt tuning, the backbone transformer model is unmodified. num_parameters(only_trainable=True) returns number + of trainable parameters of the backbone transformer model which can be different. 
+ """ + # note: same as PeftModel.print_trainable_parameters + trainable_params, all_param = self.get_nb_trainable_parameters() + + print( + f"trainable params: {trainable_params:,d} || " + f"all params: {all_param:,d} || " + f"trainable%: {100 * trainable_params / all_param:.4f}" + ) + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + if name == "base_model": # see #1892: prevent infinite recursion if class is not initialized + raise + return getattr(self.base_model, name) + + def forward(self, *args: Any, **kwargs: Any): + """ + Forward pass of the model. + """ + return self.base_model(*args, **kwargs) + + def generate(self, *args: Any, **kwargs: Any): + """ + Generate output. + """ + return self.base_model.generate(*args, **kwargs) + + @contextmanager + def disable_adapter(self): + """ + Disables the adapter module. + """ + try: + self.base_model.disable_adapter_layers() + yield + finally: + self.base_model.enable_adapter_layers() + + def add_adapter(self, adapter_name: str, peft_config: PeftConfig, low_cpu_mem_usage: bool = False) -> None: + """ + Add an adapter to the model based on the passed configuration. + + This adapter is not trained. To load a trained adapter, check out [`PeftModel.load_adapter`]. + + The name for the new adapter should be unique. + + The new adapter is not automatically set as the active adapter. Use [`PeftModel.set_adapter`] to set the active + adapter. + + Args: + adapter_name (`str`): + The name of the adapter to be added. + peft_config ([`PeftConfig`]): + The configuration of the adapter to be added. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the process when loading saved + adapters. 
+ + > [!TIP] > Don't use `low_cpu_mem_usage=True` when creating a new PEFT adapter for training (training + is untested > and discouraged for PeftMixedModel in general). + """ + _check_config_compatible(peft_config) + + try: + self.peft_config[adapter_name] = peft_config + self.base_model.inject_adapter(self, adapter_name, low_cpu_mem_usage=low_cpu_mem_usage) + except Exception: # something went wrong, roll back + if adapter_name in self.peft_config: + del self.peft_config[adapter_name] + raise + + self.set_modules_to_save(peft_config, adapter_name) + + def set_modules_to_save(self, peft_config: PeftConfig, adapter_name: str) -> None: + if (modules_to_save := getattr(peft_config, "modules_to_save", None)) is None: + return + + if self.modules_to_save is None: + self.modules_to_save = set(modules_to_save) + else: + self.modules_to_save.update(modules_to_save) + _set_trainable( + self, + adapter_name, + module_names=getattr(peft_config, "modules_to_save", None), + inference_mode=peft_config.inference_mode, + ) + + def set_adapter(self, adapter_name: Union[str, list[str]], inference_mode: bool = False) -> None: + """ + Sets the active adapter(s) for the model. + + Note that the order in which the adapters are applied during the forward pass may not be the same as the order + in which they are passed to this function. Instead, the order during the forward pass is determined by the + order in which the adapters were loaded into the model. The active adapters only determine which adapters are + active during the forward pass, but not the order in which they are applied. + + Additionally, this function will set the specified adapter to trainable (i.e., requires_grad=True) unless + inference_mode is True. + + Args: + adapter_name (str, list[str]): + The name(s) of the adapter(s) to set as active + inference_mode (bool, optional): + Whether the activated adapter should be frozen (i.e. `requires_grad=False`). Default is False. 
+ """ + if isinstance(adapter_name, str): + adapter_name = [adapter_name] + + mismatched = set(adapter_name) - set(self.peft_config.keys()) + if mismatched: + raise ValueError( + f"Adapter(s) {sorted(mismatched)} not found, available adapters: {sorted(self.peft_config.keys())}" + ) + + self.base_model.set_adapter(adapter_name, inference_mode=inference_mode) + _set_adapter(self, adapter_name, inference_mode=inference_mode) + + def delete_adapter(self, adapter_name: Union[str, list[str]]) -> None: + if isinstance(adapter_name, str): + adapter_name = [adapter_name] + + mismatched = set(adapter_name) - set(self.peft_config.keys()) + if mismatched: + raise ValueError( + f"Adapter(s) {sorted(mismatched)} not found, available adapters: {sorted(self.peft_config.keys())}" + ) + + self.base_model.delete_adapter(adapter_name) + + def merge_and_unload(self, *args: Any, **kwargs: Any): + r""" + This method merges the adapter layers into the base model. This is needed if someone wants to use the base + model as a standalone model. + + Args: + progressbar (`bool`): + whether to show a progressbar indicating the unload and merge process + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + return self.base_model.merge_and_unload(*args, **kwargs) + + def unload(self, *args: Any, **kwargs: Any): + """ + Gets back the base model by removing all the adapter modules without merging. This gives back the original base + model. 
+ """ + return self.base_model.unload(*args, **kwargs) + + def get_layer_status(self): + raise TypeError(f"get_layer_status is not supported for {self.__class__.__name__}.") + + def get_model_status(self): + raise TypeError(f"get_model_status is not supported for {self.__class__.__name__}.") + + @classmethod + def _split_kwargs(cls, kwargs: dict[str, Any]): + return PeftModel._split_kwargs(kwargs) + + def _check_new_adapter_config(self, peft_config: PeftConfig, is_trainable: bool) -> None: + return PeftModel._check_new_adapter_config(self, peft_config, is_trainable=is_trainable) + + def load_adapter(self, model_id: str, adapter_name: str, *args: Any, **kwargs: Any): + """ + Load a trained adapter into the model. + + The name for the new adapter should be unique. + + The new adapter is not automatically set as the active adapter. Use [`PeftModel.set_adapter`] to set the active + adapter. + + Args: + adapter_name (`str`): + The name of the adapter to be added. + peft_config ([`PeftConfig`]): + The configuration of the adapter to be added. + is_trainable (`bool`, *optional*, defaults to `False`): + Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be + used for inference. + torch_device (`str`, *optional*, defaults to None): + The device to load the adapter on. If `None`, the device will be inferred. + autocast_adapter_dtype (`bool`, *optional*, defaults to `True`): + Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter + weights using float16 and bfloat16 to float32, as this is typically required for stable training, and + only affect select PEFT tuners. + ephemeral_gpu_offload (`bool`, *optional*, defaults to `False`): + Whether to use ephemeral GPU offloading for partially loaded modules. Defaults to `False`. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device before loading the saved weights. 
Useful to speed up the + process. + kwargs: (`optional`): + Additional arguments to modify the way the adapter is loaded, e.g. the token for Hugging Face Hub. + """ + # the low_cpu_mem_usage option is handled through kwargs + output = PeftModel.load_adapter(self, model_id, adapter_name, *args, **kwargs) + # TODO: not quite clear why this is necessary but tests fail without it + self.set_adapter(self.active_adapters) + return output + + def create_or_update_model_card(self, output_dir: str): + raise NotImplementedError(f"Model card creation is not supported for {self.__class__.__name__} (yet).") + + def save_pretrained( + self, + save_directory: str, + safe_serialization: bool = False, + selected_adapters: Optional[list[str]] = None, + **kwargs: Any, + ): + raise NotImplementedError(f"Saving is not supported for {self.__class__.__name__} (yet).") + + @classmethod + def from_pretrained( + cls, + model: nn.Module, + model_id: str | os.PathLike, + adapter_name: str = "default", + is_trainable: bool = False, + config: Optional[PeftConfig] = None, + **kwargs: Any, + ): + r""" + Instantiate a PEFT mixed model from a pretrained model and loaded PEFT weights. + + Note that the passed `model` may be modified inplace. + + Args: + model (`nn.Module`): + The model to be adapted. + model_id (`str` or `os.PathLike`): + The name of the PEFT configuration to use. Can be either: + - A string, the `model id` of a PEFT configuration hosted inside a model repo on the Hugging Face + Hub. + - A path to a directory containing a PEFT configuration file saved using the `save_pretrained` + method (`./my_peft_config_directory/`). + adapter_name (`str`, *optional*, defaults to `"default"`): + The name of the adapter to be loaded. This is useful for loading multiple adapters. + is_trainable (`bool`, *optional*, defaults to `False`): + Whether the adapter should be trainable or not. 
If `False`, the adapter will be frozen and use for + inference + config ([`~peft.PeftConfig`], *optional*): + The configuration object to use instead of an automatically loaded configuration. This configuration + object is mutually exclusive with `model_id` and `kwargs`. This is useful when configuration is already + loaded before calling `from_pretrained`. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device before loading the saved weights. Useful to speed up the + process. + kwargs: (`optional`): + Additional keyword arguments passed along to the specific PEFT configuration class. + """ + # note: adapted from PeftModel.from_pretrained + from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING, PEFT_TYPE_TO_MIXED_MODEL_MAPPING + + # load the config + if config is None: + hf_kwargs = { + "subfolder": kwargs.get("subfolder", None), + "revision": kwargs.get("revision", None), + "cache_dir": kwargs.get("cache_dir", None), + "token": kwargs.get("token", None), + } + if use_auth_token := kwargs.get("use_auth_token", None): + hf_kwargs["use_auth_token"] = use_auth_token + config = PEFT_TYPE_TO_CONFIG_MAPPING[PeftConfig._get_peft_type(model_id, **hf_kwargs)].from_pretrained( + model_id, **kwargs + ) + elif isinstance(config, PeftConfig): + config.inference_mode = not is_trainable + else: + raise ValueError(f"The input config must be a PeftConfig, got {config.__class__}") + + # note: this is different from PeftModel.from_pretrained + if config.peft_type not in PEFT_TYPE_TO_MIXED_MODEL_MAPPING: + raise ValueError(f"Adapter of type {config.peft_type} is not supported for mixed models.") + + if (getattr(model, "hf_device_map", None) is not None) and len( + set(model.hf_device_map.values()).intersection({"cpu", "disk"}) + ) > 0: + remove_hook_from_submodules(model) + + if config.is_prompt_learning and is_trainable: + # note: should not be possible to reach, but just in case + raise ValueError("Cannot set a prompt learning 
adapter to trainable when loading pretrained adapter.") + else: + config.inference_mode = not is_trainable + + # note: this is different from PeftModel.from_pretrained, we always return a PeftMixedModel + model = cls(model, config, adapter_name) + # the low_cpu_mem_usage option is handled through kwargs + model.load_adapter(model_id, adapter_name, is_trainable=is_trainable, **kwargs) + return model diff --git a/peft/src/peft/optimizers/__init__.py b/peft/src/peft/optimizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0e8821f45db8e2dcf8f26fd8b38f8f90a28f5c09 --- /dev/null +++ b/peft/src/peft/optimizers/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .lorafa import create_lorafa_optimizer +from .loraplus import create_loraplus_optimizer + + +__all__ = ["create_lorafa_optimizer", "create_loraplus_optimizer"] diff --git a/peft/src/peft/optimizers/lorafa.py b/peft/src/peft/optimizers/lorafa.py new file mode 100644 index 0000000000000000000000000000000000000000..61e331ed100174b4ac645c08a07b1619328f17d5 --- /dev/null +++ b/peft/src/peft/optimizers/lorafa.py @@ -0,0 +1,257 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This module contains the implementation of the LoRA-FA optimizer. +""" + +from __future__ import annotations + +import math +from collections.abc import Iterable +from typing import Callable + +import torch +import torch.nn as nn +from accelerate.utils.imports import is_bf16_available +from torch import autocast +from torch.optim import Optimizer + +from ..peft_model import PeftModel +from ..utils.other import infer_device + + +class LoraFAOptimizer(Optimizer): + """ + Implements the LoRA-FA optimizer designed specifically for training Low-Rank Adaptation (LoRA) parameters + efficiently. Note that LoraFAOptimizer is based on adamw-hf in transformers, with only LoRA part modified. Without + LoRA it will fall back to adamw-hf. + + Args: + params (Iterable[nn.parameter.Parameter]): Parameters to optimize. + lr (float, optional): Learning rate (default: 1e-3). + betas (Tuple[float, float], optional): + Coefficients for computing running averages of gradient and squared gradient (default: (0.9, 0.999)). + eps (float, optional): Term added to denominator to improve numerical stability (default: 1e-6). + weight_decay (float, optional): Weight decay (L2 penalty) (default: 0.0). + correct_bias (bool, optional): Whether to apply bias correction as in original Adam (default: True). + + Args in sub-function step: + closure (Callable, optional): A closure that reevaluates the model and returns the loss. 
+ + Reference: + - LoRA-FA: https://huggingface.co/papers/2308.03303 + """ + + def __init__( + self, + params: Iterable[nn.parameter.Parameter], + lr: float = 1e-3, + betas: tuple[float, float] = (0.9, 0.999), + eps: float = 1e-6, + weight_decay: float = 0.0, + correct_bias: bool = True, + ): + if lr < 0.0: + raise ValueError(f"Invalid learning rate: {lr} - should be >= 0.0") + if not 0.0 <= betas[0] < 1.0: + raise ValueError(f"Invalid beta parameter: {betas[0]} - should be in [0.0, 1.0)") + if not 0.0 <= betas[1] < 1.0: + raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0)") + if not 0.0 <= eps: + raise ValueError(f"Invalid epsilon value: {eps} - should be >= 0.0") + defaults = { + "lr": lr, + "betas": betas, + "eps": eps, + "weight_decay": weight_decay, + "correct_bias": correct_bias, + } + super().__init__(params, defaults) + + @torch.no_grad() + def step(self, closure: Callable = None): + """ + Performs a single optimization step. + + Arguments: + closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + scaling_factor = group["scaling_factor"] + param_list = [] + name_list = [] + for p, n in zip(group["params"], group["names"]): + # Skip non-lora no-grad module, since we need lora_A which is no-grad. 
+ if "lora" not in n and p.grad is None: + continue + grad = p.grad + + if "lora" in n: + param_list.append(p) + name_list.append(n) + if len(param_list) == 2: + name = n[: n.find("lora")] + "lora" + elif len(param_list) == 1: + continue + else: + name = n + # param_list contains a pair of A and B adapters + # i.e., param_list -> [A,B] + + state = self.state[name] + # State initialization + if len(state) == 0: + if len(param_list) == 2: + state["step"] = 0 + # Exponential moving average of gradient values + state["exp_avg_B"] = torch.zeros_like(param_list[1]) + # Exponential moving average of squared gradient values + state["exp_avg_sq_B"] = torch.zeros_like(param_list[1]) + else: + state["step"] = 0 + # Exponential moving average of gradient values + state["exp_avg"] = torch.zeros_like(p) + # Exponential moving average of squared gradient values + state["exp_avg_sq"] = torch.zeros_like(p) + + # Below is the LoRA-FA part + # 1. In this part, we optimize the gradient of B as: + # g^B = \left(\frac{r}{\alpha}\right)^2 (A^\top A)^{-1} g_{\text{LoRA-FA}}^B + # to min the func as described below: + # \min_{g^B} \|\hat{g}_\text{LoRA-FA} - g\|_F^2 + # 2. 
After the gradient of B is ready, update the optimizer state + if len(param_list) == 2: + A = param_list[0] + B = param_list[1] + grad_B_orin = B.grad + + # projection + delta = 1e-8 + + # computing the inverse matrix + AA_T = A @ A.T + AA_T_inv = torch.linalg.pinv(AA_T + delta * torch.eye(A.shape[0]).to(A.device)) + + device_type = infer_device() + + if is_bf16_available(): + with autocast(device_type=device_type, dtype=torch.bfloat16): + grad_B = (1 / scaling_factor**2) * (grad_B_orin @ AA_T_inv) + else: + grad_B = (1 / scaling_factor**2) * (grad_B_orin @ AA_T_inv) + + if grad_B.dtype != B.grad.dtype: + grad_B = grad_B.to(B.grad.dtype) + + exp_avg_B, exp_avg_sq_B = state["exp_avg_B"], state["exp_avg_sq_B"] + beta1, beta2 = group["betas"] + state["step"] += 1 + exp_avg_B.mul_(beta1).add_(grad_B, alpha=(1.0 - beta1)) + exp_avg_sq_B.mul_(beta2).addcmul_(grad_B, grad_B, value=1.0 - beta2) + + denom_B = exp_avg_sq_B.sqrt().add_(group["eps"]) + step_size = group["lr"] + if group["correct_bias"]: # No bias correction for Bert + bias_correction1 = 1.0 - beta1 ** state["step"] + bias_correction2 = 1.0 - beta2 ** state["step"] + step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 + B.addcdiv_(exp_avg_B, denom_B, value=-step_size) + if group["weight_decay"] > 0.0: + B.add_(B, alpha=(-group["lr"] * group["weight_decay"])) + param_list = [] + name_list = [] + + # Below is the original AdamW + else: + exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] + beta1, beta2 = group["betas"] + + state["step"] += 1 + + # Decay the first and second moment running average coefficient + # In-place operations to update the averages at the same time + exp_avg.mul_(beta1).add_(grad, alpha=(1.0 - beta1)) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2) + denom = exp_avg_sq.sqrt().add_(group["eps"]) + + step_size = group["lr"] + if group["correct_bias"]: # No bias correction for Bert + bias_correction1 = 1.0 - beta1 ** state["step"] + bias_correction2 
= 1.0 - beta2 ** state["step"] + step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 + + p.addcdiv_(exp_avg, denom, value=-step_size) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want to decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. + # Add weight decay at the end (fixed version) + if group["weight_decay"] > 0.0: + p.add_(p, alpha=(-group["lr"] * group["weight_decay"])) + + return loss + + +def create_lorafa_optimizer( + model: PeftModel, r: int, lora_alpha: int, lr: float, weight_decay: float = 0.0, use_rslora: bool = False +) -> Optimizer: + """ + Helper function to instantiate a lorafa optimizer specifically configured for a given model using the LoRA method. + + This function will: + - Disable gradient updates for the "lora_A" parameters (these are typically frozen during LoRA training). + - Compute the scaling factor based on provided `lora_alpha` and rank `r` for proper gradient projection. + - Create and configure parameter groups for the optimizer including specified learning rate, weight decay, and + additional optimizer options. + + For hyper-params, LoRA-FA uses the same hyper-params as AdamW, except for the LoRA hyper-params (r, lora_alpha, + use_rslora). One can always use the same hyper-params such as lr and weight_decay, as AdamW in LoRA tuning. + + Args: + model (PeftModel): The model containing LoRA-adapted parameters. + r (int): Rank of the LoRA decomposition. + lora_alpha (int): Scaling factor for LoRA parameterization. + lr (float): Learning rate for optimizer updates. + weight_decay (float): Weight decay for AdamW. + use_rslora (bool): + whether to use rslora. 
In rslora, the lora scaling factor becomes to lora_alpha / math.sqrt(r) instead of + lora_alpha / r. + + Returns: + Optimizer: Configured lorafa optimizer instance ready for training. + """ + for name, param in model.named_parameters(): + if "lora_A" in name: + param.requires_grad_(False) + lora_scaling = lora_alpha / math.sqrt(r) if use_rslora else lora_alpha / r + param_groups = [ + { + "params": model.parameters(), + "lr": lr, + "names": [name for name, _ in model.named_parameters()], + "scaling_factor": lora_scaling, + "betas": (0.9, 0.999), + "weight_decay": weight_decay, + } + ] + return LoraFAOptimizer(param_groups) diff --git a/peft/src/peft/optimizers/loraplus.py b/peft/src/peft/optimizers/loraplus.py new file mode 100644 index 0000000000000000000000000000000000000000..e4ecae770d5acab4215479141f2db1d17e42da81 --- /dev/null +++ b/peft/src/peft/optimizers/loraplus.py @@ -0,0 +1,121 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This module contains the implementation of the LoraPlus optimizer. 
+""" + +from __future__ import annotations + +from operator import attrgetter + +import torch.nn as nn +from torch.optim import Optimizer +from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS +from transformers.trainer_pt_utils import get_parameter_names + +from ..peft_model import PeftModel +from ..tuners.lora.layer import Embedding + + +def create_loraplus_optimizer( + model: PeftModel, optimizer_cls: type[Optimizer], *, lr: float, loraplus_lr_ratio: float, **kwargs +) -> Optimizer: + """ + Creates a LoraPlus optimizer. + + Efficient Low Rank Adaptation of Large Models: https://huggingface.co/papers/2402.12354 + + Reference: https://github.com/nikhil-ghosh-berkeley/loraplus/ + + Args: + model (`torch.nn.Module`): The model to be optimized. + optimizer_cls (`torch.optim.Optimizer`): The optimizer class to be used. + lr (`float`): The learning rate to be used for the optimizer. + loraplus_lr_ratio (`float`): + The ratio of learning ηB/ηA where ηA (lr) is passed in as the optimizer learning rate. Should be ≥1. Should + be set in tandem with the optimizer learning rate (lr); should be larger when the task is more difficult + and the model needs to update its features to learn well. In this case, it helps to make the learning rate + slightly smaller (e.g., by a factor of 2) than typical vanilla LoRA learning rates + loraplus_lr_embedding (optional `float`): + If LoRA modules are added to embedding layers your can specify a different learning rate for them. Default + value 1e-6. + kwargs (`dict`): Additional keyword arguments to be passed to the optimizer. + + Returns: + `torch.optim.Optimizer`: An instance of the specified optimizer class configured with the model's parameters + organized into groups with custom learning rates. 
+ """ + + decay_parameters = get_parameter_names(model, ALL_LAYERNORM_LAYERS) + decay_parameters = [name for name in decay_parameters if "bias" not in name] + param_groups = { + "groupA": {}, + "groupB": {}, + "groupB_no_decay": {}, + "embedding": {}, + } + + for name, param in model.named_parameters(): + if not param.requires_grad: + continue + + module = attrgetter(name)(model) + if isinstance(module, Embedding): + param_groups["embedding"][name] = param + elif "lora_B" in name or param.ndim == 1: + if name in decay_parameters: + param_groups["groupB"][name] = param + else: + param_groups["groupB_no_decay"][name] = param + else: + param_groups["groupA"][name] = param + + kwargs["lr"] = lr + loraplus_weight_decay = kwargs.pop("loraplus_weight_decay", 0.0) + loraplus_lr_embedding = kwargs.pop("loraplus_lr_embedding", 1e-6) + + optimizer_grouped_parameters = [ + { + "params": list(param_groups["groupA"].values()), + "weight_decay": loraplus_weight_decay, + "lr": lr, + }, + { + "params": list(param_groups["embedding"].values()), + "weight_decay": loraplus_weight_decay, + "lr": loraplus_lr_embedding, + }, + { + "params": list(param_groups["groupB"].values()), + "weight_decay": loraplus_weight_decay, + "lr": lr * loraplus_lr_ratio, + }, + { + "params": list(param_groups["groupB_no_decay"].values()), + "weight_decay": 0.0, + "lr": lr * loraplus_lr_ratio, + }, + ] + + optimizer = optimizer_cls(optimizer_grouped_parameters, **kwargs) + eight_bit_names = ["Adam8bit", "AdamW8bit", "PagedAdam8bit", "PagedAdamW8bit"] + if optimizer_cls.__name__ in eight_bit_names: + import bitsandbytes + + manager = bitsandbytes.optim.GlobalOptimManager.get_instance() + for module in model.modules(): + if isinstance(module, nn.Embedding): + manager.register_module_override(module, "weight", {"optim_bits": 32}) + return optimizer diff --git a/peft/src/peft/peft_model.py b/peft/src/peft/peft_model.py new file mode 100644 index 
0000000000000000000000000000000000000000..3b7e6364169a86d4fab988b9f2b7739d628dec01 --- /dev/null +++ b/peft/src/peft/peft_model.py @@ -0,0 +1,3311 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import collections +import copy +import inspect +import os +import warnings +from collections.abc import Sequence +from contextlib import contextmanager, nullcontext +from copy import deepcopy +from dataclasses import dataclass +from typing import Any, Literal, Optional, Union + +import packaging.version +import torch +import transformers +from accelerate import dispatch_model, infer_auto_device_map +from accelerate.hooks import AlignDevicesHook, add_hook_to_module, remove_hook_from_submodules +from accelerate.utils import get_balanced_memory, named_module_tensors +from huggingface_hub import HfFileSystem, ModelCard, ModelCardData, hf_hub_download +from safetensors import safe_open +from safetensors.torch import save_file as safe_save_file +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers import Cache, DynamicCache, EncoderDecoderCache, PreTrainedModel +from transformers.modeling_outputs import QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput +from transformers.utils import PushToHubMixin + +from peft.tuners.lora.variants import get_alora_offsets_for_forward, get_alora_offsets_for_generate +from 
peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer +from peft.utils import AuxiliaryTrainingWrapper +from peft.utils.constants import DUMMY_MODEL_CONFIG +from peft.utils.integrations import init_empty_weights +from peft.utils.other import TrainableTokensWrapper, create_attention_mask, set_additional_trainable_modules + +from . import __version__ +from .config import PeftConfig +from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING, PEFT_TYPE_TO_PREFIX_MAPPING, PEFT_TYPE_TO_TUNER_MAPPING +from .utils import ( + SAFETENSORS_WEIGHTS_NAME, + TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, + WEIGHTS_NAME, + PeftType, + TaskType, + _get_batch_size, + _prepare_prompt_learning_config, + _set_adapter, + _set_trainable, + get_peft_model_state_dict, + id_tensor_storage, + infer_device, + load_peft_weights, + map_cache_to_layer_device_map, + set_peft_model_state_dict, + shift_tokens_right, +) + + +class PeftModel(PushToHubMixin, torch.nn.Module): + """ + Base model encompassing various Peft methods. + + Args: + model ([`~transformers.PreTrainedModel`]): The base transformer model used for Peft. + peft_config ([`PeftConfig`]): The configuration of the Peft model. + adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`. + autocast_adapter_dtype (`bool`, *optional*): + Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights + using float16 and bfloat16 to float32, as this is typically required for stable training, and only affect + select PEFT tuners. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the loading process. + + > [!TIP] > Don't use `low_cpu_mem_usage=True` when creating a new PEFT adapter for training. + + **Attributes**: + - **base_model** ([`torch.nn.Module`]) -- The base transformer model used for Peft. + - **peft_config** ([`PeftConfig`]) -- The configuration of the Peft model. 
+ - **modules_to_save** (`list` of `str`) -- The list of sub-module names to save when + saving the model. + - **prompt_encoder** ([`PromptEncoder`]) -- The prompt encoder used for Peft if + using [`PromptLearningConfig`]. + - **prompt_tokens** (`torch.Tensor`) -- The virtual prompt tokens used for Peft if + using [`PromptLearningConfig`]. + - **transformer_backbone_name** (`str`) -- The name of the transformer + backbone in the base model if using [`PromptLearningConfig`]. + - **word_embeddings** (`torch.nn.Embedding`) -- The word embeddings of the transformer backbone + in the base model if using [`PromptLearningConfig`]. + """ + + def __init__( + self, + model: PreTrainedModel, + peft_config: PeftConfig, + adapter_name: str = "default", + autocast_adapter_dtype: bool = True, + low_cpu_mem_usage: bool = False, + ) -> None: + super().__init__() + self.active_adapter = adapter_name + self.peft_type = peft_config.peft_type + # These args are special PEFT arguments that users can pass. They need to be removed before passing them to + # forward. 
+ self.special_peft_forward_args = {"adapter_names", "alora_offsets"} + + self._is_prompt_learning = peft_config.is_prompt_learning + if self._is_prompt_learning: + self._peft_config = {adapter_name: peft_config} + self.base_model = model + self.add_adapter(adapter_name, peft_config, low_cpu_mem_usage=low_cpu_mem_usage) + else: + self._peft_config = None + cls = PEFT_TYPE_TO_TUNER_MAPPING[peft_config.peft_type] + ctx = init_empty_weights if low_cpu_mem_usage else nullcontext + with ctx(): + self.base_model = cls(model, {adapter_name: peft_config}, adapter_name) + + if hasattr(self.base_model, "_cast_adapter_dtype"): + self.base_model._cast_adapter_dtype( + adapter_name=adapter_name, autocast_adapter_dtype=autocast_adapter_dtype + ) + + if getattr(model, "is_gradient_checkpointing", True): + model = self.prepare_model_for_gradient_checkpointing(model) + + # the `pretraining_tp` is set for some models to simulate Tensor Parallelism during inference to avoid + # numerical differences, https://github.com/pytorch/pytorch/issues/76232 - to avoid any unexpected + # behavior we disable that in this line. + if hasattr(self.base_model, "config") and hasattr(self.base_model.config, "pretraining_tp"): + self.base_model.config.pretraining_tp = 1 + + @property + def peft_config(self) -> dict[str, PeftConfig]: + if self._is_prompt_learning: + return self._peft_config + return self.base_model.peft_config + + @property + def active_adapters(self) -> list[str]: + try: + adapters = self.base_model.active_adapters + if not isinstance(adapters, list): + # Base model is probably a transformers model, see: + # https://github.com/huggingface/transformers/pull/30790#issuecomment-2253808249 + # Unfortunately, transformers models also have an active_adapters method but it's 1) not a property and + # 2) calling it fails because the base model (usually) has no loaded adapter. 
The base model can be a + # transformers model for prompt learning, where the base model is not wrapped in a LoraModel or similar. + adapters = self.active_adapter + if isinstance(adapters, str): + adapters = [adapters] + except AttributeError: + adapters = self.active_adapter + if isinstance(adapters, str): + adapters = [adapters] + return adapters + + @peft_config.setter + def peft_config(self, value: dict[str, PeftConfig]): + if self._is_prompt_learning: + self._peft_config = value + else: + self.base_model.peft_config = value + + def save_pretrained( + self, + save_directory: str, + safe_serialization: bool = True, + selected_adapters: Optional[list[str]] = None, + save_embedding_layers: Union[str, bool] = "auto", + is_main_process: bool = True, + path_initial_model_for_weight_conversion: Optional[str] = None, + **kwargs: Any, + ) -> None: + r""" + This function saves the adapter model and the adapter configuration files to a directory, so that it can be + reloaded using the [`PeftModel.from_pretrained`] class method, and also used by the [`PeftModel.push_to_hub`] + method. + + Args: + save_directory (`str`): + Directory where the adapter model and configuration files will be saved (will be created if it does not + exist). + safe_serialization (`bool`, *optional*): + Whether to save the adapter files in safetensors format, defaults to `True`. + selected_adapters (`List[str]`, *optional*): + A list of adapters to be saved. If `None`, will default to all adapters. + save_embedding_layers (`Union[bool, str]`, *optional*, defaults to `"auto"`): + If `True`, save the embedding layers in addition to adapter weights. If `auto`, checks the common + embedding layers `peft.utils.other.EMBEDDING_LAYER_NAMES` in config's `target_modules` when available. + and automatically sets the boolean flag. This only works for 🤗 transformers models. + is_main_process (`bool`, *optional*): + Whether the process calling this is the main process or not. Will default to `True`. 
Will not save the + checkpoint if not on the main process, which is important for multi device setups (e.g. DDP). + path_initial_model_for_weight_conversion (`str, *optional*`): + The path to the initialized adapter, which is obtained after initializing the model with + PiSSA/CorDA/OLoRA and before performing any training. When `path_initial_model_for_weight_conversion` + is not None, the difference in adapter before and after fine-tuning is calculated. This difference can + be represented as the parameters of a standard LoRA adapter. Using this converted adapter does not + require changes to the base model, thus conveniently allowing the use of multiple PiSSA/CorDA/OLoRA + adapters with LoRA adapters, and the activation or deactivation of any adapters. Note that this + conversion is not supported if `rslora` is used in combination with `rank_pattern` or `alpha_pattern`. + kwargs (additional keyword arguments, *optional*): + Additional keyword arguments passed along to the `push_to_hub` method. + + """ + if os.path.isfile(save_directory): + raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file") + + if selected_adapters is None: + selected_adapters = list(self.peft_config.keys()) + else: + if any( + selected_adapter_name not in list(self.peft_config.keys()) + for selected_adapter_name in selected_adapters + ): + raise ValueError( + f"You passed an invalid `selected_adapters` arguments, current supported adapter names are" + f" {list(self.peft_config.keys())} - got {selected_adapters}." + ) + + def save_mutated_as_lora(peft_config, path_initial_model_for_weight_conversion, output_state_dict, kwargs): + if peft_config.use_rslora and (peft_config.rank_pattern or peft_config.alpha_pattern): + msg = ( + "Passing `path_initial_model_for_weight_conversion` to `save_pretrained` is not supported when " + "using `rank_pattern` or `alpha_pattern` at the same time as `use_rslora=True`." 
+ ) + raise ValueError(msg) + + if not any( + str(peft_config.init_lora_weights).lower().startswith(prefix) + for prefix in ["pissa", "corda", "olora", "true"] + ): + warnings.warn( + "`path_initial_model_for_weight_conversion` only works for converting a PiSSA/CorDA/OLoRA adapter to " + "a LoRA adapter" + ) + initial_adapter_name = os.path.basename(path_initial_model_for_weight_conversion) + try: + self.load_adapter( + os.path.dirname(path_initial_model_for_weight_conversion), + subfolder=initial_adapter_name, + adapter_name=initial_adapter_name, + ) + is_pissa = str(self.peft_config[initial_adapter_name].init_lora_weights).lower().startswith("pissa") + is_corda = str(self.peft_config[initial_adapter_name].init_lora_weights).lower() == "corda" + is_olora = str(self.peft_config[initial_adapter_name].init_lora_weights).lower() == "olora" + if is_pissa or is_corda or is_olora: + raise ValueError( + "The `init_lora_weights` parameter of the initial adapter should be set to `True`. " + "Otherwise, `self.load_adapter` will subtract the decomposed values again based on the " + "residual model." 
+ ) + output_state_dict = self.base_model.subtract_mutated_init( + output_state_dict, initial_adapter_name, kwargs + ) + finally: + self.delete_adapter(initial_adapter_name) + return output_state_dict + + if is_main_process: + os.makedirs(save_directory, exist_ok=True) + self.create_or_update_model_card(save_directory) + + for adapter_name in selected_adapters: + peft_config = self.peft_config[adapter_name] + # save only the trainable weights + output_state_dict = get_peft_model_state_dict( + self, + state_dict=kwargs.get("state_dict", None), + adapter_name=adapter_name, + save_embedding_layers=save_embedding_layers, + ) + output_dir = os.path.join(save_directory, adapter_name) if adapter_name != "default" else save_directory + os.makedirs(output_dir, exist_ok=True) + + if is_main_process and safe_serialization: + # Section copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L2111-L2134 + # Safetensors does not allow tensor aliasing. + # We're going to remove aliases before saving + ptrs = collections.defaultdict(list) + for name, tensor in output_state_dict.items(): + # Sometimes in the state_dict we have non-tensor objects. + # e.g. in bitsandbytes we have some `str` objects in the state_dict + if isinstance(tensor, torch.Tensor): + ptrs[id_tensor_storage(tensor)].append(name) + else: + # In the non-tensor case, fall back to the pointer of the object itself + ptrs[id(tensor)].append(name) + + # These are all the pointers of shared tensors. + shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1} + + for _, names in shared_ptrs.items(): + # Here we just clone the shared tensors to avoid tensor aliasing which is + # not supported in safetensors. 
+ for shared_tensor_name in names[1:]: + output_state_dict[shared_tensor_name] = output_state_dict[shared_tensor_name].clone() + if path_initial_model_for_weight_conversion is not None: + peft_config = copy.deepcopy(peft_config) + peft_config.init_lora_weights = True + peft_config.save_pretrained(path_initial_model_for_weight_conversion) + output_state_dict = save_mutated_as_lora( + peft_config, path_initial_model_for_weight_conversion, output_state_dict, kwargs + ) + safe_save_file( + output_state_dict, + os.path.join(output_dir, SAFETENSORS_WEIGHTS_NAME), + metadata={"format": "pt"}, + ) + elif is_main_process: + if path_initial_model_for_weight_conversion is not None: + peft_config = copy.deepcopy(peft_config) + peft_config.init_lora_weights = True + peft_config.save_pretrained(path_initial_model_for_weight_conversion) + output_state_dict = save_mutated_as_lora( + peft_config, path_initial_model_for_weight_conversion, output_state_dict, kwargs + ) + torch.save(output_state_dict, os.path.join(output_dir, WEIGHTS_NAME)) + + # save the config and change the inference mode to `True` + if peft_config.base_model_name_or_path is None: + peft_config.base_model_name_or_path = ( + self.base_model.__dict__.get("name_or_path", None) + if peft_config.is_prompt_learning + else self.base_model.model.__dict__.get("name_or_path", None) + ) + inference_mode = peft_config.inference_mode + peft_config.inference_mode = True + + if peft_config.task_type is None: + # deal with auto mapping + base_model_class = self._get_base_model_class( + is_prompt_tuning=peft_config.is_prompt_learning, + ) + parent_library = base_model_class.__module__ + + auto_mapping_dict = { + "base_model_class": base_model_class.__name__, + "parent_library": parent_library, + } + else: + auto_mapping_dict = None + + if is_main_process: + if path_initial_model_for_weight_conversion is not None: + peft_config.init_lora_weights = True + peft_config.r *= 2 + if not peft_config.use_rslora: + peft_config.lora_alpha *= 
2 + else: + # with rslora, we have scaling = alpha / sqrt(r), we thus adjust alpha to keep the same scaling + peft_config.lora_alpha *= 2**0.5 + + if peft_config.rank_pattern: + peft_config.rank_pattern = {key: 2 * val for key, val in peft_config.rank_pattern.items()} + if peft_config.alpha_pattern: + peft_config.alpha_pattern = {key: 2 * val for key, val in peft_config.alpha_pattern.items()} + + peft_config.save_pretrained(output_dir, auto_mapping_dict=auto_mapping_dict) + peft_config.inference_mode = inference_mode + + @classmethod + def from_pretrained( + cls, + model: torch.nn.Module, + model_id: Union[str, os.PathLike], + adapter_name: str = "default", + is_trainable: bool = False, + config: Optional[PeftConfig] = None, + autocast_adapter_dtype: bool = True, + ephemeral_gpu_offload: bool = False, + low_cpu_mem_usage: bool = False, + key_mapping: Optional[dict[str, str]] = None, + **kwargs: Any, + ) -> PeftModel: + r""" + Instantiate a PEFT model from a pretrained model and loaded PEFT weights. + + Note that the passed `model` may be modified inplace. + + Args: + model ([`torch.nn.Module`]): + The model to be adapted. For 🤗 Transformers models, the model should be initialized with the + [`~transformers.PreTrainedModel.from_pretrained`]. + model_id (`str` or `os.PathLike`): + The name of the PEFT configuration to use. Can be either: + - A string, the `model id` of a PEFT configuration hosted inside a model repo on the Hugging Face + Hub. + - A path to a directory containing a PEFT configuration file saved using the `save_pretrained` + method (`./my_peft_config_directory/`). + adapter_name (`str`, *optional*, defaults to `"default"`): + The name of the adapter to be loaded. This is useful for loading multiple adapters. + is_trainable (`bool`, *optional*, defaults to `False`): + Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be + used for inference. 
+ config ([`~peft.PeftConfig`], *optional*): + The configuration object to use instead of an automatically loaded configuration. This configuration + object is mutually exclusive with `model_id` and `kwargs`. This is useful when configuration is already + loaded before calling `from_pretrained`. + autocast_adapter_dtype (`bool`, *optional*): + Whether to autocast the adapter dtype. Defaults to `True`. Only relevant for specific adapter types. + ephemeral_gpu_offload (`bool`, *optional*): + Whether to use ephemeral GPU offloading for partially loaded modules. Defaults to `False`. This is + useful when parts of the model and/or components (such as adapters) are kept in CPU memory until they + are needed. Rather than perform expensive operations on small data, the data is transferred to the GPU + on-demand, the operation(s) performed, and the results moved back to CPU memory. This brings a slight + momentary VRAM overhead but gives orders of magnitude speedup in certain cases. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device before loading the saved weights. Useful to speed up the + process. + torch_device (`str`, *optional*, defaults to None): + The device to load the adapter on. If `None`, the device will be inferred. + key_mapping (dict, *optional*, defaults to None) + Extra mapping of PEFT `state_dict` keys applied before loading the `state_dict`. When this mapping is + applied, the PEFT-specific `"base_model.model"` prefix is removed beforehand and the adapter name (e.g. + `"default"`) is not inserted yet. Only pass this argument if you know what you're doing. + kwargs: (`optional`): + Additional keyword arguments passed along to the specific PEFT configuration class. 
+ """ + from .auto import MODEL_TYPE_TO_PEFT_MODEL_MAPPING + from .tuners import XLoraConfig, XLoraModel + + # load the config + if config is None: + hf_kwargs = { + "subfolder": kwargs.get("subfolder", None), + "revision": kwargs.get("revision", None), + "cache_dir": kwargs.get("cache_dir", None), + "token": kwargs.get("token", None), + } + if use_auth_token := kwargs.get("use_auth_token", None): + hf_kwargs["use_auth_token"] = use_auth_token + config = PEFT_TYPE_TO_CONFIG_MAPPING[PeftConfig._get_peft_type(model_id, **hf_kwargs)].from_pretrained( + model_id, **kwargs + ) + elif isinstance(config, PeftConfig): + config.inference_mode = not is_trainable + else: + raise ValueError(f"The input config must be a PeftConfig, got {config.__class__}") + + # See discussion in https://github.com/huggingface/transformers/pull/38627 + # Some transformers models can have a _checkpoint_conversion_mapping dict that is used to map state_dicts + # stemming from updated model architectures so that they still correspond to the initial architecture. When + # loading a PEFT state_dict created with the initial architecture on a model with the new architecture, we need + # to map it too according to the same rules. Note that we skip prompt learning methods. This is because they + # don't have the "base_model.model." prefix, which we need to remove before mapping. Instead just using + # "base_model.". This could be fine, we could only remove "base_model.", However, the subsequent sub-module + # could also be called "model", resulting in what looks like "base_model.model.". To avoid this confusion, we + # skip prompt learning. Since it applies itself directly to the pre-trained model (unlike LoRA et al that target + # sub-modules), skipping should be fine. 
+ if (key_mapping is None) and (not config.is_prompt_learning): + key_mapping = getattr(model, "_checkpoint_conversion_mapping", {}) + + # Runtime configuration, if supported + if hasattr(config, "runtime_config"): + config.runtime_config.ephemeral_gpu_offload = ephemeral_gpu_offload + else: + if ephemeral_gpu_offload: + warnings.warn("Ephemeral GPU offloading is not supported for this model. Ignoring.") + + if hasattr(model, "hf_device_map"): + weight_map = dict(named_module_tensors(model, recurse=True)) + + # recreate the offload_index for disk-offloaded modules: we need to know the location in storage of each weight + # before the offload hook is removed from the model + disk_modules = set() + index = None + for name, module in model.named_modules(): + if hasattr(module, "_hf_hook") and hasattr(module._hf_hook, "original_devices"): + if hasattr(module._hf_hook.weights_map, "dataset"): + index = module._hf_hook.weights_map.dataset.index + for key in module._hf_hook.original_devices.keys(): + if module._hf_hook.original_devices[key] == torch.device("meta"): + disk_modules.add(str(name) + "." 
+ str(key)) + + if disk_modules and not kwargs.get("use_safetensors", True): + raise ValueError("Disk offloading currently only supported for safetensors") + + if index: + offload_index = { + p: { + "safetensors_file": index[p]["safetensors_file"], + "weight_name": p, + "dtype": str(weight_map[p].dtype).replace("torch.", ""), + } + for p in weight_map.keys() + if p in disk_modules + } + kwargs["offload_index"] = offload_index + + if (getattr(model, "hf_device_map", None) is not None) and len( + set(model.hf_device_map.values()).intersection({"cpu", "disk"}) + ) > 0: + remove_hook_from_submodules(model) + + if config.is_prompt_learning and is_trainable: + raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.") + else: + config.inference_mode = not is_trainable + if isinstance(getattr(model, "base_model", None), XLoraModel): + if not isinstance(config, XLoraConfig): + raise TypeError(f"Expected 'XLoraConfig', got '{type(config)}' instead.") + if "adapters" in kwargs: + config.adapters = kwargs["adapters"] + else: + # If the path is on HF hub, then we get the adapter names to create a subfolders list which tells + # `load_adapter` where the adapters are. 
+ if not os.path.exists(model_id): + s = HfFileSystem() + + # The names of the adapters which must be in folders + adapter_names = [ + file["name"][len(model_id) + 1 :] for file in s.ls(model_id) if file["type"] == "directory" + ] + # Prepare a dict of adapter paths, which really just point to the hf id; we will use the subfolders + adapter_paths = {} + for adapter_name in adapter_names: + adapter_paths[adapter_name] = os.path.join(model_id, model_id) + config.adapters = adapter_paths + config._subfolders = adapter_names + else: + if "adapters" not in kwargs: + raise ValueError("If model_id is a local path, then `adapters` must be passed in kwargs.") + + if config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys(): + model = cls( + model, + config, + adapter_name, + autocast_adapter_dtype=autocast_adapter_dtype, + low_cpu_mem_usage=low_cpu_mem_usage, + ) + else: + model = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[config.task_type]( + model, + config, + adapter_name, + autocast_adapter_dtype=autocast_adapter_dtype, + low_cpu_mem_usage=low_cpu_mem_usage, + ) + + load_result = model.load_adapter( + model_id, + adapter_name, + is_trainable=is_trainable, + autocast_adapter_dtype=autocast_adapter_dtype, + low_cpu_mem_usage=low_cpu_mem_usage, + key_mapping=key_mapping, + **kwargs, + ) + + # 1. Remove VB-LoRA vector bank, since it's a shared parameter set via the VBLoRAModel + # 2. Remove the prompt encoder, as it does not need to be part of the checkpoint + missing_keys = [ + k for k in load_result.missing_keys if "vblora_vector_bank" not in k and "prompt_encoder" not in k + ] + if missing_keys: + # Let's warn here since (in contrast to load_adapter) we don't return the load result, so it could be quite + # difficult for users to even notice that something might have gone wrong here. As we filter out non PEFT + # keys from the missing keys, this gives no false positives. + + # careful: if the wording of the warning is changed, adjust the unit tests accordingly! 
+ warn_message = f"Found missing adapter keys while loading the checkpoint: {missing_keys}." + + prefix = PEFT_TYPE_TO_PREFIX_MAPPING.get(config.peft_type) + if prefix and adapter_name in prefix: + warn_message = ( + f"Adapter name '{adapter_name}' should not be contained in the prefix '{prefix}'. " + "This could be the potential reason for missing adapter keys. " + ) + warn_message + + warnings.warn(warn_message) + + return model + + def _setup_prompt_encoder(self, adapter_name: str): + config = self.peft_config[adapter_name] + if not hasattr(self, "prompt_encoder"): + self.prompt_encoder = torch.nn.ModuleDict({}) + self.prompt_tokens = {} + transformer_backbone = None + for name, module in self.base_model.named_children(): + for param in module.parameters(): + param.requires_grad = False + if isinstance(module, PreTrainedModel): + # Make sure to freeze Transformers model + if transformer_backbone is None: + transformer_backbone = module + self.transformer_backbone_name = name + if transformer_backbone is None: + transformer_backbone = self.base_model + + if config.num_transformer_submodules is None: + config.num_transformer_submodules = 2 if config.task_type == TaskType.SEQ_2_SEQ_LM else 1 + + # determine the word embeddings + word_embeddings = None + try: + # First try to find the word embeddings based on the module name, this should work for models like Bert, + # Roberta, Deberta, etc. + word_embeddings = self.base_model.get_submodule("embeddings.word_embeddings") + except AttributeError: + pass + + if word_embeddings is None: + # Word embeddings could not be determined. Next try to guess them by checking which parameter has the size + # of the vocab. 
+ for named_param, value in list(transformer_backbone.named_parameters()): + # for ZeRO-3, the tensor is sharded across accelerators and deepspeed modifies it to a tensor with shape + # [0] the actual unsharded shape is stored in "ds_shape" attribute special handling is needed in case + # the model is initialized in deepspeed.zero.Init() context or HfDeepSpeedConfig has been called before + # For reference refer to issue: https://github.com/huggingface/peft/issues/996 + deepspeed_distributed_tensor_shape = getattr(value, "ds_shape", None) + + # Handle VLM case with separate text and vision configs + if hasattr(self.base_model.config, "get_text_config"): + vocab_size = self.base_model.config.get_text_config().vocab_size + # below: for older transformers versions before get_text_config was added + elif "text_config" in self.base_model.config: + vocab_size = self.base_model.config.text_config.vocab_size + else: + vocab_size = self.base_model.config.vocab_size + + if value.shape[0] == vocab_size or ( + deepspeed_distributed_tensor_shape is not None + and deepspeed_distributed_tensor_shape[0] == vocab_size + ): + word_embeddings = transformer_backbone.get_submodule(named_param.replace(".weight", "")) + break + + self.word_embeddings = word_embeddings + model_cls = PEFT_TYPE_TO_TUNER_MAPPING[config.peft_type] + + if config.peft_type in (PeftType.PROMPT_TUNING, PeftType.MULTITASK_PROMPT_TUNING, PeftType.CPT): + prompt_encoder = model_cls(config, self.word_embeddings) + elif config.peft_type == PeftType.P_TUNING: + prompt_encoder = model_cls(config) + elif config.peft_type == PeftType.PREFIX_TUNING: + # prefix tuning now uses Cache but that won't work with gradient checkpointing + if any(getattr(module, "gradient_checkpointing", False) for module in self.get_base_model().modules()): + raise ValueError("Prefix tuning does not work with gradient checkpointing.") + prompt_encoder = model_cls(config) + else: + raise ValueError("Not supported") + + prompt_encoder = 
prompt_encoder.to(self.device) + self.prompt_encoder.update(torch.nn.ModuleDict({adapter_name: prompt_encoder})) + self.prompt_tokens[adapter_name] = torch.arange( + config.num_virtual_tokens * config.num_transformer_submodules + ).long() + + def prepare_model_for_gradient_checkpointing(self, model: PreTrainedModel): + r""" + Prepares the model for gradient checkpointing if necessary + """ + self._prepare_model_for_gradient_checkpointing(model) + + def _prepare_model_for_gradient_checkpointing(self, model: PreTrainedModel): + if not ( + getattr(model, "is_loaded_in_8bit", False) + or getattr(model, "is_loaded_in_4bit", False) + or getattr(model, "is_quantized", False) + ): + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + elif hasattr(model, "get_input_embeddings"): + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + return model + + def get_prompt_embedding_to_save(self, adapter_name: str) -> torch.Tensor: + """ + Returns the prompt embedding to save when saving the model. Only applicable when using a prompt learning + method. 
+ """ + prompt_encoder = self.prompt_encoder[adapter_name] + prompt_tokens = ( + self.prompt_tokens[adapter_name].unsqueeze(0).expand(1, -1).to(prompt_encoder.embedding.weight.device) + ) + peft_type = self.peft_config[adapter_name].peft_type + if self.peft_config[adapter_name].peft_type == PeftType.PREFIX_TUNING: + prompt_tokens = prompt_tokens[:, : self.peft_config[adapter_name].num_virtual_tokens] + + if self.peft_config[adapter_name].peft_type == PeftType.MULTITASK_PROMPT_TUNING: + prompt_embedding_cls = PEFT_TYPE_TO_TUNER_MAPPING[peft_type] + prompt_embeddings = super(prompt_embedding_cls, prompt_encoder).forward(prompt_tokens) + else: + prompt_embeddings = prompt_encoder(prompt_tokens) + + return prompt_embeddings[0].detach().cpu() + + def get_prompt( + self, batch_size: int, task_ids: Optional[torch.Tensor] = None, max_cache_len: Optional[int] = None + ) -> torch.Tensor: + """ + Returns the virtual prompts to use for Peft. Only applicable when using a prompt learning method. + """ + peft_config = self.active_peft_config + prompt_encoder = self.prompt_encoder[self.active_adapter] + prompt_tokens = ( + self.prompt_tokens[self.active_adapter] + .unsqueeze(0) + .expand(batch_size, -1) + .to(prompt_encoder.embedding.weight.device) + ) + if peft_config.peft_type == PeftType.PREFIX_TUNING: + prompt_tokens = prompt_tokens[:, : peft_config.num_virtual_tokens] + if peft_config.inference_mode: + past_key_values = prompt_encoder.embedding.weight.repeat(batch_size, 1, 1) + else: + past_key_values = prompt_encoder(prompt_tokens) + if self.base_model_torch_dtype is not None: + past_key_values = past_key_values.to(self.base_model_torch_dtype) + past_key_values = past_key_values.view( + batch_size, + peft_config.num_virtual_tokens, + peft_config.num_layers * 2, + peft_config.num_attention_heads, + peft_config.token_dim // peft_config.num_attention_heads, + ) + if peft_config.num_transformer_submodules == 2: + past_key_values = torch.cat([past_key_values, past_key_values], 
dim=2) + + # Transpose: 2 x [num_layers, batch_size, num_heads, num_virtual_tokens, head_dim] + past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split( + peft_config.num_transformer_submodules * 2 + ) + + base_model = self.get_base_model() + model_config = getattr(base_model, "config", None) + model_type = getattr(model_config, "model_type", "") + if TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING.get(self.config.model_type, None) is not None: + post_process_fn = TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING[self.config.model_type] + past_key_values = post_process_fn(past_key_values) + elif ("gemma2" in model_type) or ("gemma3_text" in model_type): + # TODO: remove this logic once transformers < 4.56 is dropped + transformers_lt_4_56 = packaging.version.parse(transformers.__version__) < packaging.version.parse( + "4.56.0.dev0" + ) + # Gemma2 and Gemma3 only support HybridCache (which does not have the from_legacy_cache method) + if transformers_lt_4_56 and ((max_cache_len is None) or (max_cache_len == -1)): + raise ValueError( + "max_cache_len is missing but it should have been passed. 
Something went wrong, please open an " + "issue on GitHub with a reproducer: https://github.com/huggingface/peft/issues" + ) + base_config = base_model.config + if hasattr(base_config, "get_text_config"): + base_config = base_config.get_text_config() + if transformers_lt_4_56: + # HybridCache is deprecated, and will be removed in 4.60.0 + # see https://github.com/huggingface/transformers/pull/40276 + from transformers import HybridCache + + new_cache = HybridCache( + config=base_config, + max_batch_size=batch_size, + max_cache_len=max_cache_len, + dtype=past_key_values[0].dtype, + device=past_key_values[0].device, + ) + else: + # transformers 4.56+ uses DynamicCache for gemma + new_cache = DynamicCache(config=base_config) + cache_position = torch.arange(peft_config.num_virtual_tokens, device=past_key_values[0].device) + for layer_idx in range(peft_config.num_layers): + key_states, value_states = past_key_values[0][layer_idx], past_key_values[1][layer_idx] + new_cache.update( + key_states, value_states, layer_idx, cache_kwargs={"cache_position": cache_position} + ) + past_key_values = new_cache + elif peft_config.num_transformer_submodules == 1: + # Dont' apply this to encoder-decoder models and not to models requiring special processing. 
+ # TODO: remove from_legacy_cache once transformers < 4.56 is dropped + transformers_lt_4_56 = packaging.version.parse(transformers.__version__) < packaging.version.parse( + "4.56.0.dev0" + ) + if transformers_lt_4_56: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + else: + past_key_values = DynamicCache(past_key_values) + + elif (peft_config.num_transformer_submodules == 2) and getattr( + self.base_model, "_supports_cache_class", True + ): + # Dont' apply this to encoder-decoder models that don't support new Cache format yet + # If we don't apply this, prefix-tuning fails to update cross-attn cache + # TODO: remove check for _supports_cache_class once transformers 4.53 is no longer supported + # TODO: remove from_legacy_cache once transformers < 4.56 is dropped + transformers_lt_4_56 = packaging.version.parse(transformers.__version__) < packaging.version.parse( + "4.56.0.dev0" + ) + if transformers_lt_4_56: + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + else: + past_key_values = EncoderDecoderCache(past_key_values) + + past_key_values.cross_attention_cache = DynamicCache() + # invalidate the cross attention cache, since we add virtual tokens to the encoder + for key in past_key_values.is_updated.keys(): + past_key_values.is_updated[key] = False + map_cache_to_layer_device_map(self.get_base_model(), past_key_values) # no-op if not a Cache instance + return past_key_values + else: + if peft_config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: + prompts = prompt_encoder(prompt_tokens, task_ids) + else: + if peft_config.inference_mode: + prompts = prompt_encoder.embedding.weight + else: + # Take only one prompt token sample and expand the output instead of expanding the input, see: + # https://github.com/huggingface/peft/issues/2043#issuecomment-2321522577 + prompt_tokens = prompt_tokens[:1] + prompts = prompt_encoder(prompt_tokens) + prompts = prompts.repeat(batch_size, 1, 1) + return prompts + + def 
get_nb_trainable_parameters(self) -> tuple[int, int]: + r""" + Returns the number of trainable parameters and the number of all parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in self.named_parameters(): + num_params = param.numel() + # if using DS Zero 3 and the weights are initialized empty + if num_params == 0 and hasattr(param, "ds_numel"): + num_params = param.ds_numel + + # Due to the design of 4bit linear layers from bitsandbytes + # one needs to multiply the number of parameters by 2 to get + # the correct number of parameters + if param.__class__.__name__ == "Params4bit": + if hasattr(param, "element_size"): + num_bytes = param.element_size() + elif not hasattr(param, "quant_storage"): + num_bytes = 1 + else: + num_bytes = param.quant_storage.itemsize + num_params = num_params * 2 * num_bytes + + all_param += num_params + if param.requires_grad: + trainable_params += num_params + + return trainable_params, all_param + + def print_trainable_parameters(self) -> None: + """ + Prints the number of trainable parameters in the model. + + Note: print_trainable_parameters() uses get_nb_trainable_parameters() which is different from + num_parameters(only_trainable=True) from huggingface/transformers. get_nb_trainable_parameters() returns + (trainable parameters, all parameters) of the Peft Model which includes modified backbone transformer model. + For techniques like LoRA, the backbone transformer model is modified in place with LoRA modules. However, for + prompt tuning, the backbone transformer model is unmodified. num_parameters(only_trainable=True) returns number + of trainable parameters of the backbone transformer model which can be different. 
+ """ + trainable_params, all_param = self.get_nb_trainable_parameters() + + print( + f"trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param:.4f}" + ) + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + if name == "base_model": # see #1892: prevent infinite recursion if class is not initialized + raise + return getattr(self.base_model, name) + + @contextmanager + def _enable_peft_forward_hooks(self, *args, **kwargs): + # If the base model has a method called _enable_peft_forward_hooks, it is invoked as a context. Otherwise, this + # runs without any changes + if hasattr(self.base_model, "_enable_peft_forward_hooks"): + with self.base_model._enable_peft_forward_hooks(*args, **kwargs): + yield + return + else: + # nothing to enable + yield + return + + def forward(self, *args: Any, **kwargs: Any): + """ + Forward pass of the model. + """ + with self._enable_peft_forward_hooks(*args, **kwargs): + kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} + return self.get_base_model()(*args, **kwargs) + + def generate(self, *args, **kwargs): + with self._enable_peft_forward_hooks(*args, **kwargs): + kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} + return self.get_base_model().generate(*args, **kwargs) + + def _get_base_model_class(self, is_prompt_tuning=False): + """ + Returns the base model class. + """ + if not is_prompt_tuning: + return self.base_model.model.__class__ + return self.base_model.__class__ + + @contextmanager + def disable_adapter(self): + """ + Context manager that disables the adapter module. Use this to run inference on the base model. + + Example: + + ```py + >>> with model.disable_adapter(): + ... 
model(inputs) + ``` + """ + if self.peft_config[self.active_adapter].is_prompt_learning: + try: + # TODO: consider replacing this patching of methods with a more robust mechanism: setting a flag and + # letting the underlying methods deal with it, same as how LoRA does it. + old_forward = self.forward + self.forward = self.base_model.forward + old_prepare_inputs_for_generation = self.prepare_inputs_for_generation + self.prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation + yield + finally: + self.forward = old_forward + self.prepare_inputs_for_generation = old_prepare_inputs_for_generation + + elif self.peft_config[self.active_adapter].is_adaption_prompt: + try: + self.base_model.disable_adapter_layers() + yield + finally: + self.base_model.enable_adapter_layers() + + else: # LoRA, LoHa, etc. + model_status = self.get_model_status() + if model_status.enabled == "irregular": + warnings.warn( + "The model contains some adapter layers that are enabled and others that are disabled. " + "This is most likely unintentional. After exiting the disable_adapter context, all adapters " + "will be enabled" + ) + try: + self.base_model.disable_adapter_layers() + yield + finally: + if model_status.enabled is not False: + # model_status.enabled is `True` or `"irregular"` + self.base_model.enable_adapter_layers() + + def get_base_model(self) -> torch.nn.Module: + """ + Returns the base model. + """ + return self.base_model if self.active_peft_config.is_prompt_learning else self.base_model.model + + def add_adapter(self, adapter_name: str, peft_config: PeftConfig, low_cpu_mem_usage: bool = False) -> None: + """ + Add an adapter to the model based on the passed configuration. + + This adapter is not trained. To load a trained adapter, check out [`PeftModel.load_adapter`]. + + The name for the new adapter should be unique. + + The new adapter is not automatically set as the active adapter. Use [`PeftModel.set_adapter`] to set the active + adapter. 
+ + Args: + adapter_name (`str`): + The name of the adapter to be added. + peft_config ([`PeftConfig`]): + The configuration of the adapter to be added. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the process when loading saved + adapters. Don't use this option when creating a new PEFT adapter for training. + + """ + prefix = PEFT_TYPE_TO_PREFIX_MAPPING.get(peft_config.peft_type) + if prefix and adapter_name in prefix: + warnings.warn( + f"Adapter name '{adapter_name}' should not be contained in the prefix '{prefix}'. " + "This may lead to reinitialization of the adapter weights during loading." + ) + + if peft_config.peft_type != self.peft_type: + raise ValueError( + f"Cannot combine adapters with different peft types. " + f"Found {self.peft_type} and {peft_config.peft_type}." + ) + + try: + if peft_config.is_prompt_learning: + self.peft_config[adapter_name] = peft_config + if hasattr(self.config, "to_dict"): + dict_config = self.config.to_dict() + else: + dict_config = self.config + + peft_config = _prepare_prompt_learning_config(peft_config, dict_config) + self._setup_prompt_encoder(adapter_name) + set_additional_trainable_modules( + model=self.base_model, + peft_config=peft_config, + model_config=BaseTuner.get_model_config(self), + adapter_name=adapter_name, + ) + elif peft_config.is_adaption_prompt: + self.base_model.add_adapter(adapter_name, peft_config) + set_additional_trainable_modules( + model=self.base_model, + peft_config=peft_config, + model_config=BaseTuner.get_model_config(self), + adapter_name=adapter_name, + ) + else: + self.peft_config[adapter_name] = peft_config + self.base_model.inject_adapter( + self.base_model.model, adapter_name, low_cpu_mem_usage=low_cpu_mem_usage + ) + except Exception: # something went wrong, roll back + if adapter_name in self.peft_config: + del self.peft_config[adapter_name] + raise + + def delete_adapter(self, adapter_name: str) -> 
None: + """ + Deletes an existing adapter. + + Args: + adapter_name (str): Name of the adapter to be deleted. + """ + if adapter_name not in self.peft_config: + raise ValueError(f"Adapter {adapter_name} does not exist") + + self.base_model.delete_adapter(adapter_name=adapter_name) + new_active_adapters = self.active_adapters + num_adapters = len(new_active_adapters) + # Note: PeftModel assumes that there is exactly one active adapter, so we should theoretically raise if + # num_adapters != 1. However, we have allowed this in the past (maybe inadvertently), so we let it slip and + # don't introduce a backwards incompatibility by raising an error. + if num_adapters == 1: + self.active_adapter = new_active_adapters[0] + + @property + def modules_to_save(self) -> Optional[set[str]]: + modules: set[str] = set() + for config in self.peft_config.values(): + if getattr(config, "modules_to_save", None) is not None: + # modules_to_save can only be a sequence of str, not a str + modules.update(config.modules_to_save) + + if not modules: + # for backwards compatibility, as modules_to_save was initialized as None + return None + return modules + + def get_layer_status(self) -> list[TunerLayerStatus]: + """Get the status of each adapter layer in the model. + + This method returns a list of `TunerLayerStatus` dataclass instances, each of which contains the following + attributes: + + - `name` (`str`): + The name of the adapter layer, e.g. `model.encoder.block.0.layer.0.SelfAttention.q`. + - `module_type` (`str`): + The type of the adapter layer, e.g. `lora.Linear`. + - `enabled` (`bool`): + Whether the adapter layer is enabled. + - `active_adapters` (`list[str]`): + The names of the active adapters, if any, e.g. `["default"]`. + - `merged_adapters` (`list[str]`): + The names of the merged adapters, if any, e.g. `["default"]`. + - `available_adapters` (`list[str]`): + The names of the available adapters, e.g. `["default"]`. 
+ + Args: + model ([`~PeftModel`]): + The model to get the adapter layer status from. + + Returns: + list[`peft.peft_model.TunerLayerStatus`]: + A list of dataclasses, each containing the status of the corresponding adapter layer. + + """ + return get_layer_status(self) + + def get_model_status(self) -> TunerModelStatus: + """Get the status of tuners of the model. + + This method returns a `TunerModelStatus` dataclass instance, which contains the following attributes: + + - `base_model_type` (`str`): + The type of the base model, e.g. `T5Model`. + - `adapter_model_type` (`str`): + The type of the adapter model, e.g. `LoraModel`. + - `peft_types` (`dict[str, str]`): + The mapping of adapter name to adapter type, e.g. `{"default": "LORA"}`. + - `trainable_params` (`int`): + The number of trainable parameters in the model. + - `total_params` (`int`): + The total number of parameters in the model. + - `num_adapter_layers` (`int`): + The number of adapter layers in the model. + - `enabled` (`bool`, `Literal["irregular"]`): + Whether all adapter layers are enabled. If some are enabled and some are not, this will be `"irregular"`. + This means that your model is in an inconsistent state and might not work as expected. + - `active_adapters` (`list[str]`, `Literal["irregular"]`): + The names of the active adapters. If the active adapters are not consistent across all layers, this will be + `"irregular"`, which means that your model is in an inconsistent state and might not work as expected. + - `merged_adapters` (`list[str]`, `Literal["irregular"]`): + The names of the merged adapters. If the merged adapters are not consistent across all layers, this will be + `"irregular"`, which means that your model is in an inconsistent state and might not work as expected. + - `available_adapters` (`list[str]`): + The names of the available adapters, e.g. `["default"]`. + + Args: + model ([`~PeftModel`]): + The model to get the adapter layer status from. 
+ + Returns: + `peft.peft_model.TunerModelStatus`: + A dataclass containing the status of the model. + + """ + return get_model_status(self) + + @classmethod + def _split_kwargs(cls, kwargs: dict[str, Any]): + _kwargs_not_in_hf_hub_download_signature = ("use_auth_token",) + hf_hub_download_kwargs = {} + other_kwargs = {} + + for key, value in kwargs.items(): + if key in inspect.signature(hf_hub_download).parameters or key in _kwargs_not_in_hf_hub_download_signature: + hf_hub_download_kwargs[key] = value + else: + other_kwargs[key] = value + + return hf_hub_download_kwargs, other_kwargs + + def _update_offload(self, offload_index: dict[str, dict[str, str]], adapters_weights: dict[str, torch.tensor]): + """ + Update the offload_index and safetensors files for loading and mergine PeftModels with disk-offloaded modules. + + Args: + offload_index (Dict[str: str]): + Dictionary of disk-offloaded modules with their metadata and safetensors filenames + adapters_weights (Dict[str: torch.tensor]): + Dictionary of Peft adapter module names and weights + """ + + if not offload_index: + return offload_index + + prefix = "base_model.model." + # rename offload index weight and model names + adapter_names = list(self.peft_config.keys()) + for adapter_name in adapter_names: + keys = list(offload_index.keys()) + block_id = keys[0].split(".")[0] + "." 
# for writing safetensors key, + + # replace original offload index keys with PeftModel keys + for key in keys: + suffix_pos = key.rfind(".") + extended_prefix = prefix + key[:suffix_pos] + module = dict(self.named_modules())[extended_prefix] + if isinstance(module, BaseTunerLayer): + new_key = prefix + key[:suffix_pos] + ".base_layer" + key[suffix_pos:] + else: + new_key = prefix + key + offload_index[key]["weight_name"] = new_key + offload_index[new_key] = offload_index[key] + del offload_index[key] + + files_seen = set() + # rename safetensors for dispatch + for new_key in list(offload_index.keys()): + fname = offload_index[new_key]["safetensors_file"] + + # make a new file name + new_fname_list = list(fname.split(os.sep)) + for i, name in enumerate(new_fname_list): + if "--" in name: + new_fname_list[i] += "-peft" + break + new_fname = os.path.join(*new_fname_list) + + if fname in files_seen: + continue + safe_dict = {} + with safe_open(fname, framework="pt") as f: + for safe_key in f.keys(): + safe_tensor = f.get_tensor(safe_key) + metadata = f.metadata() + suffix_pos = safe_key.rfind(".") + extended_prefix = prefix + block_id + safe_key[:suffix_pos] + safe_module = dict(self.named_modules())[extended_prefix] + if isinstance(safe_module, BaseTunerLayer): + final_key = extended_prefix + ".base_layer" + safe_key[suffix_pos:] + lora_dict = {key: val for key, val in adapters_weights.items() if extended_prefix in key} + + # add LoRA keys and values to disk offload + for lora_key, lora_val in lora_dict.items(): + divide = lora_key.rfind(".") + new_key = lora_key[:divide] + f".{adapter_name}" + lora_key[divide:] + safe_dict[new_key] = lora_val + else: + final_key = prefix + block_id + safe_key + safe_dict[final_key] = safe_tensor + files_seen.add(new_fname) + + # avoid overwriting original safetensors + for key in safe_dict.keys(): + offload_index[key] = {"safetensors_file": new_fname, "weight_name": key} + + base_name = os.path.dirname(new_fname) + if not 
os.path.exists(base_name): + os.makedirs(base_name) + safe_save_file(safe_dict, new_fname, metadata=metadata) + + def _check_new_adapter_config(self, peft_config: PeftConfig, is_trainable: bool) -> None: + """Perform checks on newly added PEFT configs to ensure integrity.""" + if peft_config.is_prompt_learning and is_trainable: + raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.") + + # Since PiSSA/CorDA/OLoRA modifies the base weights, it should not be combined with other adapters. + all_configs = [peft_config] + list(self.peft_config.values()) + if len(all_configs) > 1: + if any(getattr(config, "init_lora_weights", None) == "pissa" for config in all_configs): + msg = ( + "PiSSA changes the base weights of the model and should thus not be used with other adapters. " + "Consider converting the PiSSA adapter into a normal LoRA adapter: " + "https://github.com/huggingface/peft/tree/main/examples/pissa_finetuning#convert-pissa-to-lora" + ) + warnings.warn(msg) + elif any(getattr(config, "init_lora_weights", None) == "corda" for config in all_configs): + msg = ( + "CorDA changes the base weights of the model and should thus not be used with other adapters. " + "Consider converting the CorDA adapter into a normal LoRA adapter: " + "https://github.com/huggingface/peft/tree/main/examples/corda_finetuning#convert-corda-to-lora" + ) + warnings.warn(msg) + elif any(getattr(config, "init_lora_weights", None) == "olora" for config in all_configs): + msg = ( + "OLoRA changes the base weights of the model and should thus not be used with other adapters. 
" + "Consider converting the OLoRA adapter into a normal LoRA adapter: " + "https://github.com/huggingface/peft/tree/main/examples/olora_finetuning#olora-and-lora" + ) + warnings.warn(msg) + + def load_adapter( + self, + model_id: Union[str, os.PathLike], + adapter_name: str, + is_trainable: bool = False, + torch_device: Optional[str] = None, + autocast_adapter_dtype: bool = True, + ephemeral_gpu_offload: bool = False, + low_cpu_mem_usage: bool = False, + key_mapping: Optional[dict[str, str]] = None, + **kwargs: Any, + ): + """ + Load a trained adapter into the model. + + The name for the new adapter should be unique. + + The new adapter is not automatically set as the active adapter. Use [`PeftModel.set_adapter`] to set the active + adapter. + + Args: + model_id (`str` or `os.PathLike`): + The name of the PEFT configuration to use. Can be either: + - A string, the `model id` of a PEFT configuration hosted inside a model repo on the Hugging Face + Hub. + - A path to a directory containing a PEFT configuration file saved using the `save_pretrained` + method (`./my_peft_config_directory/`). + adapter_name (`str`): + The name of the adapter to be added. + is_trainable (`bool`, *optional*, defaults to `False`): + Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be + used for inference. + torch_device (`str`, *optional*, defaults to None): + The device to load the adapter on. If `None`, the device will be inferred. + autocast_adapter_dtype (`bool`, *optional*, defaults to `True`): + Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter + weights using float16 and bfloat16 to float32, as this is typically required for stable training, and + only affect select PEFT tuners. + ephemeral_gpu_offload (`bool`, *optional*, defaults to `False`): + Whether to use ephemeral GPU offloading for partially loaded modules. Defaults to `False`. 
+ low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device before loading the saved weights. Useful to speed up the + process. + key_mapping (dict, *optional*, defaults to None) + Extra mapping of PEFT `state_dict` keys applied before loading the `state_dict`. When this mapping is + applied, the PEFT-specific `"base_model.model"` prefix is removed beforehand and the adapter name (e.g. + `"default"`) is not inserted yet. Only pass this argument if you know what you're doing. + kwargs: (`optional`): + Additional arguments to modify the way the adapter is loaded, e.g. the token for Hugging Face Hub. + """ + from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING + + hf_hub_download_kwargs, kwargs = self._split_kwargs(kwargs) + if torch_device is None: + torch_device = infer_device() + + if adapter_name not in self.peft_config: + # load the config + peft_config = PEFT_TYPE_TO_CONFIG_MAPPING[ + PeftConfig._get_peft_type( + model_id, + **hf_hub_download_kwargs, + ) + ].from_pretrained( + model_id, + ephemeral_gpu_offload=ephemeral_gpu_offload, + **hf_hub_download_kwargs, + ) + self._check_new_adapter_config(peft_config, is_trainable=is_trainable) + peft_config.inference_mode = not is_trainable + self.add_adapter(adapter_name, peft_config, low_cpu_mem_usage=low_cpu_mem_usage) + + adapters_weights = load_peft_weights( + model_id, device=torch_device, key_mapping=key_mapping, **hf_hub_download_kwargs + ) + + # load the weights into the model + ignore_mismatched_sizes = kwargs.get("ignore_mismatched_sizes", False) + load_result = set_peft_model_state_dict( + self, + adapters_weights, + adapter_name=adapter_name, + ignore_mismatched_sizes=ignore_mismatched_sizes, + low_cpu_mem_usage=low_cpu_mem_usage, + ) + + tuner = self.peft_config[adapter_name].peft_type + tuner_prefix = PEFT_TYPE_TO_PREFIX_MAPPING.get(tuner, "") + adapter_missing_keys = [] + + # Filter missing keys specific to the current adapter and tuner prefix. 
+ for key in load_result.missing_keys: + if tuner_prefix in key and adapter_name in key: + adapter_missing_keys.append(key) + + load_result.missing_keys.clear() + load_result.missing_keys.extend(adapter_missing_keys) + + if ( + (getattr(self, "hf_device_map", None) is not None) + and (len(set(self.hf_device_map.values()).intersection({"cpu", "disk"})) > 0) + and len(self.peft_config) == 1 + ): + device_map = kwargs.get("device_map", "auto") + max_memory = kwargs.get("max_memory", None) + offload_folder = kwargs.get("offload_folder", None) + offload_dir = kwargs.get("offload_dir", None) + offload_index = kwargs.get("offload_index", None) + + if offload_dir is not None and offload_folder is not None: + # see https://github.com/huggingface/peft/issues/2541 + raise ValueError("Cannot use `offload_folder` when `offload_dir` is specified.") + elif offload_dir is None: + # to keep backwards compatibility + offload_dir = offload_folder + + dispatch_model_kwargs = {} + # Safety checker for previous `accelerate` versions + # `offload_index` was introduced in https://github.com/huggingface/accelerate/pull/873/ + if "offload_index" in inspect.signature(dispatch_model).parameters: + dispatch_model_kwargs["offload_index"] = offload_index + + no_split_module_classes = self._no_split_modules + + if device_map != "sequential": + max_memory = get_balanced_memory( + self, + max_memory=max_memory, + no_split_module_classes=no_split_module_classes, + low_zero=(device_map == "balanced_low_0"), + ) + + if isinstance(device_map, str): + device_map = infer_auto_device_map( + self, max_memory=max_memory, no_split_module_classes=no_split_module_classes + ) + + self._update_offload(offload_index, adapters_weights) + dispatch_model_kwargs["offload_index"] = offload_index + + dispatch_model( + self, + device_map=device_map, + offload_dir=offload_dir, + **dispatch_model_kwargs, + ) + + hook = AlignDevicesHook(io_same_device=True) + if self.peft_config[adapter_name].is_prompt_learning: + 
remove_hook_from_submodules(self.prompt_encoder) + add_hook_to_module(self.get_base_model(), hook) + + if hasattr(self.base_model, "_cast_adapter_dtype"): + self.base_model._cast_adapter_dtype( + adapter_name=adapter_name, autocast_adapter_dtype=autocast_adapter_dtype + ) + + # Set model in evaluation mode to deactivate Dropout modules by default + if not is_trainable: + self.eval() + return load_result + + def set_adapter(self, adapter_name: str) -> None: + """ + Sets the active adapter. + + Only one adapter can be active at a time. + + Additionally, this function will set the specified adapter to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (`str`): + The name of the adapter to be set as active. The adapter must be loaded first. + """ + if adapter_name not in self.peft_config: + raise ValueError(f"Adapter {adapter_name} not found.") + self.active_adapter = adapter_name + if not self.peft_config[adapter_name].is_prompt_learning: + # _set_adapter does not need to be called, since it's called through the BaseTuner class. + self.base_model.set_adapter(adapter_name) + else: + # handle auxiliary modules + _set_adapter(self, adapter_name) + + def set_requires_grad(self, adapter_names: str | Sequence[str], requires_grad: bool = True) -> None: + """ + Enable or disable gradients on the given adapter(s). + + Note: Not supported for prompt learning methods like prompt tuning. + + Args: + adapter_name (`str` or `Sequence[str]`): + The name of the adapter(s) whose gradients should be enabled/disabled. + requires_grad (`bool`, *optional*) + Whether to enable (`True`, default) or disable (`False`). 
def create_or_update_model_card(self, output_dir: str):
    """
    Create the model card in `output_dir`, or update the one that already exists there, with PEFT information:

    1. Adds `peft` as library name and merges the base model tags with the PEFT specific tags
    2. Adds peft version
    3. Adds base model info
    4. Adds quantization information if it was used

    Args:
        output_dir (`str`):
            Directory in which the `README.md` model card is looked up and into which it is saved.
    """

    filename = os.path.join(output_dir, "README.md")

    card = ModelCard.load(filename) if os.path.exists(filename) else ModelCard.from_template(ModelCardData())

    card.data["library_name"] = "peft"

    tags = set()
    base_model = self.get_base_model()
    if hasattr(base_model, "model_tags"):
        tags = tags.union(base_model.model_tags or [])

    tags = tags.union(self._get_peft_specific_model_tags())
    if tags:
        card.data["tags"] = sorted(tags)

    # One of the rare moments where we can select the pipeline tag with certainty, so let's do that.
    # Makes it easier to deploy an adapter with auto inference since the user doesn't have to add any tags.
    if not card.data.pipeline_tag and isinstance(self, PeftModelForCausalLM):
        card.data.pipeline_tag = "text-generation"

    model_config = BaseTuner.get_model_config(self)
    model_config = None if model_config == DUMMY_MODEL_CONFIG else model_config
    if model_config is not None and "_name_or_path" in model_config:
        card.data["base_model"] = model_config["_name_or_path"]

    lines = card.text.splitlines()

    # The model config is accessed like a mapping just above (`"_name_or_path" in model_config`). `hasattr` never
    # sees the keys of a mapping, so checking it for a `quantization_config` attribute would silently skip the
    # quantization section. Look the entry up by key and only fall back to attribute access for config objects.
    if isinstance(model_config, dict):
        raw_quantization_config = model_config.get("quantization_config")
    else:
        raw_quantization_config = getattr(model_config, "quantization_config", None)

    quantization_config = None
    if raw_quantization_config is not None:
        if hasattr(raw_quantization_config, "to_dict"):
            quantization_config = raw_quantization_config.to_dict()
        else:
            # already a plain mapping of option name -> value
            quantization_config = dict(raw_quantization_config)

    training_config_text = ""
    quantization_prefix = "The following `bitsandbytes` quantization config was used during training:"
    # Adds quantization information if it was used
    if quantization_config is not None:
        training_config_text += f"\n{quantization_prefix}\n"
        training_config_text += "\n".join([f"- {name}: {value}" for name, value in quantization_config.items()])
        training_config_text += "\n"

    training_procedure_heading = "## Training procedure"
    # `quantization_prefix not in lines` keeps the section from being added twice when the card is re-saved.
    if quantization_prefix not in lines and bool(training_config_text):
        if training_procedure_heading in lines:
            # insert two lines below the heading
            lines.insert(lines.index(training_procedure_heading) + 2, training_config_text)
        else:
            lines.append(f"{training_procedure_heading}\n{training_config_text}")

    # Adds peft version
    framework_block_heading = "### Framework versions"
    if f"- PEFT {__version__}" not in lines:
        if framework_block_heading in lines:
            lines.insert(lines.index(framework_block_heading) + 2, f"- PEFT {__version__}")
        else:
            lines.append(f"{framework_block_heading}\n\n- PEFT {__version__}")

    card.text = "\n".join(lines)
    card.save(filename)
def __init__(
    self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default", **kwargs
) -> None:
    """
    Wrap `model` for sequence classification and make sure its classification head stays trainable.

    Args:
        model (`torch.nn.Module`): The base model to wrap.
        peft_config ([`PeftConfig`]):
            The adapter config. If it has a `modules_to_save` attribute, the classifier module names
            (`"classifier"`, `"score"`) are added to it.
        adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`.
        **kwargs: Forwarded unchanged to the parent class initializer.
    """
    classifier_module_names = ["classifier", "score"]

    if hasattr(peft_config, "modules_to_save"):
        requested_modules = peft_config.modules_to_save or []
        # Assign a new list instead of extending the existing one in place: the list object may be shared with
        # the caller or with another config, and re-using a config must not pile up repeated classifier entries.
        # `dict.fromkeys` drops repeated names while keeping the first-seen order.
        peft_config.modules_to_save = list(dict.fromkeys([*requested_modules, *classifier_module_names]))

    # The modification of peft_config must happen before the init call as the `modules_to_save` information
    # will be used to guard the target layer matching against matching `modules_to_save` layers. Only the
    # config is relevant for this, the `modules_to_save` attribute can follow later.
    super().__init__(model, peft_config, adapter_name, **kwargs)

    if hasattr(peft_config, "modules_to_save"):
        # Remember the first direct child of the base model whose name contains one of the saved module names.
        for name, _ in self.base_model.named_children():
            if any(module_name in name for module_name in self.modules_to_save):
                self.cls_layer_name = name
                break

    # to make sure classifier layer is trainable; this may add a new ModulesToSaveWrapper
    _set_trainable(
        self,
        adapter_name,
        module_names=getattr(peft_config, "modules_to_save", None),
        inference_mode=peft_config.inference_mode,
    )
Use [`PeftModel.set_adapter`] to set the active + adapter. + + Args: + adapter_name (`str`): + The name of the adapter to be added. + peft_config ([`PeftConfig`]): + The configuration of the adapter to be added. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the process when loading saved + adapters. Don't use this option when creating a new PEFT adapter for training. + + """ + # ensure that additional adapters also add the classifier layer to modules_to_save + if hasattr(peft_config, "modules_to_save"): + classifier_module_names = ["classifier", "score"] + if peft_config.modules_to_save is None: + peft_config.modules_to_save = classifier_module_names[:] + else: + peft_config.modules_to_save.extend(classifier_module_names) + + return super().add_adapter(adapter_name, peft_config, low_cpu_mem_usage=low_cpu_mem_usage) + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + peft_config = self.active_peft_config + if not peft_config.is_prompt_learning: + with self._enable_peft_forward_hooks(**kwargs): + kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) + 
attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + kwargs["position_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "labels": labels, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + return self._prefix_tuning_forward(input_ids=input_ids, **kwargs) + else: + if kwargs.get("token_type_ids", None) is not None: + kwargs["token_type_ids"] = torch.cat( + ( + torch.zeros(batch_size, peft_config.num_virtual_tokens).to(self.word_embeddings.weight.device), + kwargs["token_type_ids"], + ), + dim=1, + ).long() + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) + + def _prefix_tuning_forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + batch_size = _get_batch_size(input_ids, inputs_embeds) + past_key_values = self.get_prompt(batch_size) + fwd_params = list(inspect.signature(self.base_model.forward).parameters.keys()) + kwargs.update( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "inputs_embeds": inputs_embeds, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + "past_key_values": past_key_values, + } + ) + if "past_key_values" in fwd_params: + return self.base_model(labels=labels, **kwargs) + else: + transformer_backbone_name = 
self.base_model.get_submodule(self.transformer_backbone_name) + fwd_params = list(inspect.signature(transformer_backbone_name.forward).parameters.keys()) + if "past_key_values" not in fwd_params: + raise ValueError("Model does not support past key values which are required for prefix tuning.") + outputs = transformer_backbone_name(**kwargs) + pooled_output = outputs[1] if len(outputs) > 1 else outputs[0] + if "dropout" in [name for name, _ in list(self.base_model.named_children())]: + pooled_output = self.base_model.dropout(pooled_output) + logits = self.base_model.get_submodule(self.cls_layer_name)(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.base_model.num_labels == 1: + self.config.problem_type = "regression" + elif self.base_model.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.base_model.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.base_model.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class PeftModelForCausalLM(PeftModel): + """ + Peft model for causal language modeling. + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. 
+ peft_config ([`PeftConfig`]): Peft config. + adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`. + autocast_adapter_dtype (`bool`, *optional*): + Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights + using float16 and bfloat16 to float32, as this is typically required for stable training, and only affect + select PEFT tuners. + + Example: + + ```py + >>> from transformers import AutoModelForCausalLM + >>> from peft import PeftModelForCausalLM, get_peft_config + + >>> config = { + ... "peft_type": "PREFIX_TUNING", + ... "task_type": "CAUSAL_LM", + ... "inference_mode": False, + ... "num_virtual_tokens": 20, + ... "token_dim": 1280, + ... "num_transformer_submodules": 1, + ... "num_attention_heads": 20, + ... "num_layers": 36, + ... "encoder_hidden_size": 1280, + ... "prefix_projection": False, + ... "postprocess_past_key_value_function": None, + ... } + + >>> peft_config = get_peft_config(config) + >>> model = AutoModelForCausalLM.from_pretrained("gpt2-large") + >>> peft_model = PeftModelForCausalLM(model, peft_config) + >>> peft_model.print_trainable_parameters() + trainable params: 1843200 || all params: 775873280 || trainable%: 0.23756456724479544 + ``` + """ + + def __init__( + self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default", **kwargs + ) -> None: + super().__init__(model, peft_config, adapter_name, **kwargs) + self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + peft_config = self.active_peft_config + + if not peft_config.is_prompt_learning: + # Adds alora_offsets to kwargs if relevant. No other modifications. 
+ kwargs = get_alora_offsets_for_forward(self, input_ids, inputs_embeds, **kwargs) + if self.base_model.config.model_type == "mpt": + if inputs_embeds is not None: + raise AssertionError("forward in MPTForCausalLM does not support inputs_embeds") + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + + with self._enable_peft_forward_hooks(**kwargs): + kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + kwargs["position_ids"] = None + if kwargs.get("token_type_ids", None) is not None: + warnings.warn("Token type ids are not supported for parameter efficient tuning. 
def _cpt_forward(self, input_ids, inputs_embeds, peft_config, task_ids, batch_size, **kwargs):
    """
    Forward pass for the CPT prompt learning method.

    The prompt returned by `self.get_prompt` is prepended to the input embeddings. When labels are given, the
    labels and the token type mask are extended by the prompt part as well, and every position whose type mask
    value is not a positive multiple of 4 is excluded from the loss by setting its label to -100.

    Args:
        input_ids: Token ids; only used to compute the embeddings when `inputs_embeds` is `None`.
        inputs_embeds: Pre-computed input embeddings, or `None`.
        peft_config: Config of the active adapter; provides `cpt_token_ids`, `cpt_tokens_type_mask` and
            `peft_type`.
        task_ids: Forwarded to `self.get_prompt`.
        batch_size: Number of samples in the batch.
        **kwargs: `labels` and `input_type_mask` are consumed here; everything else is forwarded to the base
            model. Without `input_type_mask`, every input token gets the type value 4.

    Returns:
        The output of the base model if no labels are given, otherwise the result of the CPT tuner's
        `calculate_loss`.
    """
    # Extract labels from kwargs
    labels = kwargs.pop("labels", None)
    device = [i.device for i in [input_ids, inputs_embeds, labels] if i is not None][0]
    # Extract input_type_mask from kwargs and move it to that device
    if "input_type_mask" in kwargs:
        input_type_mask = kwargs.pop("input_type_mask").to(device)
    else:
        n_tokens = inputs_embeds.shape[1] if input_ids is None else input_ids.shape[1]
        input_type_mask = torch.ones((batch_size, n_tokens)).to(device) * 4

    cpt_token_ids = peft_config.cpt_token_ids
    cpt_tokens_type_mask = peft_config.cpt_tokens_type_mask

    # Generate embeddings if not provided
    if inputs_embeds is None:
        inputs_embeds = self.word_embeddings(input_ids)
    # Get prompt and concatenate with input embeddings
    prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids)
    prompts = prompts.to(inputs_embeds.dtype)
    inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1)

    # If labels are provided, generate prefix labels and type mask
    cpt_labels = None
    cpt_type_mask = None
    if labels is not None:
        # Generate prefix labels and concatenate with the input labels. The ids are created as integers right
        # away instead of taking a detour through a float tensor.
        prefix_labels = torch.as_tensor(cpt_token_ids, dtype=torch.long).view(1, -1)
        prefix_labels = prefix_labels.repeat(batch_size, 1).to(labels.device)
        cpt_labels = torch.cat((prefix_labels, labels), dim=1)
        # Generate prefix type mask and shift input type mask values to avoid conflicts
        prefix_type_mask = torch.as_tensor(cpt_tokens_type_mask, dtype=torch.long).view(1, -1)
        prefix_type_mask = prefix_type_mask.repeat(batch_size, 1).to(labels.device)
        # Work on a copy: `.to(device)` hands back the very same tensor when it already lives on `device`, so
        # shifting in place would modify the mask owned by the caller and shift it again on every call.
        adjusted_input_type_mask = input_type_mask.clone()
        adjusted_input_type_mask[adjusted_input_type_mask > 0] += prefix_type_mask.max()
        # Concatenate prefix and shifted input type masks
        cpt_type_mask = torch.cat((prefix_type_mask, adjusted_input_type_mask), dim=1)
        # Identify valid label positions and mask invalid ones with -100
        labels_idx = (cpt_type_mask > 0) & (cpt_type_mask % 4 == 0)
        cpt_labels[~labels_idx] = -100

    # Update kwargs with the modified labels
    kwargs["labels"] = cpt_labels
    # Pass the modified inputs to the base model
    base_model_output = self.base_model(inputs_embeds=inputs_embeds, **kwargs)
    if labels is None:
        return base_model_output

    # Calculate the loss using the custom CPT loss function. The config of the adapter that produced the prompt
    # is used, whatever name that adapter was registered under.
    cpt_embedding = PEFT_TYPE_TO_TUNER_MAPPING[peft_config.peft_type]
    return cpt_embedding.calculate_loss(base_model_output, cpt_labels, cpt_type_mask, peft_config)
+ uses_transformers_4_38 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.38.0") + uses_transformers_4_36 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.36.0") + transformers_new_cache_archs = ["llama", "mistral", "persimmon", "phi"] + if packaging.version.parse(transformers.__version__) > packaging.version.parse("4.43.3"): + # https://github.com/huggingface/transformers/pull/31445 + transformers_new_cache_archs.append("bloom") + + uses_cache = uses_transformers_4_38 or ( + uses_transformers_4_36 and self.base_model.config.model_type in transformers_new_cache_archs + ) + + # heuristic to determine if we're in 'prefill stage' (when the KV cache is filled with the values from the + # initial input) + is_prefill = (model_kwargs.get("cache_position") is not None) and (model_kwargs["cache_position"][0] == 0) + + if peft_config.peft_type == PeftType.POLY: + model_kwargs["task_ids"] = task_ids + if peft_config.is_prompt_learning: + if uses_cache and (model_kwargs.get("past_key_values", None) is not None): + # change in the logic of `prepare_inputs_for_generation` makes the below code necessary + # In prompt learning methods, past key values are longer when compared to the `input_ids`. + # As such only consider the last input ids in the autogressive generation phase. 
+ past_key_values = model_kwargs["past_key_values"] + if isinstance(past_key_values, (tuple, list)): + seq_len = past_key_values[0][0].shape[-2] + else: # using transformers kv cache + seq_len = past_key_values.get_seq_length() + if seq_len >= model_kwargs["input_ids"].shape[1]: + model_kwargs["input_ids"] = model_kwargs["input_ids"][:, -1:] + + if (attention_mask := model_kwargs.get("attention_mask", None)) is not None: + if isinstance(attention_mask, dict): + # see: https://github.com/huggingface/transformers/pull/37866 + # For now, just deal with the case of a single attention mask + if len(attention_mask) != 1: + raise ValueError( + f"Expected a single attention mask, got {len(attention_mask)} instead, please open an " + "issue (https://github.com/huggingface/peft/issues) and report the error." + ) + attention_mask = list(attention_mask.values())[0] + + size = model_kwargs["input_ids"].shape[0], peft_config.num_virtual_tokens + prefix_attention_mask = torch.ones(size).to(model_kwargs["input_ids"].device) + if attention_mask.dim() == 4: + # Transform the 4d attention mask to 2d, leave it up to the model to deal with it instead of trying + # to create a 4d attention mask here. 
+ # from [batch_size, heads, input_ids_length, total_sequence_length] + # to [batch_size, total_sequence_length] + bs = attention_mask.shape[0] + total_seq_len = prefix_attention_mask.shape[1] + attention_mask.shape[2] + attention_mask_2d = torch.ones((bs, total_seq_len), dtype=attention_mask.dtype) + + if is_prefill and (peft_config.peft_type != PeftType.PREFIX_TUNING): + # if in prefill stage, for prompt learning methods that are not prefix tuning, new tokens + # (embeddings) are inserted, thus set cache_position to correspond to these tokens + cache_position_ = torch.arange(total_seq_len, device=model_kwargs["input_ids"].device) + else: + # prefix tuning acts directly on the cache, no need to upate cache_position + cache_position_ = model_kwargs["cache_position"] + + attention_mask_new = create_attention_mask( + self.get_base_model(), + model_input=None, + attention_mask=attention_mask_2d, + past_key_values=model_kwargs.get("past_key_values"), + cache_position=cache_position_, + batch_size=bs, + sequence_length=total_seq_len, + position_ids=model_kwargs.get("position_ids", None), + ) + model_kwargs["attention_mask"] = attention_mask_new + else: + # 2d attention mask + model_kwargs["attention_mask"] = torch.cat((prefix_attention_mask, attention_mask), dim=1) + + if model_kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + model_kwargs["position_ids"] = None + + if kwargs.get("token_type_ids", None) is not None: + warnings.warn( + "Token type ids are not supported for parameter efficient tuning. 
Ignoring token type ids" + ) + kwargs["token_type_ids"] = None + + cache: transformers.Cache | None = model_kwargs.get("past_key_values", None) + # no past_key_values or past_key_values empty cache + requires_prompt_injection = (cache is None) or ( + isinstance(cache, transformers.Cache) and not cache.get_seq_length() + ) + + if requires_prompt_injection and peft_config.peft_type == PeftType.PREFIX_TUNING: + # some archs require max_cache_len to re-initialize the cache, but DynamicCache has no max len + if isinstance(cache, transformers.Cache) and not isinstance(cache, transformers.DynamicCache): + max_cache_len = cache.max_cache_len + else: + max_cache_len = -1 # -1 means no max length + new_past_key_values = self.get_prompt( + batch_size=model_kwargs["input_ids"].shape[0], + max_cache_len=max_cache_len, + ) + model_kwargs["past_key_values"] = new_past_key_values + elif requires_prompt_injection: + inputs_embeds = self.word_embeddings(model_kwargs["input_ids"]) + prompts = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0], task_ids=task_ids) + prompts = prompts.to(inputs_embeds.dtype) + model_kwargs["inputs_embeds"] = torch.cat((prompts, inputs_embeds), dim=1) + model_kwargs["input_ids"] = None + + # if we're in the prefill stage + if is_prefill and (peft_config.peft_type == PeftType.PREFIX_TUNING): + # for prefix tuning, the past_key_values have been prefilled + model_kwargs["cache_position"] += peft_config.num_virtual_tokens + elif peft_config.peft_type != PeftType.PREFIX_TUNING: # prefix tuning needs cache_position + # For transformers>=4.38.0 - for some architectures such as Llama, `cache_position` is passed in the forward + # pass to keep track of the position ids of the cache. 
We have to pop that from `model_kwargs` as + # `cache_position` is properly created by the model, using the passed `inputs_embeds`: + # https://github.com/huggingface/transformers/blob/593230f0a1150ea9c0477b9d859f25daf73c8c33/src/transformers/models/llama/modeling_llama.py#L956 + _ = model_kwargs.pop("cache_position", None) + + return model_kwargs + + +class PeftModelForSeq2SeqLM(PeftModel): + """ + Peft model for sequence-to-sequence language modeling. + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. + peft_config ([`PeftConfig`]): Peft config. + adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`. + autocast_adapter_dtype (`bool`, *optional*): + Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights + using float16 and bfloat16 to float32, as this is typically required for stable training, and only affect + select PEFT tuners. + + Example: + + ```py + >>> from transformers import AutoModelForSeq2SeqLM + >>> from peft import PeftModelForSeq2SeqLM, get_peft_config + + >>> config = { + ... "peft_type": "LORA", + ... "task_type": "SEQ_2_SEQ_LM", + ... "inference_mode": False, + ... "r": 8, + ... "target_modules": ["q", "v"], + ... "lora_alpha": 32, + ... "lora_dropout": 0.1, + ... "fan_in_fan_out": False, + ... "enable_lora": None, + ... "bias": "none", + ... 
} + + >>> peft_config = get_peft_config(config) + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + >>> peft_model = PeftModelForSeq2SeqLM(model, peft_config) + >>> peft_model.print_trainable_parameters() + trainable params: 884736 || all params: 223843584 || trainable%: 0.3952474242013566 + ``` + """ + + def __init__( + self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default", **kwargs + ) -> None: + super().__init__(model, peft_config, adapter_name, **kwargs) + self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation + self.base_model_prepare_encoder_decoder_kwargs_for_generation = ( + self.base_model._prepare_encoder_decoder_kwargs_for_generation + ) + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + decoder_input_ids=None, + decoder_attention_mask=None, + decoder_inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + peft_config = self.active_peft_config + if not peft_config.is_prompt_learning: + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + + with self._enable_peft_forward_hooks(**kwargs): + kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_inputs_embeds=decoder_inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if decoder_attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to( + decoder_attention_mask.device + ) + if 
peft_config.peft_type not in [PeftType.PROMPT_TUNING, PeftType.P_TUNING]: + decoder_attention_mask = torch.cat((prefix_attention_mask, decoder_attention_mask), dim=1) + + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + kwargs["position_ids"] = None + if kwargs.get("token_type_ids", None) is not None: + warnings.warn("Token type ids are not supported for parameter efficient tuning. Ignoring token type ids") + kwargs["token_type_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "labels": labels, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + # overwrite past_kv in kwargs + kwargs["past_key_values"] = self.get_prompt(batch_size) + return self.base_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + decoder_inputs_embeds=decoder_inputs_embeds, + **kwargs, + ) + elif peft_config.peft_type in [PeftType.PROMPT_TUNING, PeftType.P_TUNING]: + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to( + attention_mask.device + ) + kwargs["attention_mask"] = torch.cat((prefix_attention_mask, attention_mask), dim=1) + + prompts = self.get_prompt(batch_size=batch_size) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts[:, : peft_config.num_virtual_tokens], inputs_embeds), dim=1) + + return self.base_model( + inputs_embeds=inputs_embeds, + decoder_input_ids=decoder_input_ids, + decoder_inputs_embeds=decoder_inputs_embeds, + **kwargs, + ) + else: + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + if decoder_inputs_embeds 
def generate(self, **kwargs):
    """
    Generate with the base model while the PEFT specific generation hooks are installed.

    For the duration of the call, `prepare_inputs_for_generation` and
    `_prepare_encoder_decoder_kwargs_for_generation` of the base model are replaced by the ones of this model.
    Both are put back when the call ends, no matter whether it returns normally or raises.

    Args:
        **kwargs: Arguments for the base model's `generate`. Keys listed in `self.special_peft_forward_args`
            are stripped for non prompt learning methods. Prompt learning methods require `input_ids`;
            `position_ids` and `token_type_ids` are ignored for them (with a warning), and for prompt tuning
            style methods `encoder_outputs` is ignored as well.

    Returns:
        Whatever the base model's `generate` returns.

    Raises:
        ValueError: If a prompt learning method is active and `input_ids` is missing.
        NotImplementedError: If the active prompt learning method is not supported for generation.
    """
    peft_config = self.active_peft_config
    self.base_model.prepare_inputs_for_generation = self.prepare_inputs_for_generation
    self.base_model._prepare_encoder_decoder_kwargs_for_generation = (
        self._prepare_encoder_decoder_kwargs_for_generation
    )
    try:
        if not peft_config.is_prompt_learning:
            with self._enable_peft_forward_hooks(**kwargs):
                kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args}
                return self.base_model.generate(**kwargs)

        if "input_ids" not in kwargs:
            raise ValueError("input_ids must be provided for Peft model generation")
        if kwargs.get("position_ids", None) is not None:
            warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.")
            kwargs["position_ids"] = None
        if kwargs.get("token_type_ids", None) is not None:
            warnings.warn("Token type ids are not supported for parameter efficient tuning. Ignoring token type ids")
            kwargs["token_type_ids"] = None

        if peft_config.peft_type == PeftType.PREFIX_TUNING:
            return self.base_model.generate(**kwargs)

        if peft_config.peft_type not in [
            PeftType.PROMPT_TUNING,
            PeftType.P_TUNING,
            PeftType.MULTITASK_PROMPT_TUNING,
        ]:
            raise NotImplementedError

        # The kwargs are rewritten below; copy them so the objects passed by the caller stay untouched.
        kwargs = deepcopy(kwargs)

        if "encoder_outputs" in kwargs:
            del kwargs["encoder_outputs"]
            warnings.warn(
                "`encoder_outputs` should not be passed to `generate` when using prompt tuning. Ignoring it."
            )

        # Replace the input ids by their embeddings, preceded by the virtual token embeddings.
        input_ids = kwargs.pop("input_ids")
        inputs_embeds = self.word_embeddings(input_ids)
        batch_size = inputs_embeds.shape[0]
        prompts = self.get_prompt(batch_size=batch_size, task_ids=kwargs.pop("task_ids", None))
        prompts = prompts.to(inputs_embeds.dtype)

        inputs_embeds = torch.cat((prompts[:, : peft_config.num_virtual_tokens], inputs_embeds), dim=1)
        kwargs["inputs_embeds"] = inputs_embeds

        if "attention_mask" in kwargs:
            # the virtual tokens are always attended to
            prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(
                kwargs["attention_mask"].device
            )
            kwargs["attention_mask"] = torch.cat((prefix_attention_mask, kwargs["attention_mask"]), dim=1)

        return self.base_model.generate(**kwargs)
    finally:
        # `finally` also covers the branches that return from inside the `try` block; an `else` clause would be
        # skipped by those returns and leave the patched methods on the base model.
        self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation
        self.base_model._prepare_encoder_decoder_kwargs_for_generation = (
            self.base_model_prepare_encoder_decoder_kwargs_for_generation
        )
+ + Example: + + ```py + >>> from transformers import AutoModelForSequenceClassification + >>> from peft import PeftModelForTokenClassification, get_peft_config + + >>> config = { + ... "peft_type": "PREFIX_TUNING", + ... "task_type": "TOKEN_CLS", + ... "inference_mode": False, + ... "num_virtual_tokens": 20, + ... "token_dim": 768, + ... "num_transformer_submodules": 1, + ... "num_attention_heads": 12, + ... "num_layers": 12, + ... "encoder_hidden_size": 768, + ... "prefix_projection": False, + ... "postprocess_past_key_value_function": None, + ... } + + >>> peft_config = get_peft_config(config) + >>> model = AutoModelForTokenClassification.from_pretrained("bert-base-cased") + >>> peft_model = PeftModelForTokenClassification(model, peft_config) + >>> peft_model.print_trainable_parameters() + trainable params: 370178 || all params: 108680450 || trainable%: 0.3406113979101117 + ``` + """ + + def __init__( + self, model: torch.nn.Module, peft_config: PeftConfig = None, adapter_name: str = "default", **kwargs + ) -> None: + super().__init__(model, peft_config, adapter_name, **kwargs) + + classifier_module_names = ["classifier", "score"] + if hasattr(peft_config, "modules_to_save"): + if peft_config.modules_to_save is None: + peft_config.modules_to_save = classifier_module_names[:] + else: + peft_config.modules_to_save.extend(classifier_module_names) + + for name, _ in self.base_model.named_children(): + if any(module_name in name for module_name in self.modules_to_save): + self.cls_layer_name = name + break + + # to make sure classifier layer is trainable; this may add a new ModulesToSaveWrapper + _set_trainable( + self, + adapter_name, + module_names=getattr(peft_config, "modules_to_save", None), + inference_mode=peft_config.inference_mode, + ) + + def add_adapter(self, adapter_name: str, peft_config: PeftConfig, low_cpu_mem_usage: bool = False) -> None: + """ + Add an adapter to the model based on the passed configuration. + + This adapter is not trained. 
To load a trained adapter, check out [`PeftModel.load_adapter`]. + + The name for the new adapter should be unique. + + The new adapter is not automatically set as the active adapter. Use [`PeftModel.set_adapter`] to set the active + adapter. + + Args: + adapter_name (`str`): + The name of the adapter to be added. + peft_config ([`PeftConfig`]): + The configuration of the adapter to be added. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the process when loading saved + adapters. Don't use this option when creating a new PEFT adapter for training. + + """ + # ensure that additional adapters also add the classifier layer to modules_to_save + if hasattr(peft_config, "modules_to_save"): + classifier_module_names = ["classifier", "score"] + if peft_config.modules_to_save is None: + peft_config.modules_to_save = classifier_module_names[:] + else: + peft_config.modules_to_save.extend(classifier_module_names) + + return super().add_adapter(adapter_name, peft_config, low_cpu_mem_usage=low_cpu_mem_usage) + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + peft_config = self.active_peft_config + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if not peft_config.is_prompt_learning: + with self._enable_peft_forward_hooks(**kwargs): + kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, 
inputs_embeds) + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + kwargs["position_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "labels": labels, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + return self._prefix_tuning_forward(input_ids=input_ids, **kwargs) + else: + if kwargs.get("token_type_ids", None) is not None: + kwargs["token_type_ids"] = torch.cat( + ( + torch.zeros(batch_size, peft_config.num_virtual_tokens).to(self.word_embeddings.weight.device), + kwargs["token_type_ids"], + ), + dim=1, + ).long() + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) + + def _prefix_tuning_forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + batch_size = _get_batch_size(input_ids, inputs_embeds) + past_key_values = self.get_prompt(batch_size) + fwd_params = list(inspect.signature(self.base_model.forward).parameters.keys()) + kwargs.update( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "inputs_embeds": inputs_embeds, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + 
"past_key_values": past_key_values, + } + ) + if "past_key_values" in fwd_params: + return self.base_model(labels=labels, **kwargs) + else: + transformer_backbone_name = self.base_model.get_submodule(self.transformer_backbone_name) + fwd_params = list(inspect.signature(transformer_backbone_name.forward).parameters.keys()) + if "past_key_values" not in fwd_params: + raise ValueError("Model does not support past key values which are required for prefix tuning.") + outputs = transformer_backbone_name(**kwargs) + sequence_output = outputs[0] + if "dropout" in [name for name, _ in list(self.base_model.named_children())]: + sequence_output = self.base_model.dropout(sequence_output) + logits = self.base_model.get_submodule(self.cls_layer_name)(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class PeftModelForQuestionAnswering(PeftModel): + """ + Peft model for extractive question answering. + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. + peft_config ([`PeftConfig`]): Peft config. + adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`. + autocast_adapter_dtype (`bool`, *optional*): + Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights + using float16 and bfloat16 to float32, as this is typically required for stable training, and only affect + select PEFT tuners. + + **Attributes**: + - **config** ([`~transformers.PretrainedConfig`]) -- The configuration object of the base model. + - **cls_layer_name** (`str`) -- The name of the classification layer. 
+ + Example: + + ```py + >>> from transformers import AutoModelForQuestionAnswering + >>> from peft import PeftModelForQuestionAnswering, get_peft_config + + >>> config = { + ... "peft_type": "LORA", + ... "task_type": "QUESTION_ANS", + ... "inference_mode": False, + ... "r": 16, + ... "target_modules": ["query", "value"], + ... "lora_alpha": 32, + ... "lora_dropout": 0.05, + ... "fan_in_fan_out": False, + ... "bias": "none", + ... } + + >>> peft_config = get_peft_config(config) + >>> model = AutoModelForQuestionAnswering.from_pretrained("bert-base-cased") + >>> peft_model = PeftModelForQuestionAnswering(model, peft_config) + >>> peft_model.print_trainable_parameters() + trainable params: 592900 || all params: 108312580 || trainable%: 0.5473971721475013 + ``` + """ + + def __init__( + self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default", **kwargs + ) -> None: + super().__init__(model, peft_config, adapter_name, **kwargs) + + qa_module_names = ["qa_outputs"] + if hasattr(peft_config, "modules_to_save"): + if peft_config.modules_to_save is None: + peft_config.modules_to_save = qa_module_names[:] + else: + peft_config.modules_to_save.extend(qa_module_names) + + for name, _ in self.base_model.named_children(): + if any(module_name in name for module_name in self.modules_to_save): + self.cls_layer_name = name + break + + # to make sure classifier layer is trainable; this may add a new ModulesToSaveWrapper + _set_trainable( + self, + adapter_name, + module_names=getattr(peft_config, "modules_to_save", None), + inference_mode=peft_config.inference_mode, + ) + + def add_adapter(self, adapter_name: str, peft_config: PeftConfig, low_cpu_mem_usage: bool = False) -> None: + """ + Add an adapter to the model based on the passed configuration. + + This adapter is not trained. To load a trained adapter, check out [`PeftModel.load_adapter`]. + + The name for the new adapter should be unique. 
+ + The new adapter is not automatically set as the active adapter. Use [`PeftModel.set_adapter`] to set the active + adapter. + + Args: + adapter_name (`str`): + The name of the adapter to be added. + peft_config ([`PeftConfig`]): + The configuration of the adapter to be added. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the process when loading saved + adapters. Don't use this option when creating a new PEFT adapter for training. + + """ + # ensure that additional adapters also add the classifier layer to modules_to_save + if hasattr(peft_config, "modules_to_save"): + qa_module_names = ["qa_outputs"] + if peft_config.modules_to_save is None: + peft_config.modules_to_save = qa_module_names[:] + else: + peft_config.modules_to_save.extend(qa_module_names) + + return super().add_adapter(adapter_name, peft_config, low_cpu_mem_usage=low_cpu_mem_usage) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + peft_config = self.active_peft_config + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if not peft_config.is_prompt_learning: + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + + with self._enable_peft_forward_hooks(**kwargs): + kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + start_positions=start_positions, + end_positions=end_positions, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if 
attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + kwargs["position_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "start_positions": start_positions, + "end_positions": end_positions, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + return self._prefix_tuning_forward(input_ids=input_ids, **kwargs) + else: + if kwargs.get("token_type_ids", None) is not None: + kwargs["token_type_ids"] = torch.cat( + ( + torch.zeros(batch_size, peft_config.num_virtual_tokens).to(self.word_embeddings.weight.device), + kwargs["token_type_ids"], + ), + dim=1, + ).long() + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + prompts = self.get_prompt(batch_size=batch_size) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) + + def _prefix_tuning_forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + batch_size = _get_batch_size(input_ids, inputs_embeds) + past_key_values = self.get_prompt(batch_size) + fwd_params = list(inspect.signature(self.base_model.forward).parameters.keys()) + kwargs.update( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "inputs_embeds": inputs_embeds, + "output_attentions": output_attentions, + "output_hidden_states": 
output_hidden_states, + "return_dict": return_dict, + "past_key_values": past_key_values, + } + ) + if "past_key_values" in fwd_params: + return self.base_model(start_positions=start_positions, end_positions=end_positions, **kwargs) + else: + transformer_backbone_name = self.base_model.get_submodule(self.transformer_backbone_name) + fwd_params = list(inspect.signature(transformer_backbone_name.forward).parameters.keys()) + if "past_key_values" not in fwd_params: + raise ValueError("Model does not support past key values which are required for prefix tuning.") + outputs = transformer_backbone_name(**kwargs) + sequence_output = outputs[0] + if "dropout" in [name for name, _ in list(self.base_model.named_children())]: + sequence_output = self.base_model.dropout(sequence_output) + logits = self.base_model.get_submodule(self.cls_layer_name)(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + 
start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class PeftModelForFeatureExtraction(PeftModel): + """ + Peft model for extracting features/embeddings from transformer models + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. + peft_config ([`PeftConfig`]): Peft config. + adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`. + autocast_adapter_dtype (`bool`, *optional*): + Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights + using float16 and bfloat16 to float32, as this is typically required for stable training, and only affect + select PEFT tuners. + + **Attributes**: + - **config** ([`~transformers.PretrainedConfig`]) -- The configuration object of the base model. + + Example: + + ```py + >>> from transformers import AutoModel + >>> from peft import PeftModelForFeatureExtraction, get_peft_config + + >>> config = { + ... "peft_type": "LORA", + ... "task_type": "FEATURE_EXTRACTION", + ... "inference_mode": False, + ... "r": 16, + ... "target_modules": ["query", "value"], + ... "lora_alpha": 32, + ... "lora_dropout": 0.05, + ... "fan_in_fan_out": False, + ... "bias": "none", + ... 
} + >>> peft_config = get_peft_config(config) + >>> model = AutoModel.from_pretrained("bert-base-cased") + >>> peft_model = PeftModelForFeatureExtraction(model, peft_config) + >>> peft_model.print_trainable_parameters() + ``` + """ + + def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default", **kwargs): + super().__init__(model, peft_config, adapter_name, **kwargs) + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + peft_config = self.active_peft_config + if not peft_config.is_prompt_learning: + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + + with self._enable_peft_forward_hooks(**kwargs): + kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + kwargs["position_ids"] = None + if kwargs.get("token_type_ids", None) is not None: + warnings.warn("Token type ids are not supported for parameter efficient tuning. 
Ignoring token type ids") + kwargs["token_type_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + # overwrite past_kv in kwargs + kwargs["past_key_values"] = self.get_prompt(batch_size) + return self.base_model(input_ids=input_ids, **kwargs) + else: + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + prompts = self.get_prompt(batch_size=batch_size) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) + + +@dataclass +class TunerLayerStatus: + name: str + module_type: str + enabled: bool + active_adapters: list[str] + merged_adapters: list[str] + requires_grad: dict[str, bool | Literal["irregular"]] + available_adapters: list[str] + devices: dict[str, list[str]] + + +def get_layer_status(model: torch.nn.Module) -> list[TunerLayerStatus]: + """Get the status of each adapter layer in the model. + + This function returns a list of `TunerLayerStatus` dataclass instances, each of which contains the following + attributes: + + - `name` (`str`): + The name of the adapter layer, e.g. `model.encoder.block.0.layer.0.SelfAttention.q`. + - `module_type` (`str`): + The type of the adapter layer, e.g. `lora.Linear`. + - `enabled` (`bool`): + Whether the adapter layer is enabled. + - `active_adapters` (`list[str]`): + The names of the active adapters, if any, e.g. `["default"]`. + - `merged_adapters` (`list[str]`): + The names of the merged adapters, if any, e.g. `["default"]`. + - requires_grad : dict[str, bool | Literal["irregular"]] + The requires_grad status of the parameters for each adapter module. Ideally, it should be either `True` or + `False`. 
If the requires_grad status is not consistent across all parameters, the value will be set to + `"irregular"`. + - `available_adapters` (`list[str]`): + The names of the available adapters, e.g. `["default"]`. + - `devices` (`dict[str, list[str]]`): + The devices where the parameters of the given adapter are stored, e.g. `["cuda"]`. + + Args: + model ([Union[`~PeftModel`, `~transformers.PreTrainedModel`, `nn.Module`]]): + The model to get the adapter layer status from. + + Returns: + list[`peft.peft_model.TunerLayerStatus`]: + A list of dataclasses, each containing the status of the corresponding adapter layer. + + """ + if isinstance(model, PeftModel): + base_model = model.base_model + if not isinstance(base_model, BaseTuner): + raise TypeError( + "get_layer_status() got an invalid PeftModel instance; prefix tuning and adaption prompt are not " + "supported." + ) + else: + base_model = model + + layer_status: list[TunerLayerStatus] = [] + for name, module in base_model.named_modules(): + if not isinstance(module, (BaseTunerLayer, AuxiliaryTrainingWrapper)): + continue + if isinstance(module, TrainableTokensWrapper): + # Skip TrainableTokensWrapper, since it wraps TrainableTokensLayer, which is the actual PEFT layer we're + # interested in. 
+ continue + + # determine if all submodules/parameters if this module require grad or not + mapping_requires_grad_list: dict[str, list[bool]] = collections.defaultdict(list) + for adapter_module_name in module.adapter_layer_names: + adapter_module = getattr(module, adapter_module_name) + if isinstance(adapter_module, torch.nn.ModuleDict): + for key, submodule in adapter_module.items(): + for param in submodule.parameters(): + mapping_requires_grad_list[key].append(param.requires_grad) + elif isinstance(adapter_module, torch.nn.ParameterDict): + for key, param in adapter_module.items(): + mapping_requires_grad_list[key].append(param.requires_grad) + else: + # strange, we don't know how to handle this, ignore for now + pass + + def check_irrgular(vals: list[bool]) -> bool | Literal["irregular"]: + if all(vals): + return True + if not any(vals): + return False + return "irregular" + + requires_grad = {key: check_irrgular(vals) for key, vals in mapping_requires_grad_list.items()} + + devices_dd = collections.defaultdict(list) + for adapter_module_name in module.adapter_layer_names + module.other_param_names: + adapter_module = getattr(module, adapter_module_name) + if isinstance(adapter_module, torch.nn.ModuleDict): + for key, submodule in adapter_module.items(): + devices_dd[key].extend([param.device.type for param in submodule.parameters()]) + elif isinstance(adapter_module, torch.nn.ParameterDict) or ( + adapter_module.__class__.__name__ == "BufferDict" + ): # VeRA + for key, param in adapter_module.items(): + devices_dd[key].append(param.device.type) + devices = {key: sorted(set(val)) for key, val in devices_dd.items()} + + status = TunerLayerStatus( + name=name, + module_type=repr(module).partition("(")[0], + enabled=not module.disable_adapters, + active_adapters=module.active_adapters, + merged_adapters=module.merged_adapters, + requires_grad=requires_grad, + available_adapters=sorted(module._get_available_adapters()), + devices=devices, + ) + 
layer_status.append(status) + + if not layer_status: + raise ValueError( + "No adapter layers found in the model, please ensure that it's a PEFT model or that you have PEFT adapters " + "injected in the model." + ) + + return layer_status + + +@dataclass +class TunerModelStatus: + base_model_type: str + adapter_model_type: str + peft_types: dict[str, str] + trainable_params: int + total_params: int + num_adapter_layers: int + enabled: bool | Literal["irregular"] + active_adapters: list[str] | Literal["irregular"] + merged_adapters: list[str] | Literal["irregular"] + requires_grad: dict[str, bool | Literal["irregular"]] + available_adapters: list[str] + devices: dict[str, list[str]] + + +def get_model_status(model: torch.nn.Module) -> TunerModelStatus: + """Get the status of tuners of the model. + + This function returns a `TunerModelStatus` dataclass instance, which contains the following attributes: + + - `base_model_type` (`str`): + The type of the base model, e.g. `T5Model`. + - `adapter_model_type` (`str`): + The type of the adapter model, e.g. `LoraModel`. + - `peft_types` (`dict[str, str]`): + The mapping of adapter name to adapter type, e.g. `{"default": "LORA"}`. + - `trainable_params` (`int`): + The number of trainable parameters in the model. + - `total_params` (`int`): + The total number of parameters in the model. + - `num_adapter_layers` (`int`): + The number of adapter layers in the model. + - `enabled` (`bool`, `Literal["irregular"]`): + Whether all adapter layers are enabled. If some are enabled and some are not, this will be `"irregular"`. This + means that your model is in an inconsistent state and might not work as expected. + - `active_adapters` (`list[str]`, `Literal["irregular"]`): + The names of the active adapters. If the active adapters are not consistent across all layers, this will be + `"irregular"`, which means that your model is in an inconsistent state and might not work as expected. 
+ - `merged_adapters` (`list[str]`, `Literal["irregular"]`): + The names of the merged adapters. If the merged adapters are not consistent across all layers, this will be + `"irregular"`, which means that your model is in an inconsistent state and might not work as expected. + - `requires_grad` (`dict[str, bool | Literal["irregular"]]`): + Whether for the given adapter, all adapter layers have `requires_grad` set to `True` or `False`. If there is a + mix, this will be set to `"irregular"`, which means that your model is in an inconsistent state and might not + work as expected. + - `available_adapters` (`list[str]`): + The names of the available adapters, e.g. `["default"]`. + - `devices` (`dict[str, list[str]]`): + The devices where the parameters of the given adapter are stored, e.g. `["cuda"]`. + + Args: + model ([Union[`~PeftModel`, `~transformers.PreTrainedModel`, `nn.Module`]]): + The model to get the adapter layer status from. + + Returns: + `peft.peft_model.TunerModelStatus`: + A dataclass containing the status of the model. + + """ + if isinstance(model, PeftModel): + if not isinstance(model.base_model, BaseTuner): + raise TypeError( + "get_model_status() got an invalid PeftModel instance; prefix tuning and adaption prompt are not " + "supported." 
+ ) + base_model_type = model.get_base_model().__class__.__name__ + trainable_params, total_params = model.get_nb_trainable_parameters() + base_model = model.base_model + peft_types = {key: str(config.peft_type).partition(".")[-1] for key, config in base_model.peft_config.items()} + adapter_model_type = base_model.__class__.__name__ + elif isinstance(model, PreTrainedModel): + base_model_type = model.__class__.__name__ + trainable_params, total_params = PeftModel.get_nb_trainable_parameters(model) + base_model = model + peft_types = {} + adapter_model_type = "None" + else: + base_model_type = "other" + trainable_params, total_params = PeftModel.get_nb_trainable_parameters(model) + base_model = model + peft_types = {} + adapter_model_type = "None" + + layer_status = get_layer_status(model) + num_adapter_layers = len(layer_status) + + enabled_set: set[bool] = {status.enabled for status in layer_status} # must be {True}, {False}, or {True, False} + enabled: bool | Literal["irregular"] + if len(enabled_set) == 1: + enabled = enabled_set.pop() + else: + enabled = "irregular" + + available_adapters: list[str] = sorted(set().union(*(status.available_adapters for status in layer_status))) + + # ideally, active adapters should be consistent across all layers of the model, but we cannot guarantee it + all_active_adapters: set[tuple[str, ...]] = {tuple(status.active_adapters) for status in layer_status} + active_adapters: list[str] | Literal["irregular"] + if not all_active_adapters: + active_adapters = [] + elif len(all_active_adapters) == 1: + active_adapters = list(all_active_adapters.pop()) + else: + active_adapters = "irregular" + + # Here we determine what adapters are merged. This is not trivial because multiple adapters can be merged or not at + # the same time. Some layers may only have adapter A, some only adapter B, so it's not as easy as just checking + # which adapters are merged on each layer. 
def __getattr__(name):
    """Resolve deprecated module-level attributes on access.

    Only `PEFT_TYPE_TO_MODEL_MAPPING` is handled: accessing it emits a `DeprecationWarning` and returns
    `PEFT_TYPE_TO_TUNER_MAPPING`.

    Args:
        name (`str`):
            The attribute name that was not found through the regular module lookup.

    Returns:
        The `PEFT_TYPE_TO_TUNER_MAPPING` object when the deprecated name is requested.

    Raises:
        AttributeError: For every other name, mirroring the error of a normal failed module attribute lookup.
    """
    if name == "PEFT_TYPE_TO_MODEL_MAPPING":
        # This is for backwards compatibility: In #2282, PEFT_TYPE_TO_MODEL_MAPPING was removed as it was redundant with
        # PEFT_TYPE_TO_TUNER_MAPPING. However, third party code could still use this mapping, e.g.:
        # https://github.com/AutoGPTQ/AutoGPTQ/blob/6689349625de973b9ee3016c28c11f32acf7f02c/auto_gptq/utils/peft_utils.py#L8
        # TODO: Remove after 2026-01
        msg = (
            "PEFT_TYPE_TO_MODEL_MAPPING is deprecated, please use `from peft import PEFT_TYPE_TO_TUNER_MAPPING` instead. "
            "The deprecated variable will be removed in 2026."
        )
        # stacklevel=2 attributes the warning to the code that accessed the attribute instead of to this module;
        # otherwise the default warning filters would hide it from the very callers that need to migrate.
        warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
        return PEFT_TYPE_TO_TUNER_MAPPING

    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+ +from .adalora import AdaLoraConfig, AdaLoraModel +from .adaption_prompt import AdaptionPromptConfig, AdaptionPromptModel +from .boft import BOFTConfig, BOFTModel +from .bone import BoneConfig, BoneModel +from .c3a import C3AConfig, C3AModel +from .cpt import CPTConfig, CPTEmbedding +from .fourierft import FourierFTConfig, FourierFTModel +from .hra import HRAConfig, HRAModel +from .ia3 import IA3Config, IA3Model +from .ln_tuning import LNTuningConfig, LNTuningModel +from .loha import LoHaConfig, LoHaModel +from .lokr import LoKrConfig, LoKrModel +from .lora import ( + ArrowConfig, + EvaConfig, + LoftQConfig, + LoraConfig, + LoraModel, + LoraRuntimeConfig, + create_arrow_model, + get_eva_state_dict, + initialize_lora_eva_weights, +) +from .miss import MissConfig, MissModel +from .mixed import MixedModel +from .multitask_prompt_tuning import MultitaskPromptEmbedding, MultitaskPromptTuningConfig, MultitaskPromptTuningInit +from .oft import OFTConfig, OFTModel +from .p_tuning import PromptEncoder, PromptEncoderConfig, PromptEncoderReparameterizationType +from .poly import PolyConfig, PolyModel +from .prefix_tuning import PrefixEncoder, PrefixTuningConfig +from .prompt_tuning import PromptEmbedding, PromptTuningConfig, PromptTuningInit +from .randlora import RandLoraConfig, RandLoraModel +from .road import RoadConfig, RoadModel +from .shira import ShiraConfig, ShiraModel +from .trainable_tokens import TrainableTokensConfig, TrainableTokensModel +from .vblora import VBLoRAConfig, VBLoRAModel +from .vera import VeraConfig, VeraModel +from .waveft import WaveFTConfig, WaveFTModel +from .xlora import XLoraConfig, XLoraModel + + +__all__ = [ + "AdaLoraConfig", + "AdaLoraModel", + "AdaptionPromptConfig", + "AdaptionPromptModel", + "ArrowConfig", + "BOFTConfig", + "BOFTModel", + "BoneConfig", + "BoneModel", + "C3AConfig", + "C3AModel", + "CPTConfig", + "CPTEmbedding", + "EvaConfig", + "FourierFTConfig", + "FourierFTModel", + "HRAConfig", + "HRAModel", + "IA3Config", + 
class BufferDict(Module):
    r"""
    Holds buffers in a dictionary.

    BufferDict can be indexed like a regular Python dictionary, but buffers it contains are properly registered, and
    will be visible by all Module methods. `BufferDict` is an **ordered** dictionary that respects

    * the order of insertion, and
    * in `BufferDict.update`, the order of the merged `OrderedDict`, another `BufferDict`, or an iterable of key-value
      pairs (the argument to `BufferDict.update`).

    Note that `BufferDict.update` with any other mapping type (e.g., Python's plain `dict`) does not preserve the
    order of the merged mapping: its items are inserted sorted by key.

    Args:
        buffers (iterable, optional):
            a mapping (dictionary) of (string : `torch.Tensor`) or an iterable of key-value pairs of type (string,
            `torch.Tensor`)
        persistent (`bool`, optional):
            Forwarded as `persistent=` to `register_buffer` for every buffer stored in this dict. Defaults to `False`.

    ```python
    class MyModule(nn.Module):
        def __init__(self):
            super().__init__()
            self.buffers = BufferDict({"left": torch.randn(5, 10), "right": torch.randn(5, 10)})

        def forward(self, x, choice):
            x = self.buffers[choice].mm(x)
            return x
    ```
    """

    def __init__(self, buffers=None, persistent: bool = False):
        r"""
        Args:
            buffers (`dict`):
                A mapping (dictionary) from string to `torch.Tensor`, or an iterable of key-value pairs of type
                (string, `torch.Tensor`).
            persistent (`bool`):
                Value passed as `persistent=` whenever a buffer is registered through this dict.
        """
        super().__init__()
        self.persistent = persistent

        if buffers is not None:
            self.update(buffers)

    def __getitem__(self, key):
        return self._buffers[key]

    def __setitem__(self, key, buffer):
        # Go through register_buffer so that the tensor is tracked like any other module buffer.
        self.register_buffer(key, buffer, persistent=self.persistent)

    def __delitem__(self, key):
        del self._buffers[key]

    def __len__(self):
        return len(self._buffers)

    def __iter__(self):
        return iter(self._buffers.keys())

    def __contains__(self, key):
        return key in self._buffers

    def clear(self):
        """Remove all items from the BufferDict."""
        self._buffers.clear()

    def pop(self, key):
        r"""Remove key from the BufferDict and return its buffer.

        Args:
            key (`str`):
                Key to pop from the BufferDict
        """
        v = self[key]
        del self[key]
        return v

    def keys(self):
        r"""Return an iterable of the BufferDict keys."""
        return self._buffers.keys()

    def items(self):
        r"""Return an iterable of the BufferDict key/value pairs."""
        return self._buffers.items()

    def values(self):
        r"""Return an iterable of the BufferDict values."""
        return self._buffers.values()

    def update(self, buffers):
        r"""
        Update the `BufferDict` with the key-value pairs from a mapping or an iterable, overwriting existing keys.

        Note:
            If `buffers` is an `OrderedDict`, a `BufferDict`, or an iterable of key-value pairs, the order of new
            elements in it is preserved. Items of any other mapping are inserted sorted by key.

        Args:
            buffers (iterable):
                a mapping (dictionary) from string to `torch.Tensor`, or an iterable of key-value pairs of type
                (string, `torch.Tensor`).

        Raises:
            TypeError: If `buffers` is not iterable, or if an element of a non-mapping iterable is not iterable.
            ValueError: If an element of a non-mapping iterable does not have exactly two entries.
        """
        if not isinstance(buffers, collections.abc.Iterable):
            raise TypeError(
                "BufferDict.update should be called with an iterable of key/value pairs, but got "
                + type(buffers).__name__
            )

        if isinstance(buffers, (OrderedDict, BufferDict)):
            for key, buffer in buffers.items():
                self[key] = buffer
        elif isinstance(buffers, collections.abc.Mapping):
            # Other mappings are inserted in sorted key order rather than in their own iteration order.
            for key, buffer in sorted(buffers.items()):
                self[key] = buffer
        else:
            for j, p in enumerate(buffers):
                if not isinstance(p, collections.abc.Iterable):
                    raise TypeError(
                        f"BufferDict update sequence element #{j} should be Iterable; is {type(p).__name__}"
                    )
                if len(p) != 2:
                    raise ValueError(f"BufferDict update sequence element #{j} has length {len(p)}; 2 is required")
                self[p[0]] = p[1]

    def extra_repr(self):
        """Return one line per buffer giving its key, tensor type name, size and (for non-CPU tensors) device."""
        child_lines = []
        for k, p in self._buffers.items():
            size_str = "x".join(str(size) for size in p.size())
            device_type = p.device.type
            device_str = "" if device_type == "cpu" else f" ({device_type.upper()} {p.get_device()})"
            parastr = f"Buffer containing: [{torch.typename(p)} of size {size_str}{device_str}]"
            child_lines.append(" (" + k + "): " + parastr)
        tmpstr = "\n".join(child_lines)
        return tmpstr

    def __call__(self, input):
        # A BufferDict is a pure container; there is no forward computation to run.
        raise RuntimeError("BufferDict should not be called.")
def __getattr__(name):
    """Lazily resolve the optional quantized AdaLoRA layer classes.

    `SVDLinear8bitLt` and `SVDLinear4bit` are imported from `.bnb` only when they are requested and the matching
    availability check passes; any other lookup fails with `AttributeError`.
    """
    if name == "SVDLinear8bitLt":
        # The availability check is only evaluated for the matching name.
        if is_bnb_available():
            from .bnb import SVDLinear8bitLt as layer_cls

            return layer_cls
    elif name == "SVDLinear4bit":
        if is_bnb_4bit_available():
            from .bnb import SVDLinear4bit as layer_cls

            return layer_cls

    raise AttributeError(f"module {__name__} has no attribute {name}")
if is_bnb_available():

    class SVDLinear8bitLt(torch.nn.Module, AdaLoraLayer):
        """AdaLoRA adapter layer (low-rank, SVD-style update) wrapped around a base layer; only defined when
        `is_bnb_available()` is true."""

        def __init__(
            self,
            base_layer: torch.nn.Module,
            adapter_name: str,
            r: int = 0,
            lora_alpha: int = 1,
            lora_dropout: float = 0.0,
            init_lora_weights: bool = True,
            **kwargs,
        ) -> None:
            super().__init__()
            AdaLoraLayer.__init__(self, base_layer)

            # The pre-trained weight stays frozen; only the adapter parameters are meant to train.
            frozen_weight = self.get_base_layer().weight
            frozen_weight.requires_grad = False

            self._active_adapter = adapter_name
            self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            """Return the base layer output plus the scaled update of every active adapter."""
            # note: no check for self.merged because merging is not supported (yet)
            result = self.base_layer(x)
            if self.disable_adapters:
                return result

            for adapter in self.active_adapters:
                if adapter in self.lora_A.keys():
                    # Outside of autocast, compute in float32 and cast the update back to the output dtype.
                    cast_needed = not torch.is_autocast_enabled()
                    if cast_needed:
                        target_dtype = result.dtype
                        if x.dtype != torch.float32:
                            x = x.float()

                    right_vecs = self.lora_A[adapter]
                    left_vecs = self.lora_B[adapter]
                    singular_vals = self.lora_E[adapter]
                    drop = self.lora_dropout[adapter]
                    scale = self.scaling[adapter]
                    rank = self.ranknum[adapter] + 1e-5

                    delta = drop(x) @ (right_vecs * singular_vals).T @ left_vecs.T
                    if cast_needed:
                        delta = delta.to(target_dtype)
                    delta = delta * scale / rank
                    # inplace operation on view is forbidden for MatMul8bitLtBackward, so avoid it
                    result = result + delta
            return result

        def __repr__(self) -> str:
            return "adalora." + super().__repr__()
@dataclass
class AdaLoraConfig(LoraConfig):
    """
    This is the configuration class to store the configuration of a [`~peft.AdaLoraModel`].

    AdaLoRA has three phases defined by `tinit`, `tfinal` and `total_step`.

    The initial phase can be understood as a step for pre-training the adapters so that when reducing their rank, there
    is already some information encoded that can be reduced instead of random matrices. This phase is defined by
    supplying `tinit`.

    After the initial phase is over (`tinit` steps have passed) and the final phase has not begun, AdaLoRA reduces the
    budget of how much rank each layer is allowed to have with each step. This is where the reduction of rank is
    happening. This goes on until `total_step - tfinal` steps are reached.

    The last phase, beginning once `total_step - tfinal` steps are reached, does not change the layer ranks anymore but
    fine-tunes the reduced-rank layers that resulted from the previous phase.

    A practical example: `tinit` is 10, `tfinal` is 20, `total_step` is 100. We spend 10 steps doing pre-training
    without rank reduction because our budget is constant (init phase), then we spend 80 (100-20) steps in the
    reduction phase where our budget decreases step-wise and, finally, 20 steps in the final fine-tuning stage without
    reduction.

    Args:
        target_r (`int`): The target average rank of incremental matrix.
        init_r (`int`): The initial rank for each incremental matrix.
        tinit (`int`): The steps of initial fine-tuning warmup.
        tfinal (`int`): The number of steps of final fine-tuning.
        deltaT (`int`): The time interval between two budget allocations.
        beta1 (`float`): The hyperparameter of EMA for sensitivity smoothing.
        beta2 (`float`): The hyperparameter of EMA for uncertainty quantification.
        orth_reg_weight (`float`): The coefficient of orthogonal regularization.
        total_step (`int`): The total training steps that should be specified before training.
        rank_pattern (`dict`): The allocated rank for each weight matrix by RankAllocator.
    """

    target_r: int = field(default=8, metadata={"help": "Target Lora matrix dimension."})
    init_r: int = field(default=12, metadata={"help": "Initial Lora matrix dimension."})
    tinit: int = field(default=0, metadata={"help": "The steps of initial warmup."})
    tfinal: int = field(default=0, metadata={"help": "The steps of final warmup."})
    deltaT: int = field(default=1, metadata={"help": "Step interval of rank allocation."})
    beta1: float = field(default=0.85, metadata={"help": "Hyperparameter of EMA."})
    beta2: float = field(default=0.85, metadata={"help": "Hyperparameter of EMA."})
    orth_reg_weight: float = field(default=0.5, metadata={"help": "The orthogonal regularization coefficient."})
    total_step: Optional[int] = field(default=None, metadata={"help": "The total training steps."})
    rank_pattern: Optional[dict] = field(default=None, metadata={"help": "The saved rank pattern."})

    def __post_init__(self):
        """Validate the AdaLoRA-specific settings after the parent config has been initialized.

        Raises:
            ValueError: If DoRA or LoftQ is requested, if `layers_to_transform` / `layers_pattern` are combined
                inconsistently, if `total_step` is missing or not positive, or if `tinit`, `tfinal` and `total_step`
                leave no steps for the budgeting phase.
        """
        super().__post_init__()
        self.peft_type = PeftType.ADALORA

        if self.use_dora:
            raise ValueError(f"{self.peft_type} does not support DoRA.")

        if self.loftq_config:
            raise ValueError(f"{self.peft_type} does not support LOFTQ.")

        # Lists are normalized to sets; any other value (e.g. a str) is kept as is.
        self.target_modules = (
            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
        )
        self.exclude_modules = (
            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
        )
        # if target_modules is a regex expression, then layers_to_transform should be None
        if isinstance(self.target_modules, str) and self.layers_to_transform is not None:
            raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.")

        # check for layers_to_transform and layers_pattern
        if self.layers_pattern and not self.layers_to_transform:
            raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ")

        # Check if 'r' has been set to a non-default value
        if self.r != 8:  # 8 is the default value for 'r' in LoraConfig
            # Note the trailing space: the two literals are concatenated into a single message.
            warnings.warn(
                "Note that `r` is not used in AdaLora and will be ignored. "
                "If you intended to set the initial rank, use `init_r` instead."
            )

        if self.total_step is None or self.total_step <= 0:
            raise ValueError("AdaLoRA does not work when `total_step` is None, supply a value > 0.")

        # The budgeting (rank reduction) phase spans the steps between `tinit` and `total_step - tfinal`.
        if self.tinit >= (self.total_step - self.tfinal):
            raise ValueError(
                "The supplied schedule values don't allow for a budgeting phase. Decrease `tfinal`/`tinit` or "
                "increase `total_step`."
            )
class SVDQuantLinear(torch.nn.Module, AdaLoraLayer):
    """AdaLoRA adapter layer that adds its low-rank update on top of the output of a wrapped quantized linear
    module."""

    def __init__(
        self,
        base_layer,
        adapter_name,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        init_lora_weights: bool = True,
        **kwargs,
    ) -> None:
        super().__init__()
        AdaLoraLayer.__init__(self, base_layer)

        # self.base_layer and self.quant_linear_module are the same; we need the former for consistency and the latter
        # for backwards compatibility
        self.quant_linear_module = base_layer
        self._active_adapter = adapter_name
        self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the wrapped module and add the scaled update of every active adapter to its output in place."""
        result = self.quant_linear_module(x)
        if self.disable_adapters:
            return result

        for adapter in self.active_adapters:
            if adapter in self.lora_A.keys():
                right_vecs = self.lora_A[adapter]
                left_vecs = self.lora_B[adapter]
                singular_vals = self.lora_E[adapter]
                drop = self.lora_dropout[adapter]
                scale = self.scaling[adapter]
                rank = self.ranknum[adapter] + 1e-5

                # Outside of autocast the adapter branch is computed in float32.
                cast_needed = not torch.is_autocast_enabled()
                if cast_needed:
                    target_dtype = result.dtype
                    x = self._cast_input_dtype(x, torch.float32)

                projected = drop(x) @ (right_vecs * singular_vals).T @ left_vecs.T
                delta = projected * scale / rank
                # TODO: here, the dtype conversion is applied on the *whole expression*,
                # not the intermediate result, unlike for SVDLinear8bitLT and
                # SVDLinear4bit, is that correct?
                if cast_needed:
                    delta = delta.to(target_dtype)
                result += delta
        return result

    def __repr__(self) -> str:
        return "adalora." + super().__repr__()
class SVDLinear(nn.Module, AdaLoraLayer):
    """AdaLoRA adapter for a dense layer: the weight update is parametrised as B @ (A * E), scaled by
    `scaling / (ranknum + 1e-5)`."""

    def __init__(
        self,
        base_layer: nn.Module,
        adapter_name: str,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        fan_in_fan_out: bool = False,
        init_lora_weights: bool = True,
        **kwargs,
    ) -> None:
        super().__init__()
        AdaLoraLayer.__init__(self, base_layer)

        # The pre-trained weight matrix is frozen; only adapter parameters are meant to train.
        frozen_weight = self.get_base_layer().weight
        frozen_weight.requires_grad = False

        self.fan_in_fan_out = fan_in_fan_out
        self._active_adapter = adapter_name
        self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)

    def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
        """
        Fold adapter weights into the weights of the base layer.

        Args:
            safe_merge (`bool`, *optional*):
                When True, the merged weights are first computed on a copy of the base weights and checked for
                non-finite values; the base weights are only replaced if that check passes. This is slower because of
                the copy. Defaults to `False`.
            adapter_names (`List[str]`, *optional*):
                Names of the adapters to merge. If None, all active adapters are merged. Defaults to `None`.

        Raises:
            ValueError: If `safe_merge` is True and the merged weights contain non-finite values.
        """
        adapter_names = check_adapters_to_merge(self, adapter_names)
        if not adapter_names:
            # no adapter to merge
            return

        for adapter in adapter_names:
            target = self.get_base_layer()
            if adapter not in self.lora_A.keys():
                continue

            delta = self.get_delta_weight(adapter)
            if safe_merge:
                # Work on a copy so that the base weights stay untouched if the check below fails.
                candidate = target.weight.data.clone()
                candidate += delta

                if not torch.isfinite(candidate).all():
                    raise ValueError(
                        f"NaNs detected in the merged weights. The adapter {adapter} seems to be broken"
                    )

                target.weight.data = candidate
            else:
                target.weight.data += delta
            self.merged_adapters.append(adapter)

    def unmerge(self) -> None:
        """
        Subtract every merged adapter from the base weights again, most recently merged first.
        """
        if not self.merged:
            warnings.warn("Already unmerged. Nothing to do.")
            return

        while self.merged_adapters:
            adapter = self.merged_adapters.pop()
            if adapter not in self.lora_A.keys():
                continue
            self.get_base_layer().weight.data -= self.get_delta_weight(adapter)

    def get_delta_weight(self, adapter) -> torch.Tensor:
        """Return the weight update contributed by `adapter`."""
        low_rank = self.lora_B[adapter] @ (self.lora_A[adapter] * self.lora_E[adapter])
        oriented = transpose(low_rank, self.fan_in_fan_out)
        return oriented * self.scaling[adapter] / (self.ranknum[adapter] + 1e-5)

    def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
        """Run the base layer and, unless adapters are disabled or merged, add the active adapters' outputs."""
        if self.disable_adapters and self.merged:
            # Disabled adapters must not contribute through previously merged weights.
            self.unmerge()

        result = self.base_layer(x, *args, **kwargs)
        if self.disable_adapters or self.merged:
            return result

        for adapter in self.active_adapters:
            if adapter not in self.lora_A.keys():
                continue
            right_vecs = self.lora_A[adapter]
            left_vecs = self.lora_B[adapter]
            singular_vals = self.lora_E[adapter]
            drop = self.lora_dropout[adapter]
            scale = self.scaling[adapter]
            rank = self.ranknum[adapter] + 1e-5

            x = self._cast_input_dtype(x, right_vecs.dtype)
            result += (drop(x) @ (right_vecs * singular_vals).T @ left_vecs.T) * scale / rank

        return result

    def __repr__(self) -> str:
        return "adalora." + super().__repr__()
+ + """ + + def __init__(self, model, peft_config, adapter_name): + self.peft_config = peft_config + self.adapter_name = adapter_name + self.beta1 = peft_config.beta1 + self.beta2 = peft_config.beta2 + assert self.beta1 > 0 and self.beta1 < 1 + assert self.beta2 > 0 and self.beta2 < 1 + + self.reset_ipt() + self._set_budget_scheduler(model) + + def set_total_step(self, total_step): + self.peft_config.total_step = total_step + + def reset_ipt(self): + self.ipt = {} + self.exp_avg_ipt = {} + self.exp_avg_unc = {} + + def _set_budget_scheduler(self, model): + self.init_bgt = 0 + self.name_set = set() + for n, p in model.named_parameters(): + if f"lora_A.{self.adapter_name}" in n: + self.init_bgt += p.size(0) + self.name_set.add(n.replace("lora_A", "%s")) + self.name_set = sorted(self.name_set) + # The total final rank budget + self.target_bgt = self.peft_config.target_r * len(self.name_set) + + def budget_schedule(self, step: int): + tinit = self.peft_config.tinit + tfinal = self.peft_config.tfinal + total_step = self.peft_config.total_step + # Initial warmup + if step <= tinit: + budget = self.init_bgt + mask_ind = False + # Final fine-tuning + elif step > total_step - tfinal: + budget = self.target_bgt + mask_ind = True + else: + # Budget decreasing with a cubic scheduler + mul_coeff = 1 - (step - tinit) / (total_step - tfinal - tinit) + budget = int((self.init_bgt - self.target_bgt) * (mul_coeff**3) + self.target_bgt) + mask_ind = True if step % self.peft_config.deltaT == 0 else False + return budget, mask_ind + + def update_ipt(self, model): + # Update the sensitivity and uncertainty for every weight + for n, p in model.named_parameters(): + if "lora_" in n and self.adapter_name in n: + if n not in self.ipt: + self.ipt[n] = torch.zeros_like(p) + self.exp_avg_ipt[n] = torch.zeros_like(p) + self.exp_avg_unc[n] = torch.zeros_like(p) + with torch.no_grad(): + if deepspeed_config() is not None: + import deepspeed + + grad = deepspeed.utils.safe_get_full_grad(p) + 
self.ipt[n] = (p * grad).abs().detach() + else: + self.ipt[n] = (p * p.grad).abs().detach() + # Sensitivity smoothing + self.exp_avg_ipt[n] = self.beta1 * self.exp_avg_ipt[n] + (1 - self.beta1) * self.ipt[n] + # Uncertainty quantification + self.exp_avg_unc[n] = ( + self.beta2 * self.exp_avg_unc[n] + (1 - self.beta2) * (self.ipt[n] - self.exp_avg_ipt[n]).abs() + ) + + def _element_score(self, n): + return self.exp_avg_ipt[n] * self.exp_avg_unc[n] + + def _combine_ipt(self, ipt_E, ipt_AB): + ipt_AB = ipt_AB.sum(dim=1, keepdim=False) + sum_ipt = ipt_E.view(-1) + ipt_AB.view(-1) + return sum_ipt + + def mask_to_budget(self, model, budget): + value_ipt = {} + vector_ipt = {} + triplet_ipt = {} + # Get the importance score for A, E, B + for n, p in model.named_parameters(): + if f"lora_A.{self.adapter_name}" in n: + entry_ipt = self._element_score(n) + comb_ipt = torch.mean(entry_ipt, dim=1, keepdim=True) + name_m = n.replace("lora_A", "%s") + if name_m not in vector_ipt: + vector_ipt[name_m] = [comb_ipt] + else: + vector_ipt[name_m].append(comb_ipt) + if f"lora_B.{self.adapter_name}" in n: + entry_ipt = self._element_score(n) + comb_ipt = torch.mean(entry_ipt, dim=0, keepdim=False).view(-1, 1) + name_m = n.replace("lora_B", "%s") + if name_m not in vector_ipt: + vector_ipt[name_m] = [comb_ipt] + else: + vector_ipt[name_m].append(comb_ipt) + if f"lora_E.{self.adapter_name}" in n: + entry_ipt = self._element_score(n) + name_m = n.replace("lora_E", "%s") + value_ipt[name_m] = entry_ipt + + all_score = [] + # Calculate the score for each triplet + for name_m in vector_ipt: + ipt_E = value_ipt[name_m] + ipt_AB = torch.cat(vector_ipt[name_m], dim=1) + sum_ipt = self._combine_ipt(ipt_E, ipt_AB) + name_E = name_m % "lora_E" + triplet_ipt[name_E] = sum_ipt.view(-1, 1) + all_score.append(sum_ipt.view(-1)) + + # Get the threshold by ranking ipt + mask_threshold = torch.kthvalue( + torch.cat(all_score), + k=self.init_bgt - budget, + )[0].item() + + rank_pattern = {} + # Mask the 
unimportant triplets + with torch.no_grad(): + for n, p in model.named_parameters(): + if f"lora_E.{self.adapter_name}" in n: + p.masked_fill_(triplet_ipt[n] <= mask_threshold, 0.0) + rank_pattern[n] = (~(triplet_ipt[n] <= mask_threshold)).view(-1).tolist() + return rank_pattern + + def update_and_allocate(self, model, global_step, force_mask=False): + # # Update the importance score and allocate the budget + if global_step < self.peft_config.total_step - self.peft_config.tfinal: + self.update_ipt(model) + budget, mask_ind = self.budget_schedule(global_step) + # Allocate the budget according to importance scores + if mask_ind or force_mask: + rank_pattern = self.mask_to_budget(model, budget) + else: + rank_pattern = None + return budget, rank_pattern + + def mask_using_rank_pattern(self, model, rank_pattern): + # Mask the unimportant triplets + is_adapter_name_truncated = False + if self.adapter_name not in next(iter(rank_pattern.keys())): + is_adapter_name_truncated = True + + with torch.no_grad(): + for n, p in model.named_parameters(): + if f"lora_E.{self.adapter_name}" in n: + key = n if not is_adapter_name_truncated else n.replace(f".{self.adapter_name}", "") + mask = torch.Tensor(rank_pattern[key]).unsqueeze(-1).to(p.device) + p.masked_fill_(~mask.bool(), 0.0) diff --git a/peft/src/peft/tuners/adalora/model.py b/peft/src/peft/tuners/adalora/model.py new file mode 100644 index 0000000000000000000000000000000000000000..c5c345c0ef8f5f99b1bff8a65619ad4f16c9c449 --- /dev/null +++ b/peft/src/peft/tuners/adalora/model.py @@ -0,0 +1,346 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import torch +from transformers.pytorch_utils import Conv1D + +from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_gptqmodel_available +from peft.tuners.lora import LoraConfig, LoraModel +from peft.tuners.tuners_utils import BaseTunerLayer +from peft.utils import ( + TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING, + _freeze_adapter, + _get_submodules, + get_auto_gptq_quant_linear, + get_gptqmodel_quant_linear, + get_quantization_config, +) +from peft.utils.integrations import gather_params_ctx + +from .gptq import SVDQuantLinear +from .layer import AdaLoraLayer, RankAllocator, SVDLinear + + +class AdaLoraModel(LoraModel): + """ + Creates AdaLoRA (Adaptive LoRA) model from a pretrained transformers model. Paper: + https://openreview.net/forum?id=lq62uWRJjiY + + Args: + model ([`transformers.PreTrainedModel`]): The model to be adapted. + config ([`AdaLoraConfig`]): The configuration of the AdaLora model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the loading process. + + Returns: + `torch.nn.Module`: The AdaLora model. 
+ + Example:: + + >>> from transformers import AutoModelForSeq2SeqLM >>> from peft import LoraConfig, AdaLoraModel, AdaLoraConfig + >>> config = AdaLoraConfig( + peft_type="ADALORA", task_type="SEQ_2_SEQ_LM", init_r=12, lora_alpha=32, target_modules=["q", "v"], + lora_dropout=0.01, + ) + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") >>> model = AdaLoraModel(model, config, "default") + + **Attributes**: + - **model** ([`transformers.PreTrainedModel`]) -- The model to be adapted. + - **peft_config** ([`AdaLoraConfig`]): The configuration of the AdaLora model. + """ + + # Note: don't redefine prefix or tuner_layer_cls here, it should be inherited from LoraModel + target_module_mapping = TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING + + def __init__(self, model, config, adapter_name, **kwargs): + super().__init__(model, config, adapter_name, **kwargs) + + traininable_mode_counter = 0 + for config in self.peft_config.values(): + if not config.inference_mode: + traininable_mode_counter += 1 + + if traininable_mode_counter > 1: + raise ValueError( + "AdaLoraModel supports only 1 trainable adapter. " + "When using multiple adapters, set inference_mode to True for all adapters except the one you want to train." + ) + + if self.peft_config[adapter_name].inference_mode: + _freeze_adapter(self.model, adapter_name) + else: + self.trainable_adapter_name = adapter_name + self.rankallocator = RankAllocator(self.model, self.peft_config[adapter_name], self.trainable_adapter_name) + + def _check_new_adapter_config(self, config: LoraConfig) -> None: + """ + A helper method to check the config when a new adapter is being added. + + Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. 
+ + """ + super()._check_new_adapter_config(config) + + traininable_mode_counter = 0 + for config_ in self.peft_config.values(): + if not config_.inference_mode: + traininable_mode_counter += 1 + + if traininable_mode_counter > 1: + raise ValueError( + f"{self.__class__.__name__} supports only 1 trainable adapter. " + "When using multiple adapters, set inference_mode to True for all adapters except the one " + "you want to train." + ) + + def _create_and_replace( + self, + lora_config, + adapter_name, + target, + target_name, + parent, + current_key, + ): + kwargs = { + "r": lora_config.init_r, + "lora_alpha": lora_config.lora_alpha, + "lora_dropout": lora_config.lora_dropout, + "fan_in_fan_out": lora_config.fan_in_fan_out, + "init_lora_weights": lora_config.init_lora_weights, + "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False), + "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False), + } + if (kwargs["loaded_in_8bit"] or kwargs["loaded_in_4bit"]) and not is_bnb_available(): + raise ImportError( + "To use AdaLora with 8-bit quantization, please install the `bitsandbytes` package. " + "You can install it with `pip install bitsandbytes`." 
+ ) + + quantization_config = get_quantization_config(self.model, method="gptq") + if quantization_config is not None: + kwargs["gptq_quantization_config"] = quantization_config + + # If it is not an AdaLoraLayer, create a new module, else update it with new adapters + if not isinstance(target, AdaLoraLayer): + device_map = self.model.hf_device_map if hasattr(self.model, "hf_device_map") else None + new_module = self._create_new_module(lora_config, adapter_name, target, device_map=device_map, **kwargs) + if adapter_name not in self.active_adapters: + # adding an additional adapter: it is not automatically trainable + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + else: + target.update_layer( + adapter_name, + lora_config.init_r, + lora_config.lora_alpha, + lora_config.lora_dropout, + lora_config.init_lora_weights, + ) + + @staticmethod + def _create_new_module(lora_config, adapter_name, target, device_map=None, **kwargs): + # avoid eager bnb import + if is_bnb_available(): + import bitsandbytes as bnb + + from .bnb import SVDLinear8bitLt + if is_bnb_4bit_available(): + from .bnb import SVDLinear4bit + + gptq_quantization_config = kwargs.get("gptq_quantization_config", None) + + if is_gptqmodel_available(): + QuantLinear = get_gptqmodel_quant_linear(gptq_quantization_config, device_map=device_map) + else: + QuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config) + + loaded_in_8bit = kwargs.pop("loaded_in_8bit", False) + loaded_in_4bit = kwargs.pop("loaded_in_4bit", False) + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): + kwargs.update( + { + "has_fp16_weights": target_base_layer.state.has_fp16_weights, + "threshold": target_base_layer.state.threshold, + "index": target_base_layer.index, + } + ) + new_module = SVDLinear8bitLt(target, adapter_name, 
**kwargs) + elif loaded_in_4bit and is_bnb_4bit_available() and isinstance(target_base_layer, bnb.nn.Linear4bit): + fourbit_kwargs = kwargs.copy() + fourbit_kwargs.update( + { + "compute_dtype": target_base_layer.compute_dtype, + "compress_statistics": target_base_layer.weight.compress_statistics, + "quant_type": target_base_layer.weight.quant_type, + } + ) + new_module = SVDLinear4bit(target, adapter_name, **fourbit_kwargs) + elif QuantLinear is not None and isinstance(target, QuantLinear): + new_module = SVDQuantLinear(target, adapter_name, **kwargs) + else: + if isinstance(target_base_layer, torch.nn.Linear): + if kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." + ) + kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False + elif isinstance(target_base_layer, Conv1D): + if not kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. " + "Setting fan_in_fan_out to True." + ) + kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True + else: + raise ValueError( + f"Target module {target} is not supported. " + f"Currently, only `torch.nn.Linear` and `Conv1D` are supported." + ) + new_module = SVDLinear(target, adapter_name, **kwargs) + + return new_module + + def forward(self, *args, **kwargs): + outputs = self.model.forward(*args, **kwargs) + + if (getattr(outputs, "loss", None) is not None) and isinstance(outputs.loss, torch.Tensor): + # Calculate the orthogonal regularization + orth_reg_weight = self.peft_config[self.trainable_adapter_name].orth_reg_weight + + if orth_reg_weight <= 0: + raise ValueError("orth_reg_weight should be greater than 0. 
") + + regu_loss = 0 + num_param = 0 + for n, p in self.model.named_parameters(): + if ("lora_A" in n or "lora_B" in n) and self.trainable_adapter_name in n: + if p.shape == torch.Size([0]): + with gather_params_ctx(p, fwd_module=self): + para_cov = p @ p.T if "lora_A" in n else p.T @ p + else: + para_cov = p @ p.T if "lora_A" in n else p.T @ p + I = torch.eye(*para_cov.size(), out=torch.empty_like(para_cov)) # noqa: E741 + I.requires_grad = False + num_param += 1 + regu_loss += torch.norm(para_cov - I, p="fro") + if num_param > 0: + regu_loss = regu_loss / num_param + else: + regu_loss = 0 + outputs.loss += orth_reg_weight * regu_loss + return outputs + + def resize_modules_by_rank_pattern(self, rank_pattern, adapter_name): + lora_config = self.peft_config[adapter_name] + for name, rank_idx in rank_pattern.items(): + if isinstance(rank_idx, list): + rank = sum(rank_idx) + elif isinstance(rank_idx, torch.Tensor): + rank_idx = rank_idx.view(-1) + rank = rank_idx.sum().item() + else: + raise ValueError("Unexpected type of rank_idx") + key = ".".join(name.split(".")[0:-2]) if adapter_name in name else ".".join(name.split(".")[0:-1]) + _, target, _ = _get_submodules(self.model, key) + lora_E_weights = target.lora_E[adapter_name][rank_idx] + lora_A_weights = target.lora_A[adapter_name][rank_idx] + lora_B_weights = target.lora_B[adapter_name][:, rank_idx] + ranknum = target.ranknum[adapter_name] + target.update_layer( + adapter_name, + rank, + lora_config.lora_alpha, + lora_config.lora_dropout, + lora_config.init_lora_weights, + ) + with torch.no_grad(): + if rank > 0: + target.lora_E[adapter_name].copy_(lora_E_weights) + target.lora_A[adapter_name].copy_(lora_A_weights) + target.lora_B[adapter_name].copy_(lora_B_weights) + # The scaling is exactly as the previous + target.ranknum[adapter_name].copy_(ranknum) + + def resize_state_dict_by_rank_pattern(self, rank_pattern, state_dict, adapter_name): + for name, rank_idx in rank_pattern.items(): + rank = sum(rank_idx) + 
prefix = ".".join(name.split(".")[0:-2]) if adapter_name in name else ".".join(name.split(".")[0:-1]) + for layer in ["lora_E", "lora_A", "lora_B"]: + key = f"base_model.model.{prefix}.{layer}.{adapter_name}" + if layer != "lora_B": + state_dict[key] = ( + state_dict[key][rank_idx] if rank != state_dict[key].shape[0] else state_dict[key] + ) + else: + state_dict[key] = ( + state_dict[key][:, rank_idx] if rank != state_dict[key].shape[1] else state_dict[key] + ) + return state_dict + + def update_and_allocate(self, global_step): + """ + This method updates Adalora budget and mask. + + This should be called in every training step after `loss.backward()` and before `zero_grad()`. + + `tinit`, `tfinal` and `deltaT` are handled with in the method. + + Args: + global_step (`int`): The current training step, it is used to calculate adalora budget. + + Example: + + ```python + >>> loss = model(**input).loss + >>> loss.backward() + >>> optimizer.step() + >>> model.base_model.update_and_allocate(i_step) + >>> optimizer.zero_grad() + ``` + """ + lora_config = self.peft_config[self.trainable_adapter_name] + # Update the importance score and allocate the budget + if global_step < lora_config.total_step - lora_config.tfinal: + _, rank_pattern = self.rankallocator.update_and_allocate(self.model, global_step) + if rank_pattern: + lora_config.rank_pattern = rank_pattern + # Finalize the budget allocation + elif global_step == lora_config.total_step - lora_config.tfinal: + _, rank_pattern = self.rankallocator.update_and_allocate(self.model, global_step, force_mask=True) + # for some reason, this freezes the trainable parameters and nothing gets updates + # self.resize_modules_by_rank_pattern(rank_pattern, self.trainable_adapter_name) + lora_config.rank_pattern = rank_pattern + self.rankallocator.reset_ipt() + # Currently using inefficient way to mask the unimportant weights using the rank pattern + # due to problem mentioned above + elif global_step > lora_config.total_step - 
lora_config.tfinal: + self.rankallocator.mask_using_rank_pattern(self.model, lora_config.rank_pattern) + # Pass the function and do forward propagation + else: + return None + + def add_weighted_adapter(self, *args, **kwargs): + """This method is not supported for AdaLoRA, use LoRA instead.""" + raise TypeError(f"{self.__class__.__name__} does not support add_weighted_adapter method.") diff --git a/peft/src/peft/tuners/adaption_prompt/__init__.py b/peft/src/peft/tuners/adaption_prompt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..68882a222625eedc770d28816a3a0646964739bb --- /dev/null +++ b/peft/src/peft/tuners/adaption_prompt/__init__.py @@ -0,0 +1,23 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from peft.utils import register_peft_method + +from .config import AdaptionPromptConfig +from .layer import AdaptedAttention +from .model import AdaptionPromptModel + + +__all__ = ["AdaptedAttention", "AdaptionPromptConfig", "AdaptionPromptModel"] + +register_peft_method(name="adaption_prompt", config_cls=AdaptionPromptConfig, model_cls=AdaptionPromptModel) diff --git a/peft/src/peft/tuners/adaption_prompt/config.py b/peft/src/peft/tuners/adaption_prompt/config.py new file mode 100644 index 0000000000000000000000000000000000000000..4a9f780383d425fb551e2635017de9eaac3c2b81 --- /dev/null +++ b/peft/src/peft/tuners/adaption_prompt/config.py @@ -0,0 +1,88 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from collections import namedtuple +from dataclasses import dataclass, field + +from peft.config import PeftConfig +from peft.utils import PeftType + +from .utils import gpt2_compute_query_states, llama_compute_query_states + + +@dataclass +class AdaptionPromptConfig(PeftConfig): + """Stores the configuration of an [`AdaptionPromptModel`].""" + + target_modules: str = field( + default=None, metadata={"help": "Name of the attention submodules to insert adaption prompts into."} + ) + adapter_len: int = field(default=None, metadata={"help": "Number of adapter tokens to insert"}) + adapter_layers: int = field(default=None, metadata={"help": "Number of adapter layers (from the top)"}) + + def __post_init__(self): + super().__post_init__() + self.peft_type = PeftType.ADAPTION_PROMPT + + @property + def is_adaption_prompt(self) -> bool: + """Return True if this is an adaption prompt config.""" + return True + + +# Contains the config that is specific to a transformers model type. +ModelTypeConfig = namedtuple( + "ModelTypeConfig", ["compute_query_states", "target_modules", "k_proj_layer", "v_proj_layer", "o_proj_layer"] +) + +# Mapping of transformers model types to their specific configuration. 
+TRANSFORMERS_MODEL_CONFIG = { + "llama": ModelTypeConfig( + compute_query_states=llama_compute_query_states, + target_modules="self_attn", + k_proj_layer="k_proj", + v_proj_layer="v_proj", + o_proj_layer="o_proj", + ), + "mistral": ModelTypeConfig( # same as llama, + compute_query_states=llama_compute_query_states, + target_modules="self_attn", + k_proj_layer="k_proj", + v_proj_layer="v_proj", + o_proj_layer="o_proj", + ), + "gpt2": ModelTypeConfig( # piggybacking of off the prior definitions, GPTs attention calculation is different + compute_query_states=gpt2_compute_query_states, + target_modules="attn", + k_proj_layer="c_attn", + v_proj_layer=None, + o_proj_layer=None, + ), +} + + +def prepare_config( + peft_config: AdaptionPromptConfig, + model, +) -> AdaptionPromptConfig: + """Prepare the config based on the llama model type.""" + if model.config.model_type not in TRANSFORMERS_MODEL_CONFIG: + raise ValueError(f"Unsupported model type for adaption prompt: '{model.config.model_type}'.") + + model_config = TRANSFORMERS_MODEL_CONFIG[model.config.model_type] + + if peft_config.target_modules is None: + peft_config.target_modules = model_config.target_modules + + return peft_config diff --git a/peft/src/peft/tuners/adaption_prompt/layer.py b/peft/src/peft/tuners/adaption_prompt/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..4dd877841f176b1688d5c0fc224c79f3b089ceff --- /dev/null +++ b/peft/src/peft/tuners/adaption_prompt/layer.py @@ -0,0 +1,236 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .config import TRANSFORMERS_MODEL_CONFIG + + +class _BaseAdaptedAttention(nn.Module): + """Base module, which defines adaption prompts for multiple model types.""" + + def __init__(self, model_type: str, adapter_len: int, model, target_dtype=torch.float32): + """ + Initialize object. + + Args: + model_type: The transformer model type. This is used to retrieve the right method to + compute query states. + adapter_len: The length of the adaption prompt to insert. + model: The original transformer attention module that is being wrapped. + """ + if isinstance(model, _BaseAdaptedAttention): + raise ValueError("Unable to stack multiple adaption prompts") + super().__init__() + self.model_type = model_type + self.model = model + self.adapter_len = adapter_len + # Assume all parameters of the attention model we are wrapping are on the same device. + + device = next(model.parameters()).device + # Don't think this was specified in the paper, but we follow the official repo which used an Embedding + # which initializes the tokens with standard normal values. 
+ # https://github.com/ZrrSkywalker/LLaMA-Adapter/blob/41c3546fe1997ab8a65809dc8d8f9252b19d9faf/llama/model.py#L234 + # (bsz, adapter_len, hidden_size) + + if hasattr(self.model, "hidden_size"): + # TODO: remove this clause after 2026-01-01 + hidden_size = self.model.hidden_size + else: # changed in https://github.com/huggingface/transformers/pull/35235 + hidden_size = self.model.config.hidden_size + + if hasattr(self.model, "num_heads"): + # TODO: remove this clause after 2026-01-01 + self.num_heads = self.model.num_heads + else: # changed in https://github.com/huggingface/transformers/pull/35235 + self.num_heads = self.model.config.num_attention_heads + + self.adaption_prompt = nn.Parameter( + torch.empty(1, adapter_len, hidden_size, device=device, dtype=target_dtype).normal_() + ) + # Initialize the gate to 0 as this is "zero-init". + self.adaption_gate = nn.Parameter(torch.zeros(1, device=device, dtype=target_dtype)) + + +class AdaptedAttentionGPT(_BaseAdaptedAttention): + """This module wraps a GPT2Attention module and injects adaption prompts""" + + def __init__(self, model_type, adapter_len, model): + target_dtype = ( + model.c_proj.weight.dtype if model.c_proj.weight.dtype not in [torch.int8, torch.uint8] else torch.float32 + ) + super().__init__(model_type, adapter_len, model, target_dtype=target_dtype) + + def forward( + self, + hidden_states: Optional[tuple[torch.FloatTensor]], + layer_past: Optional[tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + **kwargs, + ) -> tuple[Union[torch.Tensor, tuple[torch.Tensor]], ...]: + attn_outputs = self.model( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + 
encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + **kwargs, + ) + """ + Forward pass for the adapter which wraps the GPT2Attention module + """ + + attn_output = attn_outputs[0] + add_outputs = attn_outputs[1:] + + c_attn_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].k_proj_layer + + bsz = attn_output.shape[0] + q_len = attn_output.shape[1] + embed_dim = attn_output.shape[2] + + _, key, value = getattr(self.model, c_attn_layer)(self.adaption_prompt).split(embed_dim, dim=2) + + adapter_k = ( + key.view(1, self.adapter_len, self.num_heads, self.model.head_dim).repeat(bsz, 1, 1, 1).transpose(1, 2) + ) + adapter_v = ( + value.view(1, self.adapter_len, self.num_heads, self.model.head_dim).repeat(bsz, 1, 1, 1).transpose(1, 2) + ) + # recompute query state since it is not returned by GPT2 forward + compute_query_states = TRANSFORMERS_MODEL_CONFIG[self.model_type].compute_query_states + query_states = compute_query_states( + self.model, hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states + ) + + previous_dtype = query_states.dtype + + scores = torch.matmul(query_states, adapter_k.transpose(2, 3).to(previous_dtype)) / math.sqrt( + self.model.head_dim + ) + # Upcast attention to fp32 + # (bsz, num_heads, q_len, adapter_len) + scores = self.adaption_gate * F.softmax(scores, dim=-1, dtype=torch.float32).to(previous_dtype) + # (bsz, q_len, num_heads * head_dim) + adapter_output = torch.matmul(scores, adapter_v).transpose(1, 2).reshape(bsz, q_len, -1) + + # Add adaption prompt output to original output. + hidden_state = attn_output + adapter_output + + # Restore original dtype. 
class AdaptedAttention(_BaseAdaptedAttention):
    """Wrap a Llama-style attention module and add a gated adaption-prompt term to its output."""

    def __init__(self, model_type, adapter_len, model):
        # Use the dtype of the query projection, except for int8/uint8 (quantized) weights,
        # where float32 is used instead.
        target_dtype = (
            model.q_proj.weight.dtype if model.q_proj.weight.dtype not in [torch.int8, torch.uint8] else torch.float32
        )
        super().__init__(model_type, adapter_len, model, target_dtype=target_dtype)

    def forward(self, **kwargs):
        """
        Forward pass for the adapter which wraps the original LlamaAttention module.

        The wrapped module is called with ``kwargs`` unchanged. The adaption prompt is then projected to keys and
        values, attended to by the recomputed query states, scaled by ``self.adaption_gate`` and added to the first
        element of the wrapped module's output.

        "Official" paper implementation:
        https://github.com/ZrrSkywalker/LLaMA-Adapter/blob/41c3546fe1997ab8a65809dc8d8f9252b19d9faf/llama/model.py#L141

        Args:
            kwargs: See the original LlamaAttention module.

        Returns:
            A tuple whose first element is the adapted attention output, followed by any additional outputs of the
            wrapped module, unchanged.

        Raises:
            NotImplementedError: If attention weights are requested via ``output_attentions`` (or the legacy
                spelling ``output_attention``).
        """
        # transformers names this flag `output_attentions`; the previously checked spelling `output_attention` is
        # kept as well so that existing callers behave the same.
        if kwargs.get("output_attentions", False) or kwargs.get("output_attention", False):
            raise NotImplementedError("output_attentions is not currently supported.")

        output, *_ = self.model(**kwargs)
        bsz = output.shape[0]
        q_len = output.shape[1]
        embed_dim = output.shape[2]
        k_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].k_proj_layer
        v_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].v_proj_layer
        o_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].o_proj_layer
        factor = (
            self.model.k_proj.in_features // self.model.k_proj.out_features
        )  # Mistral has different input and output dimension for k_proj and v_proj layers

        if k_proj_layer == v_proj_layer:
            # A single fused projection: split its output into (query, key, value) chunks of size embed_dim.
            _, key, value = getattr(self.model, k_proj_layer)(self.adaption_prompt).split(embed_dim, dim=2)
        else:
            key = getattr(self.model, k_proj_layer)(self.adaption_prompt)
            value = getattr(self.model, v_proj_layer)(self.adaption_prompt)

        if hasattr(self.model, "num_heads"):
            # TODO: remove this clause after 2026-01-01
            num_heads = self.model.num_heads
        else:  # changed in https://github.com/huggingface/transformers/pull/35235
            num_heads = self.model.config.num_attention_heads
        # (bsz, num_key_value_heads, adapter_len, head_dim)
        adapter_k = (
            key.view(1, self.adapter_len, (num_heads // factor), self.model.head_dim)
            .repeat(bsz, 1, 1, 1)
            .transpose(1, 2)
        )
        adapter_v = (
            value.view(1, self.adapter_len, (num_heads // factor), self.model.head_dim)
            .repeat(bsz, 1, 1, 1)
            .transpose(1, 2)
        )
        # Below is taken from https://github.com/huggingface/transformers/blob/e547458c43dfdbbb8f6a7757237e234c44e20a8f/src/transformers/models/mistral/modeling_mistral.py#L181
        # (bsz, num_heads, adapter_len, head_dim)
        adapter_k = torch.repeat_interleave(adapter_k, repeats=factor, dim=1)
        adapter_v = torch.repeat_interleave(adapter_v, repeats=factor, dim=1)
        # Recompute query states.
        compute_query_states = TRANSFORMERS_MODEL_CONFIG[self.model_type].compute_query_states
        # (bsz, num_heads, q_len, head_dim)
        query_states = compute_query_states(model=self.model, **kwargs)

        previous_dtype = query_states.dtype

        # (bsz, num_heads, q_len, adapter_len)
        scores = torch.matmul(query_states, adapter_k.transpose(2, 3).to(previous_dtype)) / math.sqrt(
            self.model.head_dim
        )
        # Upcast attention to fp32
        # (bsz, num_heads, q_len, adapter_len)
        scores = self.adaption_gate * F.softmax(scores, dim=-1, dtype=torch.float32).to(previous_dtype)
        # (bsz, q_len, num_heads * head_dim)
        adapter_output = torch.matmul(scores, adapter_v).transpose(1, 2).reshape(bsz, q_len, -1)

        # (bsz, q_len, hidden_size)
        if o_proj_layer is not None:
            adapter_output = getattr(self.model, o_proj_layer)(adapter_output)

        # Add adaption prompt output to original output.
        output = output + adapter_output

        # Restore original dtype.
        output = output.to(previous_dtype)
        return output, *_
+ - To avoid duplicated and potentially inconsistent state, the currently active adapter is always removed from the + dictionary. + - Disabling the adapter would also result in the modules being removed from the model. + """ + + def __init__(self, model, configs: dict, adapter_name: str): + super().__init__() + self.model = model + # Store adapter configs by name. + self.peft_config: dict[str, AdaptionPromptConfig] = {} + # Store lists of the parents of the affected attention modules by adapter name. + # We keep references to the parents so we can swap the adapters in-and-out of the model. + self._parents: dict[str, list[nn.Module]] = {} + # Store lists of cached AdaptedAttention modules by name. + self._cached_adapters: dict[str, list] = {} + # The name of the currently active adapter. + self._active_adapter = None + # Whether the adapter is enabled. + self._enabled = True + self.forward = self.model.forward + self.add_adapter(adapter_name, configs[adapter_name]) + self._mark_only_adaption_prompts_as_trainable(self.model) + + def add_adapter(self, adapter_name: str, config: AdaptionPromptConfig) -> None: + """Add an adapter with the given name and config.""" + config = prepare_config(config, self.model) + if adapter_name in self.peft_config: + raise ValueError(f"Adapter with name '{adapter_name}' already exists.") + + parents = [] + for name, _ in self.model.named_modules(): + if name.endswith(f".{config.target_modules}"): + par, _, _ = _get_submodules(self.model, name) + parents.append(par) + if len(parents) < config.adapter_layers: + raise ValueError( + f"Config specifies more adapter layers '{config.adapter_layers}' than the model has '{len(parents)}'." + ) + # Note that if the target modules are not in Sequential, ModuleList, or + # some other PyTorch ordered container, the behavior is undefined as we + # assume here that the order of the modules is the same as the order of + # the transformer decoder layers. 
+ parents = parents[-config.adapter_layers :] + self._parents[adapter_name] = parents + + # It is only None during initialization. + # If it is disabled, we don't have to remove the modules. + if self._active_adapter is not None and self._enabled: + self._remove_adapted_attentions(self._active_adapter) + self._active_adapter = adapter_name + self.peft_config[adapter_name] = config + self._create_adapted_attentions(config, parents) + if not self._enabled: + self._remove_adapted_attentions(self._active_adapter) + + if config.inference_mode: + _freeze_adapter(self.model, adapter_name) + + def set_adapter(self, adapter_name: str) -> None: + """Set the model to use the adapter with the given name.""" + if self._active_adapter == adapter_name: + return + if adapter_name not in self.peft_config: + raise ValueError(f"Adapter with name '{adapter_name}' does not exist.") + + if self._enabled: + self._remove_adapted_attentions(self._active_adapter) + self._set_adapted_attentions(adapter_name) + + self._active_adapter = adapter_name + + def enable_adapter_layers(self): + """Enable adapter layers by swapping in cached AdaptedAttention modules.""" + self._enabled = True + self._set_adapted_attentions(self._active_adapter) + + def disable_adapter_layers(self): + """Disable adapter layers by swapping out AdaptedAttention modules.""" + self._enabled = False + self._remove_adapted_attentions(self._active_adapter) + + def _create_adapted_attentions(self, config: AdaptionPromptConfig, parents: list[nn.Module]) -> None: + """Wrap LlamaAttention modules with newly created AdaptedAttention modules.""" + for par in parents: + if self.model.config.model_type == "gpt2": + attn = AdaptedAttentionGPT( + model_type=self.model.config.model_type, + adapter_len=config.adapter_len, + model=getattr(par, config.target_modules), + ) + + else: + attn = AdaptedAttention( + model_type=self.model.config.model_type, + adapter_len=config.adapter_len, + model=getattr(par, config.target_modules), + ) + 
setattr(par, config.target_modules, attn) + + def _set_adapted_attentions(self, adapter_name: str) -> None: + """Replace LlamaAttention modules with cached AdaptedAttention modules.""" + cached = self._cached_adapters[adapter_name] + del self._cached_adapters[adapter_name] + config = self.peft_config[adapter_name] + for i, par in enumerate(self._parents[adapter_name]): + setattr(par, config.target_modules, cached[i]) + + def _remove_adapted_attentions(self, adapter_name: str) -> None: + """Remove AdaptedAttention modules from the model and store them in the cache.""" + config = self.peft_config[adapter_name] + adapted_attentions = [] + for par in self._parents[adapter_name]: + attn = getattr(par, config.target_modules) + adapted_attentions.append(attn) + setattr(par, config.target_modules, attn.model) + self._cached_adapters[adapter_name] = adapted_attentions + + def _mark_only_adaption_prompts_as_trainable(self, model: nn.Module) -> None: + """Freeze all parameters of the model except the adaption prompts.""" + for n, p in model.named_parameters(): + if not is_adaption_prompt_trainable(n): + p.requires_grad = False + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + # This is necessary as e.g. causal models have various methods that we + # don't want to re-implement here. + if name == "model": # see #1892: prevent infinite recursion if class is not initialized + raise + return getattr(self.model, name) diff --git a/peft/src/peft/tuners/adaption_prompt/utils.py b/peft/src/peft/tuners/adaption_prompt/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..967dabf47c2af6973c4b3c8a0ba67cb957c0fceb --- /dev/null +++ b/peft/src/peft/tuners/adaption_prompt/utils.py @@ -0,0 +1,158 @@ +# Copyright 2023-present the HuggingFace Inc. team. 
def llama_rotate_half(x: torch.Tensor) -> torch.Tensor:
    """
    Rotate half the hidden dims of the input.

    The last dimension is split into two halves ``(x1, x2)`` and ``(-x2, x1)`` is returned.

    This function was duplicated verbatim from:
    https://github.com/huggingface/transformers/blob/1de8ce9ee1191ba761a593ac15d9ccbf5851bfc5/src/transformers/models/llama/modeling_llama.py#L126

    This was done to eliminate the Llama transformers implementation as a dependency of this file. Note that some other
    functions were also adapted from the transformers implementation but were modified.
    """
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def llama_apply_rotary_pos_emb(q, cos, sin, position_ids):
    """
    Apply rotary position embedding to query states in the Llama model.

    This function was adapted from:
    https://github.com/huggingface/transformers/blob/1de8ce9ee1191ba761a593ac15d9ccbf5851bfc5/src/transformers/models/llama/modeling_llama.py#L133

    It was modified to remove unnecessary processing of key states. The method is compatible with transformers <=
    4.34.2 and also with the latest version (>=4.35).

    Args:
        q: Query states to rotate.
        cos: Cosine table, either 4D (older transformers) or indexable by ``position_ids``.
        sin: Sine table with the same layout as ``cos``.
        position_ids: Positions used to select the rows of ``cos``/``sin``.

    Returns:
        The rotated query states.
    """
    # In previous transformers version cos/sin cached had a shape of 4D
    if len(cos.shape) == 4:
        gather_indices = position_ids[:, None, :, None]  # [bs, 1, seq_len, 1]
        gather_indices = gather_indices.repeat(1, cos.shape[1], 1, cos.shape[3])
        cos = torch.gather(cos.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
        sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
    # In the new version, it is 2D so we fall back to the new implementation
    # https://github.com/huggingface/transformers/blame/eef7ea98c31a333bacdc7ae7a2372bde772be8e4/src/transformers/models/llama/modeling_llama.py#L222-L226
    else:
        cos = cos[position_ids].unsqueeze(1)
        sin = sin[position_ids].unsqueeze(1)
    q_embed = (q * cos) + (llama_rotate_half(q) * sin)
    return q_embed


def llama_compute_query_states(model: nn.Module, **kwargs) -> torch.Tensor:
    """
    Compute query states for Llama models specifically. They need to be recomputed as the forward() method of the
    original LlamaModel in the transformers library does not return them. See the related discussion in the PR:
    https://github.com/huggingface/peft/pull/268

    Args:
        model: The attention module; ``q_proj``, ``k_proj``, ``v_proj``, ``head_dim`` and either ``num_heads`` or
            ``config.num_attention_heads`` are read from it, plus ``rotary_emb``/``layer_idx`` on some code paths.
        kwargs: The keyword arguments of the attention forward call. ``hidden_states`` is required;
            ``position_ids``, ``past_key_value`` and ``position_embeddings`` are used when present.

    Returns:
        The query states with rotary position embedding applied, shaped (bsz, num_heads, q_len, head_dim).
    """
    hidden_states = kwargs.get("hidden_states")
    position_ids = kwargs.get("position_ids")
    past_key_value = kwargs.get("past_key_value")
    bsz, q_len, _ = hidden_states.size()
    if hasattr(model, "num_heads"):
        # TODO: remove this clause after 2026-01-01
        num_heads = model.num_heads
    else:  # changed in https://github.com/huggingface/transformers/pull/35235
        num_heads = model.config.num_attention_heads
    query_states = model.q_proj(hidden_states).view(bsz, q_len, num_heads, model.head_dim).transpose(1, 2)

    factor = model.k_proj.in_features // model.k_proj.out_features
    # Only handed to the rotary embedding module (and used for its device); the values are not part of the result.
    value_states = model.v_proj(hidden_states).view(bsz, q_len, (num_heads // factor), model.head_dim).transpose(1, 2)

    seq_len = q_len

    if past_key_value is not None:
        if isinstance(past_key_value, tuple):
            # for transformers <= 4.35
            seq_len += past_key_value[0].shape[-2]
        else:
            # since transformers 4.36, this is a DynamicCache instance
            seq_len += past_key_value.get_seq_length(model.layer_idx)

    # model.rotary_emb is deprecated and will be removed in transformers > 4.47.0. Instead, the position embeddings are
    # passed via the kwargs
    if "position_embeddings" in kwargs:
        cos, sin = kwargs["position_embeddings"]
        cos = cos.unsqueeze(1)
        sin = sin.unsqueeze(1)
        return (query_states * cos) + (llama_rotate_half(query_states) * sin)

    # For transformers > 4.37.2 `position_ids` became a required arguments in the rotary embedding's forward pass.
    if "position_ids" not in inspect.signature(model.rotary_emb.forward).parameters:
        # TODO we assume that position_ids is not None here, not sure if that is safe but the old code also did that
        cos, sin = model.rotary_emb(value_states, seq_len=seq_len)
        return llama_apply_rotary_pos_emb(query_states, cos, sin, position_ids)

    past_seen_tokens = 0
    if position_ids is None:
        # Compute position_ids, since they are required for transformers > 4.37.2
        if past_key_value is not None:
            if hasattr(past_key_value, "get_usable_length"):
                past_seen_tokens = past_key_value.get_usable_length(q_len, model.layer_idx)
            else:
                # Tuple caches and cache objects without `get_usable_length`: reuse the cached length that was
                # already added to `seq_len` above.
                past_seen_tokens = seq_len - q_len
        # The new tokens occupy the positions right after the cached ones, i.e. they start at 0 without a cache.
        new_cache_positions = torch.arange(past_seen_tokens, past_seen_tokens + q_len, device=value_states.device)
        position_ids = new_cache_positions.unsqueeze(0)

    rotary_emb_kwargs = {"position_ids": position_ids}
    # The `seq_len` argument has been officially removed in transformers >= 4.39.0
    if "seq_len" in inspect.signature(model.rotary_emb.forward).parameters:
        rotary_emb_kwargs["seq_len"] = q_len + past_seen_tokens

    cos, sin = model.rotary_emb(value_states, **rotary_emb_kwargs)

    # For batched inference unsqueeze it on the correct dim
    # since: https://github.com/huggingface/transformers/pull/29109
    if len(cos.shape) == 3:
        cos = cos.unsqueeze(1)
        sin = sin.unsqueeze(1)

    return (query_states * cos) + (llama_rotate_half(query_states) * sin)


def gpt2_compute_query_states(
    model: nn.Module,
    hidden_states: torch.Tensor,
    encoder_hidden_states: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """
    Compute query states for GPT2 models. They need to be recomputed as the forward() method of the GPT2 attention
    module in the transformers library does not return them.

    Args:
        model: The attention module; ``c_attn`` and ``split_size`` (self-attention) or ``q_attn`` (cross-attention)
            and ``head_dim`` are read from it.
        hidden_states: The input that is projected to queries.
        encoder_hidden_states: If not ``None``, the module is treated as cross-attention and ``q_attn`` is used.

    Returns:
        The query states with the last dimension split into heads of size ``head_dim`` and the head dimension moved
        to position 1.

    Raises:
        ValueError: If ``encoder_hidden_states`` is given but the module has no ``q_attn`` weights.
    """
    if encoder_hidden_states is not None:
        if not hasattr(model, "q_attn"):
            raise ValueError(
                f"If `{model.__class__.__name__}` is used as cross attention, the weights `q_attn` must be defined. "
                f"Please make sure to instantiate it with `GPT2Attention(..., is_cross_attention=True)`."
            )
        query_states = model.q_attn(hidden_states)
    else:
        # `c_attn` is the fused projection; only its first chunk (the queries) is needed here.
        query_states, _, _ = model.c_attn(hidden_states).split(model.split_size, dim=2)

    shape_q = (*query_states.shape[:-1], -1, model.head_dim)
    query_states = query_states.view(shape_q).transpose(1, 2)

    return query_states


def is_adaption_prompt_trainable(params: str) -> bool:
    """Return True if the last dotted component of the parameter name starts with ``adaption_``."""
    return params.split(".")[-1].startswith("adaption_")
@dataclass
class BOFTConfig(PeftConfig):
    """
    This is the configuration class to store the configuration of a [`BOFTModel`].

    Args:
        boft_block_size (`int`): BOFT block size across different layers.
        boft_block_num (`int`): Number of BOFT blocks per injected layer.
        boft_n_butterfly_factor (`int`): Number of butterfly factors across different layers.
        target_modules (`Union[List[str],str]`): The names of the modules to apply the adapter to.
        exclude_modules (`Optional[Union[List[str], str]]`):
            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
            module ends with any of the passed strings.
        boft_dropout (`float`):
            The multiplicative dropout probability, by setting OFT blocks to identity during training, similar to the
            dropout layer in LoRA.
        fan_in_fan_out (`bool`): Set this to True if the layer to replace stores weight like (fan_in, fan_out).
            For example, gpt-2 uses `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set
            to `True`.
        bias (`str`): Bias type for BOFT. Can be 'none', 'all' or 'boft_only'. If 'all' or 'boft_only', the
            corresponding biases will be updated during training. Be aware that this means that, even when disabling
            the adapters, the model will not produce the same output as the base model would have without adaptation.
        modules_to_save (`List[str]`):List of modules apart from BOFT layers to be set as trainable
            and saved in the final checkpoint.
        init_weights (`bool`):
            Whether to initialize the weights of the BOFT layers with their default initialization.
        layers_to_transform (`Union[List[int],int]`):
            The layer indexes to transform, if this argument is specified, it will apply the BOFT transformations on
            the layer indexes that are specified in this list. If a single integer is passed, it will apply the BOFT
            transformations on the layer at this index.
        layers_pattern (`Optional[Union[List[str], str]]`):
            The layer pattern name, used only if `layers_to_transform` is different from `None` and if the layer
            pattern is not in the common layers pattern. This should target the `nn.ModuleList` of the model, which is
            often called `'layers'` or `'h'`.

    Raises:
        ValueError: If `layers_pattern` is given without `layers_to_transform`, or if `boft_block_size` and
            `boft_block_num` are both zero or both non-zero.
    """

    # NOTE: every "help"/"note" value below is a single string. Adjacent string literals are joined by implicit
    # concatenation; a comma between them would silently turn the value into a tuple.
    boft_block_size: int = field(
        default=4,
        metadata={
            "help": "BOFT block size across different layers.",
            "note": "You can only specify either boft_block_size or boft_block_num, but not both simultaneously, because boft_block_size x boft_block_num = layer dimension.",
        },
    )
    boft_block_num: int = field(
        default=0,
        metadata={
            "help": "Number of BOFT blocks per injected layer.",
            "note": "You can only specify either boft_block_size or boft_block_num, but not both simultaneously, because boft_block_size x boft_block_num = layer dimension.",
        },
    )
    boft_n_butterfly_factor: int = field(
        default=1,
        metadata={
            "help": "Number of butterfly factors.",
            "note": (
                "for example, boft_n_butterfly_factor=2, the effective block size of OFT becomes twice as big and the number of blocks become half."
                " note: for boft_n_butterfly_factor=1, BOFT is the same as vanilla OFT."
            ),
        },
    )
    target_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": "List of module names or regex expression of the module names to replace with BOFT.",
            "example": "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' ",
        },
    )
    exclude_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={"help": "List of module names or regex expression of the module names to exclude from BOFT."},
    )
    boft_dropout: float = field(
        default=0.0,
        metadata={
            "help": "BOFT multiplicative dropout, randomly setting blocks of OFT to be identity matrix, similar to the dropout layer in LoRA."
        },
    )
    fan_in_fan_out: bool = field(
        default=False,
        metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
    )
    bias: str = field(default="none", metadata={"help": "Bias type for BOFT. Can be 'none', 'all' or 'boft_only'"})
    modules_to_save: Optional[list[str]] = field(
        default=None,
        metadata={
            "help": "List of modules apart from BOFT layers to be set as trainable and saved in the final checkpoint. ",
            "note": (
                "For example, in Sequence Classification or Token Classification tasks, "
                "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
            ),
        },
    )
    init_weights: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether to initialize the weights of the BOFT layers with their default initialization. Don't change "
                "this setting, except if you know exactly what you're doing."
            ),
        },
    )
    layers_to_transform: Optional[Union[list[int], int]] = field(
        default=None,
        metadata={
            "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index."
        },
    )
    layers_pattern: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern. "
            "This should target the `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`."
        },
    )

    def __post_init__(self):
        """Normalize list arguments to sets and validate the block size / block number combination."""
        super().__post_init__()
        self.peft_type = PeftType.BOFT
        self.target_modules = (
            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
        )
        self.exclude_modules = (
            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
        )
        # check for layers_to_transform and layers_pattern
        if self.layers_pattern and not self.layers_to_transform:
            raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ")
        if self.boft_block_size == 0 and self.boft_block_num == 0:
            raise ValueError(
                f"Either `boft_block_size` or `boft_block_num` must be non-zero. Currently, boft_block_size = {self.boft_block_size} and boft_block_num = {self.boft_block_num}."
            )
        # Exactly one of the two must be non-zero; the both-zero case was rejected above, so this catches both set.
        if not ((self.boft_block_size != 0) ^ (self.boft_block_num != 0)):
            raise ValueError(
                f"You can only specify either boft_block_size ({self.boft_block_size}) or boft_block_num ({self.boft_block_num}), but not both simultaneously, because boft_block_size x boft_block_num == in_features."
            )
// Author: Yao Feng
// Date: 2023/08
// Description: cuda kernel for fast block diag

#include <torch/extension.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>

namespace {

// Copies every b x b block of `input` ([z, N, b, b], contiguous) onto the diagonal of the
// corresponding matrix in `output` ([z, N*b, N*b], contiguous). One thread handles one input
// element; entries outside the diagonal blocks are never written.
// 64-bit indices are used because z * (N*b)^2 can exceed the range of a 32-bit int.
template <typename scalar_t>
__global__ void forward_fast_block_diag_cuda_kernel(
    const scalar_t* __restrict__ input,  // [z, N, b, b]
    scalar_t* output,                    // [z, N*b, N*b]
    const int64_t z, const int64_t N, const int64_t b) {
  const int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i >= z * N * b * b) {
    return;
  }
  const int64_t zi = i / (N * b * b);
  const int64_t Ni = (i % (N * b * b)) / (b * b);
  const int64_t x = ((i % (N * b * b)) % (b * b)) / b;
  const int64_t y = ((i % (N * b * b)) % (b * b)) % b;

  output[zi * N * b * N * b + (Ni * b + x) * N * b + Ni * b + y] =
      input[zi * N * b * b + Ni * b * b + x * b + y];
}

// Inverse gather of the forward kernel: reads the diagonal blocks of `grad_output`
// ([z, N*b, N*b], contiguous) into `grad_input` ([z, N, b, b], contiguous).
template <typename scalar_t>
__global__ void backward_fast_block_diag_cuda_kernel(
    const scalar_t* __restrict__ grad_output,  // [z, N*b, N*b]
    scalar_t* grad_input,                      // [z, N, b, b]
    const int64_t z, const int64_t N, const int64_t b) {
  const int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i >= z * N * b * b) {
    return;
  }
  const int64_t zi = i / (N * b * b);
  const int64_t Ni = (i % (N * b * b)) / (b * b);
  const int64_t x = ((i % (N * b * b)) % (b * b)) / b;
  const int64_t y = ((i % (N * b * b)) % (b * b)) % b;

  grad_input[zi * N * b * b + Ni * b * b + x * b + y] =
      grad_output[zi * N * b * N * b + (Ni * b + x) * N * b + Ni * b + y];
}

constexpr int kThreadsPerBlock = 512;

// Validates that `input` is a CUDA tensor of shape [z, N, b, b].
void check_block_input(const at::Tensor& input) {
  TORCH_CHECK(input.is_cuda(), "fast_block_diag: input must be a CUDA tensor");
  TORCH_CHECK(input.dim() == 4, "fast_block_diag: input must be 4D [z, N, b, b], got ", input.dim(), "D");
  TORCH_CHECK(input.size(2) == input.size(3), "fast_block_diag: the last two dimensions of input must be equal");
}

}  // namespace

// Builds the block-diagonal matrices for `input` ([z, N, b, b]); returns {output} with
// output of shape [z, N*b, N*b] and zeros outside the diagonal blocks.
std::vector<at::Tensor> forward_fast_block_diag_cuda(
    at::Tensor input
    ) {
  check_block_input(input);
  // Launch on the device that owns the tensor, not on whatever device is current.
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  // The kernel indexes raw memory, so a dense layout is required.
  input = input.contiguous();

  const auto z = input.size(0);
  const auto N = input.size(1);
  const auto b = input.size(2);
  const int64_t total = z * N * b * b;

  // initialize output
  auto output = at::zeros({z, N * b, N * b}, input.options());

  if (total > 0) {
    const dim3 blocks(static_cast<unsigned int>((total + kThreadsPerBlock - 1) / kThreadsPerBlock));
    const auto stream = at::cuda::getCurrentCUDAStream();
    AT_DISPATCH_FLOATING_TYPES_AND2(
        at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "forward_fast_block_diag", ([&] {
          forward_fast_block_diag_cuda_kernel<scalar_t><<<blocks, kThreadsPerBlock, 0, stream>>>(
              input.data_ptr<scalar_t>(),
              output.data_ptr<scalar_t>(),
              z, N, b);
        }));

    // Surface launch failures as an exception instead of printing and returning garbage.
    const cudaError_t err = cudaGetLastError();
    TORCH_CHECK(err == cudaSuccess, "Error in forward_fast_block_diag_cuda_kernel: ", cudaGetErrorString(err));
  }

  return {output};
}

// Gradient of forward_fast_block_diag_cuda: `grad_output` is [z, N*b, N*b]; `input` is only
// consulted for its shape and options. Returns {grad_input} shaped like `input`.
std::vector<at::Tensor> backward_fast_block_diag_cuda(
    at::Tensor grad_output,
    at::Tensor input
    ) {
  check_block_input(input);
  TORCH_CHECK(grad_output.is_cuda(), "fast_block_diag: grad_output must be a CUDA tensor");

  const auto z = input.size(0);
  const auto N = input.size(1);
  const auto b = input.size(2);
  const int64_t total = z * N * b * b;

  TORCH_CHECK(
      grad_output.dim() == 3 && grad_output.size(0) == z && grad_output.size(1) == N * b &&
          grad_output.size(2) == N * b,
      "fast_block_diag: grad_output must have shape [z, N*b, N*b]");

  const at::cuda::OptionalCUDAGuard device_guard(device_of(grad_output));
  // Incoming gradients are not guaranteed to be dense; the kernel needs a contiguous buffer.
  grad_output = grad_output.contiguous();

  // initialize grad input
  auto grad_input = at::zeros(input.sizes(), grad_output.options());

  if (total > 0) {
    const dim3 blocks(static_cast<unsigned int>((total + kThreadsPerBlock - 1) / kThreadsPerBlock));
    const auto stream = at::cuda::getCurrentCUDAStream();
    AT_DISPATCH_FLOATING_TYPES_AND2(
        at::ScalarType::Half, at::ScalarType::BFloat16, grad_output.scalar_type(), "backward_fast_block_diag", ([&] {
          backward_fast_block_diag_cuda_kernel<scalar_t><<<blocks, kThreadsPerBlock, 0, stream>>>(
              grad_output.data_ptr<scalar_t>(),
              grad_input.data_ptr<scalar_t>(),
              z, N, b);
        }));

    const cudaError_t err = cudaGetLastError();
    TORCH_CHECK(err == cudaSuccess, "Error in backward_fast_block_diag_cuda_kernel: ", cudaGetErrorString(err));
  }

  return {grad_input};
}
# adapted from accelerate's `patch_environment`
@contextmanager
def patch_environment(**kwargs):
    """
    A context manager that will add each keyword argument passed to `os.environ` and remove them when exiting.

    Will convert the values in `kwargs` to strings and upper-case all the keys. Variables that already existed are
    restored to their previous value on exit; the others are removed. The clean-up also runs when the body of the
    `with` block raises.

    Example:

    ```python
    >>> import os
    >>> from accelerate.utils import patch_environment

    >>> with patch_environment(FOO="bar"):
    ...     print(os.environ["FOO"])  # prints "bar"
    >>> print(os.environ["FOO"])  # raises KeyError
    ```
    """
    existing_vars = {}
    for key, value in kwargs.items():
        key = key.upper()
        if key in os.environ:
            existing_vars[key] = os.environ[key]
        os.environ[key] = str(value)

    try:
        yield
    finally:
        # Without `finally`, an exception inside the `with` body would skip this block and leave the patched
        # values in the process environment.
        for key in kwargs:
            key = key.upper()
            if key in existing_vars:
                # restore previous value
                os.environ[key] = existing_vars[key]
            else:
                os.environ.pop(key, None)
Specify gcc 7 + except Exception as e: + warnings.warn(f"Failed to load the CUDA extension: {e}, check if ninja is available.") + warnings.warn("Setting boft_n_butterfly_factor to 1 to speed up the finetuning process.") + fbd_cuda = None + + _FBD_CUDA = fbd_cuda + return _FBD_CUDA + + +class FastBlockDiag(Function): + """ + Implements a custom autograd Function for a fast block diagonal operation using CUDA. + + This function is optimized for 4D tensors where the last two dimensions are equal, representing block diagonal + matrices for efficient computation on CUDA devices. + """ + + @staticmethod + def forward(ctx, input): + """ + The forward method for FastBlockDiag. + + Computes the block diagonal operation on the input tensor using a CUDA-optimized function. This method assumes + that the input is a 4D tensor where the last two dimensions are equal, which represent the blocks to be + diagonalized. + + Parameters: + ctx: A context object that can be used to stash information for backward computation. + input (Tensor): The input tensor of shape (N, D, H, H), where `N` is the batch size, + `D` represents one additional dimension (In BOFT, the number of BOFT blocks), and `H` is the + size of the square blocks along the last two dimensions (In BOFT, the block size). + + Returns: + Tensor: The resulting tensor after applying the block diagonal operation, + will have the shape (N, DxH, DxH). + """ + output = get_fbd_cuda().forward(input)[0] + ctx.save_for_backward(input) + return output + + @staticmethod + def backward(ctx, grad_output): + (input,) = ctx.saved_tensors + grad_input = get_fbd_cuda().backward(grad_output, input)[0] + return grad_input + + +class MultiplicativeDropoutLayer(nn.Module): + """ + Implements the multiplicative dropout layer for BOFT. + """ + + def __init__(self, p=0.0): + """ + Initializes the multiplicative dropout layer. + + Parameters: + p (float): The probability of dropping out a block. Defaults to 0.0. 
+ """ + super().__init__() + self.p = p + + def forward(self, x): + """ + Applies multiplicative dropout to the input tensor. + + Parameters: + x (Tensor): The input tensor of shape (N, D, H, H), where `N` is the batch size, `D` represents + one additional dimension (In BOFT, the number of BOFT blocks), and `H` is the size of the square + blocks along the last two dimensions (In BOFT, the block size). + """ + if self.training: + # Ensure the last two dimensions are the same + if x.shape[-1] != x.shape[-2]: + raise ValueError("The last two dimensions of input should be the same!") + + N, D, H, _ = x.shape + + # Randomly select one from N + n_random = torch.randint(0, N, (1,)).item() + + # Create a mask with 1s for matrices to be replaced with identity and 0s otherwise + num_to_replace = int(self.p * D) + num_zeros = D - num_to_replace + + # Generate a flat tensor with desired number of 1s and 0s + mask = torch.cat([torch.ones(num_to_replace, device=x.device), torch.zeros(num_zeros, device=x.device)]) + + # Shuffle and reshape the mask + mask = mask[torch.randperm(D)].view(1, D, 1, 1) + + full_mask = torch.zeros(N, D, 1, 1, device=x.device) + full_mask[n_random] = mask + + # Use the mask to combine original matrices and identity matrices + eye_matrix = torch.eye(H, device=x.device).repeat(N, D, 1, 1) + x = (1 - full_mask) * x + full_mask * eye_matrix + return x + + +class BOFTLayer(BaseTunerLayer): + """ + Implements the BOFT layer. + """ + + # All names of layers that may contain (trainable) adapter weights + adapter_layer_names = ("boft_R", "boft_s") + # All names of other parameters that may contain adapter-related parameters + other_param_names = ("boft_block_size", "boft_block_num", "boft_dropout") + + def __init__(self, base_layer: nn.Module, **kwargs) -> None: + """ + Initializes the BOFT layer. + + Note, currently only support linear layer and convolutional layer, with further support for other layers to be + added soon. 
+ + Parameters: + base_layer: the pretrained model layer + """ + self.base_layer = base_layer + self.boft_block_size = {} + self.boft_block_num = {} + self.boft_dropout = nn.ModuleDict({}) + self.boft_R = nn.ParameterDict({}) + self.boft_s = nn.ParameterDict({}) + # Mark the weight as unmerged + self._disable_adapters = False + self.merged_adapters = [] + # flag to enable/disable casting of input to weight dtype during forward call + self.cast_input_dtype_enabled = True + self.kwargs = kwargs + + base_layer = self.get_base_layer() + + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + elif isinstance(base_layer, nn.Conv2d): + in_features, out_features = base_layer.in_channels, base_layer.out_channels + else: + raise ValueError(f"Unsupported layer type {type(base_layer)}") + + self.in_features = in_features + self.out_features = out_features + + def set_scale(self, adapter, scale): + if adapter not in self.scaling: + # Ignore the case where the adapter is not in the layer + return + + warnings.warn("Scaling operation for BOFT not supported! Automatically set scale to 1.") + + def scale_layer(self, scale: float) -> None: + if scale == 1: + return + + for active_adapter in self.active_adapters: + if active_adapter not in self.boft_R.keys(): + continue + + warnings.warn("Scaling operation for BOFT not supported! Automatically set scale to 1.") + + def unscale_layer(self, scale=None) -> None: + for active_adapter in self.active_adapters: + if active_adapter not in self.boft_R.keys(): + continue + + warnings.warn("Unscaling operation for BOFT not supported! Keeping scale to 1.") + + def update_layer( + self, + adapter_name, + boft_block_size, + boft_block_num, + boft_n_butterfly_factor, + boft_dropout, + init_weights, + inference_mode: bool = False, + **kwargs, + ): + """ + Update the linear layer with trainable BOFT weights. Override for other layer types. 
+ """ + # Attempt to load the CUDA extension during model initialization + if not get_fbd_cuda(): + self.fbd_cuda_available = False + # If the CUDA extension is not available, set the butterfly factor to 1 to speed up the finetuning process + boft_n_butterfly_factor = 1 + else: + self.fbd_cuda_available = True + + # to be consistent with the paper notation + boft_n_butterfly_factor = boft_n_butterfly_factor - 1 + if boft_n_butterfly_factor < 0: + raise ValueError( + f"You can only specify boft_n_butterfly_factor {boft_n_butterfly_factor + 1} to be a positive integer number." + ) + + # Initialize the MultiplicativeDropoutLayer for boft_dropout > 0.0. + if boft_dropout > 0.0: + boft_dropout_layer = MultiplicativeDropoutLayer(p=boft_dropout) + else: + boft_dropout_layer = nn.Identity() + self.boft_dropout.update(nn.ModuleDict({adapter_name: boft_dropout_layer})) + + if boft_block_size == 0 and boft_block_num != 0: + if self.in_features % boft_block_num != 0: + raise ValueError( + f"in_features ({self.in_features}) must be divisible by boft_block_num ({boft_block_num})!" + ) + + if boft_n_butterfly_factor != 0: + if boft_n_butterfly_factor > int(math.log2(boft_block_num)): + raise ValueError( + f"Invalid combination of boft_n_butterfly_factor ({boft_n_butterfly_factor + 1}) and boft_block_num ({boft_block_num})!" + ) + if boft_block_num % (2**boft_n_butterfly_factor) != 0: + raise ValueError( + f"boft_block_num ({boft_block_num}) must be a multiple of 2 raised to the power of boft_n_butterfly_factor ({boft_n_butterfly_factor + 1})!" + ) + + boft_block_size = int(self.in_features // boft_block_num) + + elif boft_block_size != 0 and boft_block_num == 0: + if self.in_features % boft_block_size != 0: + raise ValueError( + f"in_features ({self.in_features}) must be divisible by boft_block_size ({boft_block_size})!" 
+ ) + + if boft_n_butterfly_factor != 0: + if self.in_features < (boft_block_size * (2**boft_n_butterfly_factor)): + raise ValueError( + f"Invalid combination of in_features ({self.in_features}), boft_n_butterfly_factor ({boft_n_butterfly_factor + 1}) and boft_block_size ({boft_block_size})!" + ) + if self.in_features % (boft_block_size * (2**boft_n_butterfly_factor)) != 0: + raise ValueError( + f"Invalid combination of in_features ({self.in_features}), boft_n_butterfly_factor ({boft_n_butterfly_factor + 1}) and boft_block_size ({boft_block_size})!" + ) + + boft_block_num = int(self.in_features // boft_block_size) + + else: + raise ValueError( + "Something went wrong, please report this error: https://github.com/huggingface/peft/issues" + ) + + # In OFT you can specify the number of blocks to be 1 + if boft_n_butterfly_factor != 0: + if boft_block_num % 2 != 0: + raise ValueError(f"boft_block_num ({boft_block_num}) must be an even number!") + + if boft_block_size % 2 != 0: + raise ValueError(f"boft_block_size ({boft_block_size}) must be an even number!") + + # If there is no butterfly factor, then permutation matrix P will be an identity matrix. 
+ P = torch.empty((boft_n_butterfly_factor + 1, self.in_features, self.in_features)) + for i in range(boft_n_butterfly_factor + 1): + perm = self.block_butterfly_perm( + self.in_features, int(boft_block_num / (2 ** (i))), int(boft_block_size / 2), boft_n_butterfly_factor + ) + perm_mat = self.perm2mat(perm) + P[i] = perm_mat + + self.register_buffer("boft_P", P, persistent=False) + + self.boft_R[adapter_name] = nn.Parameter( + torch.zeros(boft_n_butterfly_factor + 1, boft_block_num, boft_block_size, boft_block_size) + ) + self.boft_s[adapter_name] = nn.Parameter(torch.ones(int(self.out_features), 1)) + + self.reset_boft_parameters(adapter_name, init_weights) + + # set the boft block size and number + self.boft_block_size[adapter_name] = boft_block_size + self.boft_block_num[adapter_name] = boft_block_num + + self._move_adapter_to_device_of_base_layer(adapter_name) + self.set_adapter(self.active_adapters, inference_mode=inference_mode) + + def reset_boft_parameters(self, adapter_name, init_weights): + """ + Reset the BOFT parameters. + """ + if init_weights is False: + nn.init.normal_(self.boft_R[adapter_name], mean=0.0, std=0.1) + nn.init.normal_(self.boft_s[adapter_name], mean=1.0, std=0.1) + return + + if adapter_name in self.boft_R.keys(): + if init_weights is True: + # initialize R to zero + nn.init.zeros_(self.boft_R[adapter_name]) + nn.init.ones_(self.boft_s[adapter_name]) + else: + raise ValueError(f"Unknown initialization {init_weights=}") + + def perm2mat(self, indices): + """ + Convert permutation indices to permutation matrix. + + Args: + indices: A list of indices representing the permutation. 
+ """ + # Number of indices determines the size of the square matrix + n = len(indices) + + # Initialize a matrix of zeros + perm_mat = torch.zeros((n, n)) + + # Set the 1s according to the indices + for i, idx in enumerate(indices): + perm_mat[i, idx] = 1 + + return perm_mat + + def block_butterfly_perm(self, n, b, r=3, n_butterfly_factor=1): + """ + Define the permutation matrix for the block butterfly permutation. + + Args: + n: size of the permutation matrix + b: desired number of blocks after multiplying with the permutation matrix + r: base block size of the block diagonal matrix, e.g. 2x2, 3x3, 5x5 etc. + """ + + if n_butterfly_factor == 0: + return torch.arange(n) + + if b * r * 2 > n: + raise ValueError("Invalid number of blocks!") + + block_size = int(n // b) + indices = torch.arange(n) + + def sort_block(b, r): + step = b / r + initial_order = torch.arange(b) + sorted_order = torch.empty(b, dtype=torch.long) + + evens = torch.arange(0, step, 2) + odds = torch.arange(1, step, 2) + sorted_seq = torch.cat((evens, odds), dim=0) + for i, pos in enumerate(sorted_seq): + sorted_order[int(i * r) : int(i * r + r)] = initial_order[int(pos * r) : int(pos * r + r)] + return sorted_order + + sorted_order = sort_block(block_size, r) + + for i in range(0, n, block_size): + block_end = i + block_size + tmp_indices = indices[i:block_end] + indices[i:block_end] = tmp_indices[sorted_order] + return indices + + def cayley_batch(self, data): + """ + Perform the Cayley parametrization on a batch of skew-symmetric matrices. + + Args: + data: A batch of skew-symmetric matrices of shape (b, r, c). 
+ """ + b, r, c = data.shape + # Ensure the input matrix is skew-symmetric + skew_mat = 0.5 * (data - data.transpose(1, 2)) + id_mat = torch.eye(r, device=data.device).unsqueeze(0).expand(b, r, c) + + # Perform the Cayley parametrization + Q = torch.linalg.solve(id_mat + skew_mat, id_mat - skew_mat, left=False) + + return Q + + +class Linear(nn.Module, BOFTLayer): + """ + BOFT implemented in a dense layer. + """ + + def __init__( + self, + base_layer, + adapter_name: str, + boft_block_size: int = 8, + boft_block_num: int = 0, + boft_n_butterfly_factor: int = 0, + boft_dropout: float = 0.1, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + init_weights: Union[bool, str] = True, + is_target_conv_1d_layer: bool = False, + **kwargs, + ) -> None: + super().__init__() + BOFTLayer.__init__(self, base_layer, **kwargs) + self.fan_in_fan_out = fan_in_fan_out + + self._active_adapter = adapter_name + + self.update_layer( + adapter_name, boft_block_size, boft_block_num, boft_n_butterfly_factor, boft_dropout, init_weights + ) + self.is_target_conv_1d_layer = is_target_conv_1d_layer + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. 
+ """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.boft_R.keys(): + base_layer = self.get_base_layer() + orig_dtype = base_layer.weight.dtype + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weight = base_layer.weight.data.clone() + butterfly_oft_mat, boft_s = self.get_delta_weight(active_adapter) + orig_weight = torch.transpose(orig_weight, 0, 1) + orig_weight = torch.mm(butterfly_oft_mat, orig_weight.to(butterfly_oft_mat.dtype)) + orig_weight = torch.transpose(orig_weight, 0, 1) + orig_weight = orig_weight * boft_s + + if not torch.isfinite(orig_weight).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + self.base_layer.weight.data = orig_weight.contiguous().to(orig_dtype) + else: + butterfly_oft_mat, boft_s = self.get_delta_weight(active_adapter) + orig_weight = base_layer.weight.data.clone() + orig_weight = torch.transpose(orig_weight, 0, 1) + orig_weight = torch.mm(butterfly_oft_mat, orig_weight.to(butterfly_oft_mat.dtype)) + orig_weight = torch.transpose(orig_weight, 0, 1) + orig_weight = orig_weight * boft_s + + self.base_layer.weight.data = orig_weight.contiguous().to(orig_dtype) + + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. 
Nothing to do.") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + base_layer = self.get_base_layer() + orig_dtype = base_layer.weight.dtype + if active_adapter in self.boft_R.keys(): + butterfly_oft_mat, boft_s = self.get_delta_weight(active_adapter) + + orig_weight = base_layer.weight.data.clone() + orig_weight = torch.transpose(orig_weight, 0, 1) + orig_weight = torch.mm(butterfly_oft_mat.t(), orig_weight.to(butterfly_oft_mat.dtype)) + orig_weight = torch.transpose(orig_weight, 0, 1) + + base_layer.weight.data = (orig_weight * (1 / boft_s)).to(orig_dtype) + + def get_delta_weight(self, adapter) -> tuple[torch.Tensor, torch.Tensor]: + """ + Compute the delta weight for the given adapter. + + Args: + adapter (str): + The name of the adapter for which the delta weight should be computed. + """ + boft_R = self.boft_R[adapter] + boft_s = self.boft_s[adapter] + + N, D, H, _ = boft_R.shape + boft_R = boft_R.view(N * D, H, H) + orth_rotate_butterfly = self.cayley_batch(boft_R) + orth_rotate_butterfly = orth_rotate_butterfly.view(N, D, H, H) + if self.fbd_cuda_available: + block_diagonal_butterfly = FastBlockDiag.apply(orth_rotate_butterfly) + else: + orth_rotate_butterfly = orth_rotate_butterfly.squeeze(0) + block_diagonal_butterfly = torch.block_diag(*torch.unbind(orth_rotate_butterfly)) + block_diagonal_butterfly = block_diagonal_butterfly.unsqueeze(0) + + boft_P = self.boft_P.to(block_diagonal_butterfly.device) + butterfly_oft_mat_batch = torch.bmm(block_diagonal_butterfly, boft_P.permute(0, 2, 1)) + butterfly_oft_mat_batch = torch.bmm(boft_P, butterfly_oft_mat_batch) + butterfly_oft_mat = butterfly_oft_mat_batch[0] + + for i in range(1, butterfly_oft_mat_batch.shape[0]): + butterfly_oft_mat = butterfly_oft_mat_batch[i] @ butterfly_oft_mat + + return butterfly_oft_mat, boft_s + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if 
self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + boft_rotation = torch.eye(self.in_features, device=x.device, dtype=previous_dtype) + boft_scale = torch.ones((int(self.out_features), 1), device=x.device, dtype=previous_dtype) + + for active_adapter in self.active_adapters: + if active_adapter not in self.boft_R.keys(): + continue + boft_R = self.boft_R[active_adapter] + boft_s = self.boft_s[active_adapter] + dropout = self.boft_dropout[active_adapter] + + N, D, H, _ = boft_R.shape + boft_R = boft_R.view(N * D, H, H) + orth_rotate_butterfly = self.cayley_batch(boft_R) + orth_rotate_butterfly = orth_rotate_butterfly.view(N, D, H, H) + orth_rotate_butterfly = dropout(orth_rotate_butterfly) + if self.fbd_cuda_available: + block_diagonal_butterfly = FastBlockDiag.apply(orth_rotate_butterfly) + else: + orth_rotate_butterfly = orth_rotate_butterfly.squeeze(0) + block_diagonal_butterfly = torch.block_diag(*torch.unbind(orth_rotate_butterfly)) + block_diagonal_butterfly = block_diagonal_butterfly.unsqueeze(0) + + # The BOFT author's cayley_batch, dropout and FastBlockDiag ONLY return fp32 outputs. 
+ boft_P = self.boft_P.to(x) + block_diagonal_butterfly = block_diagonal_butterfly.to(x) + butterfly_oft_mat_batch = torch.bmm(block_diagonal_butterfly, boft_P.permute(0, 2, 1)) + butterfly_oft_mat_batch = torch.bmm(boft_P, butterfly_oft_mat_batch) + butterfly_oft_mat = butterfly_oft_mat_batch[0] + + for i in range(1, butterfly_oft_mat_batch.shape[0]): + butterfly_oft_mat = butterfly_oft_mat_batch[i] @ butterfly_oft_mat + + boft_rotation = butterfly_oft_mat @ boft_rotation + boft_scale = boft_s * boft_scale + + x = x.to(self.get_base_layer().weight.data.dtype) + + orig_weight = self.get_base_layer().weight.data + orig_weight = torch.transpose(orig_weight, 0, 1) + boft_rotation = boft_rotation.to(previous_dtype) + orig_weight = orig_weight.to(previous_dtype) + rotated_weight = torch.mm(boft_rotation, orig_weight) + rotated_weight = torch.transpose(rotated_weight, 0, 1) + + scaled_rotated_weight = rotated_weight * boft_scale + + scaled_rotated_weight = scaled_rotated_weight.to(previous_dtype) + if self.base_layer.bias is not None: + self.base_layer.bias = self.base_layer.bias.to(previous_dtype) + result = F.linear(input=x, weight=scaled_rotated_weight, bias=self.base_layer.bias) + + result = result.to(previous_dtype) + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "boft." + rep + + +class Conv2d(nn.Module, BOFTLayer): + """ + BOFT implemented in a Conv2d layer. 
+ """ + + def __init__( + self, + base_layer: nn.Module, + adapter_name: str, + boft_block_size: int = 8, + boft_block_num: int = 0, + boft_n_butterfly_factor: int = 0, + boft_dropout: float = 0.1, + init_weights: Union[bool, str] = True, + **kwargs, + ) -> None: + super().__init__() + BOFTLayer.__init__(self, base_layer) + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, boft_block_size, boft_block_num, boft_n_butterfly_factor, boft_dropout, init_weights + ) + + def update_layer( + self, + adapter_name, + boft_block_size, + boft_block_num, + boft_n_butterfly_factor, + boft_dropout, + init_weights, + inference_mode: bool = False, + **kwargs, + ): + """ + Update the conv2d layer with trainable BOFT weights. + """ + + # Attempt to load the CUDA extension during model initialization + if not get_fbd_cuda(): + self.fbd_cuda_available = False + # If the CUDA extension is not available, set the butterfly factor to 1 to speed up the finetuning process + boft_n_butterfly_factor = 1 + else: + self.fbd_cuda_available = True + + # to be consistent with the paper notation + boft_n_butterfly_factor = boft_n_butterfly_factor - 1 + if boft_n_butterfly_factor < 0: + raise ValueError( + f"You can only specify boft_n_butterfly_factor {boft_n_butterfly_factor + 1} to be a positive integer number." + ) + + # Initialize the MultiplicativeDropoutLayer for boft_dropout > 0.0. + if boft_dropout > 0.0: + boft_dropout_layer = MultiplicativeDropoutLayer(p=boft_dropout) + else: + boft_dropout_layer = nn.Identity() + self.boft_dropout.update(nn.ModuleDict({adapter_name: boft_dropout_layer})) + + # layer information from the base layer + base_layer = self.get_base_layer() + conv_filter_dim = self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0] + + # Initialize the BOFT parameters. 
+ if boft_block_size == 0 and boft_block_num != 0: + if conv_filter_dim % boft_block_num != 0: + raise ValueError( + f"Convolutional kernel dimension ({conv_filter_dim}) must be divisible by boft_block_num ({boft_block_num})!" + ) + + if boft_n_butterfly_factor != 0: + if boft_n_butterfly_factor > int(math.log2(boft_block_num)): + raise ValueError( + f"Invalid combination of boft_n_butterfly_factor ({boft_n_butterfly_factor + 1}) and boft_block_num ({boft_block_num})!" + ) + if boft_block_num % (2**boft_n_butterfly_factor) != 0: + raise ValueError( + f"boft_block_num ({boft_block_num}) must be a multiple of 2 raised to the power of boft_n_butterfly_factor ({boft_n_butterfly_factor + 1})!" + ) + + boft_block_size = int(conv_filter_dim // boft_block_num) + + elif boft_block_size != 0 and boft_block_num == 0: + if conv_filter_dim % boft_block_size != 0: + raise ValueError( + f"Convolutional kernel dimension ({conv_filter_dim}) must be divisible by boft_block_size ({boft_block_size})!" + ) + + if boft_n_butterfly_factor != 0: + if conv_filter_dim < (boft_block_size * (2**boft_n_butterfly_factor)): + raise ValueError( + f"Invalid combination of convolutional kernel dimension ({conv_filter_dim}), boft_n_butterfly_factor ({boft_n_butterfly_factor + 1}) and boft_block_size ({boft_block_size})!" + ) + if conv_filter_dim % (boft_block_size * (2**boft_n_butterfly_factor)) != 0: + raise ValueError( + f"Invalid combination of convolutional kernel dimension ({conv_filter_dim}), boft_n_butterfly_factor ({boft_n_butterfly_factor + 1}) and boft_block_size ({boft_block_size})!" 
+ ) + + boft_block_num = int(conv_filter_dim // boft_block_size) + + else: + raise ValueError( + "Something went wrong, please report this error: https://github.com/huggingface/peft/issues" + ) + + # In OFT you can specify the number of blocks to be 1 + if boft_n_butterfly_factor != 0: + if boft_block_num % 2 != 0: + raise ValueError(f"boft_block_num ({boft_block_num}) must be an even number!") + + if boft_block_size % 2 != 0: + raise ValueError(f"boft_block_size ({boft_block_size}) must be an even number!") + + # If there is no butterfly factor, then permutation matrix P will be an identity matrix. + P = torch.empty((boft_n_butterfly_factor + 1, conv_filter_dim, conv_filter_dim)) + for i in range(boft_n_butterfly_factor + 1): + perm = self.block_butterfly_perm( + conv_filter_dim, int(boft_block_num / (2 ** (i))), int(boft_block_size / 2), boft_n_butterfly_factor + ) + perm_mat = self.perm2mat(perm) + P[i] = perm_mat + + self.register_buffer("boft_P", P, persistent=False) + + self.boft_R[adapter_name] = nn.Parameter( + torch.zeros(boft_n_butterfly_factor + 1, boft_block_num, boft_block_size, boft_block_size) + ) + self.boft_s[adapter_name] = nn.Parameter(torch.ones(1, int(self.out_features))) + + self.reset_boft_parameters(adapter_name, init_weights) + + # set the boft block size and number + self.boft_block_size[adapter_name] = boft_block_size + self.boft_block_num[adapter_name] = boft_block_num + + self._move_adapter_to_device_of_base_layer(adapter_name) + self.set_adapter(self.active_adapters, inference_mode=inference_mode) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. 
+ adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.boft_R.keys(): + base_layer = self.get_base_layer() + orig_dtype = base_layer.weight.dtype + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weight = base_layer.weight.data.clone() + butterfly_oft_mat, boft_s = self.get_delta_weight(active_adapter) + + orig_weight = orig_weight.view( + self.out_features, self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0] + ) + orig_weight = torch.transpose(orig_weight, 0, 1) + orig_weight = torch.mm(butterfly_oft_mat, orig_weight.to(butterfly_oft_mat.dtype)) + orig_weight = torch.transpose(orig_weight, 0, 1) + orig_weight = orig_weight * boft_s + orig_weight = orig_weight.view( + self.out_features, self.in_features, base_layer.kernel_size[0], base_layer.kernel_size[0] + ) + + self.base_layer.weight.data = orig_weight.contiguous().to(orig_dtype) + else: + butterfly_oft_mat, boft_s = self.get_delta_weight(active_adapter) + + orig_weight = base_layer.weight.data.clone() + orig_weight = orig_weight.view( + self.out_features, self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0] + ) + orig_weight = torch.transpose(orig_weight, 0, 1) + orig_weight = torch.mm(butterfly_oft_mat, orig_weight.to(butterfly_oft_mat.dtype)) + orig_weight = torch.transpose(orig_weight, 0, 1) + orig_weight = orig_weight * boft_s + orig_weight = orig_weight.view( + self.out_features, self.in_features, base_layer.kernel_size[0], base_layer.kernel_size[0] + ) + + self.base_layer.weight.data = orig_weight.contiguous().to(orig_dtype) + + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> 
None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + base_layer = self.get_base_layer() + orig_dtype = base_layer.weight.dtype + if active_adapter in self.boft_R.keys(): + butterfly_oft_mat, boft_s = self.get_delta_weight(active_adapter) + + orig_weight = base_layer.weight.data.clone() + orig_weight = orig_weight.view( + self.out_features, + self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0], + ) + orig_weight = torch.transpose(orig_weight, 0, 1) + orig_weight = torch.mm(butterfly_oft_mat.t(), orig_weight.to(butterfly_oft_mat.dtype)) + orig_weight = torch.transpose(orig_weight, 0, 1) + orig_weight = orig_weight * (1 / boft_s) + orig_weight = orig_weight.view( + self.out_features, + self.in_features, + base_layer.kernel_size[0], + base_layer.kernel_size[0], + ) + + base_layer.weight.data = orig_weight.to(orig_dtype) + + def get_delta_weight(self, adapter) -> tuple[torch.Tensor, torch.Tensor]: + """ + Compute the delta weight for the given adapter. + + Args: + adapter (str): + The name of the adapter for which the delta weight should be computed. 
+ """ + + boft_R = self.boft_R[adapter] + boft_s = self.boft_s[adapter].transpose(0, 1) + + N, D, H, _ = boft_R.shape + boft_R = boft_R.view(N * D, H, H) + orth_rotate_butterfly = self.cayley_batch(boft_R) + orth_rotate_butterfly = orth_rotate_butterfly.view(N, D, H, H) + if self.fbd_cuda_available: + block_diagonal_butterfly = FastBlockDiag.apply(orth_rotate_butterfly) + else: + orth_rotate_butterfly = orth_rotate_butterfly.squeeze(0) + block_diagonal_butterfly = torch.block_diag(*torch.unbind(orth_rotate_butterfly)) + block_diagonal_butterfly = block_diagonal_butterfly.unsqueeze(0) + + boft_P = self.boft_P.to(block_diagonal_butterfly.device) + butterfly_oft_mat_batch = torch.bmm(block_diagonal_butterfly, boft_P.permute(0, 2, 1)) + butterfly_oft_mat_batch = torch.bmm(boft_P, butterfly_oft_mat_batch) + butterfly_oft_mat = butterfly_oft_mat_batch[0] + + for i in range(1, butterfly_oft_mat_batch.shape[0]): + butterfly_oft_mat = butterfly_oft_mat_batch[i] @ butterfly_oft_mat + + return butterfly_oft_mat, boft_s + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + boft_rotation = torch.eye( + self.in_features * self.base_layer.kernel_size[0] * self.base_layer.kernel_size[0], + device=x.device, + dtype=x.dtype, + ) + boft_scale = torch.ones((int(self.out_features), 1), device=x.device, dtype=x.dtype) + + for active_adapter in self.active_adapters: + if active_adapter not in self.boft_R.keys(): + continue + boft_R = self.boft_R[active_adapter] + boft_s = self.boft_s[active_adapter].transpose(0, 1) + dropout = self.boft_dropout[active_adapter] + + N, D, H, _ = boft_R.shape + boft_R = boft_R.view(N * D, H, H) + orth_rotate_butterfly = self.cayley_batch(boft_R) + orth_rotate_butterfly = orth_rotate_butterfly.view(N, D, H, H) + 
orth_rotate_butterfly = dropout(orth_rotate_butterfly) + if self.fbd_cuda_available: + block_diagonal_butterfly = FastBlockDiag.apply(orth_rotate_butterfly) + else: + orth_rotate_butterfly = orth_rotate_butterfly.squeeze(0) + block_diagonal_butterfly = torch.block_diag(*torch.unbind(orth_rotate_butterfly)) + block_diagonal_butterfly = block_diagonal_butterfly.unsqueeze(0) + + boft_P = self.boft_P.to(x) + block_diagonal_butterfly = block_diagonal_butterfly.to(x) + butterfly_oft_mat_batch = torch.bmm(block_diagonal_butterfly, boft_P.permute(0, 2, 1)) + butterfly_oft_mat_batch = torch.bmm(boft_P, butterfly_oft_mat_batch) + butterfly_oft_mat = butterfly_oft_mat_batch[0] + + for i in range(1, butterfly_oft_mat_batch.shape[0]): + butterfly_oft_mat = butterfly_oft_mat_batch[i] @ butterfly_oft_mat + + boft_rotation = butterfly_oft_mat @ boft_rotation + boft_scale = boft_s * boft_scale + + x = x.to(self.base_layer.weight.data.dtype) + + orig_weight = self.base_layer.weight.data + orig_weight = orig_weight.view( + self.out_features, + self.in_features * self.base_layer.kernel_size[0] * self.base_layer.kernel_size[0], + ) + orig_weight = torch.transpose(orig_weight, 0, 1) + rotated_weight = torch.mm(boft_rotation, orig_weight) + rotated_weight = torch.transpose(rotated_weight, 0, 1) + + scaled_rotated_weight = rotated_weight * boft_scale + + scaled_rotated_weight = scaled_rotated_weight.view( + self.out_features, self.in_features, self.base_layer.kernel_size[0], self.base_layer.kernel_size[0] + ) + x = self._cast_input_dtype(x, scaled_rotated_weight.dtype) + bias = self._cast_input_dtype(self.base_layer.bias, scaled_rotated_weight.dtype) + result = F.conv2d( + input=x, + weight=scaled_rotated_weight, + bias=bias, + padding=self.base_layer.padding[0], + stride=self.base_layer.stride[0], + ) + + result = result.to(previous_dtype) + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "boft." 
class BOFTModel(BaseTuner):
    """
    Creates BOFT and OFT model from a pretrained transformers model. Papers:
    https://huggingface.co/papers/2311.06243 and https://huggingface.co/papers/2306.07280

    Args:
        model ([`transformers.PreTrainedModel`]): The model to be adapted.
        config ([`BOFTConfig`]): The configuration of the BOFT model.
        adapter_name (`str`): The name of the adapter, defaults to `"default"`.
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            Create empty adapter weights on meta device. Useful to speed up the loading process.

    Returns:
        `torch.nn.Module`: The BOFT model.

    Example:
        ```py
        >>> import transformers
        >>> from peft import BOFTConfig, get_peft_model

        >>> config = BOFTConfig(
        ...     boft_block_size=8,
        ...     boft_n_butterfly_factor=1,
        ...     target_modules=["query", "value", "key", "output.dense", "mlp.fc1", "mlp.fc2"],
        ...     boft_dropout=0.1,
        ...     bias="boft_only",
        ...     modules_to_save=["classifier"],
        ... )

        >>> model = transformers.Dinov2ForImageClassification.from_pretrained(
        ...     "facebook/dinov2-large",
        ...     num_labels=100,
        ... )
        >>> boft_model = get_peft_model(model, config)
        ```

    **Attributes**:
        - **model** ([`transformers.PreTrainedModel`]) -- The model to be adapted.
        - **peft_config** ([`BOFTConfig`]): The configuration of the BOFT model.
    """

    prefix: str = "boft_"
    tuner_layer_cls = BOFTLayer
    target_module_mapping = TRANSFORMERS_MODELS_TO_BOFT_TARGET_MODULES_MAPPING

    def _create_and_replace(
        self,
        boft_config,
        adapter_name,
        target,
        target_name,
        parent,
        current_key,
        **optional_kwargs,
    ):
        """Wrap `target` in a BOFT layer, or add `adapter_name` to it if it already is one.

        Args:
            boft_config: Configuration object providing the `boft_*`, `fan_in_fan_out` and `init_weights` settings.
            adapter_name: Name of the adapter being added.
            target: The module found under `current_key`.
            target_name: Attribute name of `target` on `parent`.
            parent: Module that owns `target`.
            current_key: Key of `target` in the model; must not be `None`.

        Raises:
            ValueError: If `current_key` is `None`.
        """
        if current_key is None:
            raise ValueError("Current Key shouldn't be `None`")

        bias = hasattr(target, "bias") and target.bias is not None
        kwargs = {
            "boft_block_size": boft_config.boft_block_size,
            "boft_block_num": boft_config.boft_block_num,
            "boft_n_butterfly_factor": boft_config.boft_n_butterfly_factor,
            "boft_dropout": boft_config.boft_dropout,
            "fan_in_fan_out": boft_config.fan_in_fan_out,
            "init_weights": boft_config.init_weights,
        }
        kwargs["bias"] = bias

        # If it is not a BOFTLayer, create a new module, else update it with new adapters
        if not isinstance(target, BOFTLayer):
            new_module = self._create_new_module(boft_config, adapter_name, target, **kwargs)
            if adapter_name not in self.active_adapters:
                # adding an additional adapter: it is not automatically trainable
                new_module.requires_grad_(False)
            self._replace_module(parent, target_name, new_module, target)
        else:
            target.update_layer(
                adapter_name,
                boft_block_size=boft_config.boft_block_size,
                boft_block_num=boft_config.boft_block_num,
                boft_n_butterfly_factor=boft_config.boft_n_butterfly_factor,
                boft_dropout=boft_config.boft_dropout,
                init_weights=boft_config.init_weights,
            )

    @staticmethod
    def _create_new_module(boft_config, adapter_name, target, **kwargs):
        """Build the BOFT wrapper matching the type of `target`'s base layer.

        `torch.nn.Linear` base layers get a `Linear` wrapper and `torch.nn.Conv2d` base layers a `Conv2d` wrapper.
        For linear layers `fan_in_fan_out=True` is not applicable: a warning is emitted and the flag is reset to
        `False` both in `kwargs` and on `boft_config` itself (the config object is mutated).

        Raises:
            ValueError: If the base layer is neither `torch.nn.Linear` nor `torch.nn.Conv2d`.
        """
        if isinstance(target, BaseTunerLayer):
            target_base_layer = target.get_base_layer()
        else:
            target_base_layer = target

        if isinstance(target_base_layer, torch.nn.Linear):
            if kwargs["fan_in_fan_out"]:
                warnings.warn(
                    "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
                    "Setting fan_in_fan_out to False."
                )
                kwargs["fan_in_fan_out"] = boft_config.fan_in_fan_out = False
            new_module = Linear(target, adapter_name, **kwargs)
        elif isinstance(target_base_layer, torch.nn.Conv2d):
            new_module = Conv2d(target, adapter_name, **kwargs)
        else:
            raise ValueError(
                f"Target module {target} is not supported. "
                "Currently, only `torch.nn.Linear` and `torch.nn.Conv2d` are supported."
            )

        return new_module
@dataclass
class BoneConfig(PeftConfig):
    """
    This is the configuration class to store the configuration of a [`BoneModel`].

    Args:
        r (`int`):
            The rank of Bone across different layers. It is best to set 'r' to an even number; otherwise, the default
            initialization method will not work.
        target_modules (`Optional[Union[List[str], str]]`):
            The names of the modules to apply the adapter to. If this is specified, only the modules with the specified
            names will be replaced. When passing a string, a regex match will be performed. When passing a list of
            strings, either an exact match will be performed or it is checked if the name of the module ends with any
            of the passed strings. If this is specified as 'all-linear', then all linear modules are chosen, excluding
            the output layer. If this is not specified, modules will be chosen according to the model architecture. If
            the architecture is not known, an error will be raised -- in this case, you should specify the target
            modules manually.
        exclude_modules (`Optional[Union[List[str], str]]`):
            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
            module ends with any of the passed strings.
        init_weights (bool | Literal["bat"]):
            Different initializations correspond to different Bone variants. By default, setting True uses the Bone
            structure, while "bat" selects the Bat structure. Setting False keeps the Bone structure but initializes
            the adapter block randomly instead of with zeros.
        layers_to_transform (`Union[List[int], int]`):
            The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices
            that are specified in this list. If a single integer is passed, it will apply the transformations on the
            layer at this index.
        layers_pattern (`str`):
            The layer pattern name, used only if `layers_to_transform` is different from `None`.
        bias (`str`):
            Bias type for Bone. Can be 'none', 'all' or 'bone_only'.
        modules_to_save (`List[str]`):
            List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint.
    """

    r: int = field(
        default=64,
        metadata={
            "help": "The rank of Bone across different layers.",
            "note": "It is best to set 'r' to an even number; otherwise, the default initialization method will not work.",
        },
    )

    target_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": "List of module names or regex expression of the module names to replace with Bone.",
            "example": "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' ",
        },
    )
    exclude_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={"help": "List of module names or regex expression of the module names to exclude from Bone."},
    )
    init_weights: bool | Literal["bat"] = field(
        default=True,
        metadata={
            "help": (
                "Whether to initialize the weights of the Bone layers with their default initialization. Don't change "
                "this setting, except if you know exactly what you're doing."
            ),
        },
    )
    layers_to_transform: Optional[Union[list[int], int]] = field(
        default=None,
        metadata={
            "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index."
        },
    )
    layers_pattern: Optional[str] = field(
        default=None,
        metadata={
            "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern."
        },
    )
    bias: str = field(default="none", metadata={"help": "Bias type for Bone. Can be 'none', 'all' or 'bone_only'"})
    modules_to_save: Optional[list[str]] = field(
        default=None,
        metadata={
            "help": "List of modules apart from Bone layers to be set as trainable and saved in the final checkpoint. "
            "For example, in Sequence Classification or Token Classification tasks, "
            "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
        },
    )

    def __post_init__(self):
        """Normalize the module lists, validate option combinations and warn about the Bone deprecation.

        Raises:
            ValueError: If `target_modules` is a string (regex) while `layers_to_transform` or `layers_pattern` is
                set.
        """
        super().__post_init__()
        self.peft_type = PeftType.BONE
        # Lists are converted to sets; strings (regex patterns) and None are kept as they are.
        self.target_modules = (
            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
        )
        self.exclude_modules = (
            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
        )
        # if target_modules is a regex expression, then layers_to_transform should be None
        if isinstance(self.target_modules, str) and self.layers_to_transform is not None:
            raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.")

        # if target_modules is a regex expression, then layers_pattern should be None
        if isinstance(self.target_modules, str) and self.layers_pattern is not None:
            raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.")

        # The message previously stopped mid-sentence after "to convert it into "; it now names the conversion target.
        warnings.warn(
            "Bone will be removed in v0.19.0 of PEFT, use `MissConfig` instead. "
            "If you already have a Bone checkpoint, you can use `/scripts/convert-bone-to-miss.py` to convert it into "
            "a MiSS checkpoint."
        )
class BoneLayer(BaseTunerLayer):
    """Holds the per-adapter Bone state (`bone_block` weights and `bone_r` ranks) for a wrapped base layer.

    Only `nn.Linear` base layers are supported.
    """

    # All names of layers that may contain (trainable) adapter weights
    adapter_layer_names = ("bone_block",)
    # All names of other parameters that may contain adapter-related parameters
    other_param_names = ("bone_r",)

    def __init__(self, base_layer: nn.Module, **kwargs) -> None:
        """Store the base layer and read its feature sizes.

        Raises:
            ValueError: If the (unwrapped) base layer is not an `nn.Linear`.
        """
        self.base_layer = base_layer
        self.bone_r = {}
        self.bone_block = nn.ParameterDict({})
        # Mark the weight as unmerged
        self._disable_adapters = False
        self.merged_adapters = []
        # flag to enable/disable casting of input to weight dtype during forward call
        self.cast_input_dtype_enabled = True
        self.kwargs = kwargs

        base_layer = self.get_base_layer()
        if isinstance(base_layer, nn.Linear):
            self.in_features, self.out_features = base_layer.in_features, base_layer.out_features
        else:
            raise ValueError(f"Unsupported layer type {type(base_layer)}")

    def update_layer(
        self,
        adapter_name: str,
        r: int,
        init_weights: Union[bool, str],
        inference_mode: bool = False,
        **kwargs,
    ) -> None:
        """Internal function to create bone adapter

        Args:
            adapter_name (`str`): Name for the adapter to add.
            r (`int`): Rank for the added adapter.
            init_weights (`bool` or `"bat"`):
                `True` creates a zero block of shape `(r, out_features)`; `False` initializes that block with
                `kaiming_uniform_`; `"bat"` creates a zero block of shape `(out_features // r, r, r)`.
            inference_mode (`bool`): Forwarded to `set_adapter`.

        Raises:
            ValueError: If `r` is not positive, or if `"bat"` is requested and `in_features` or `out_features` is not
                a multiple of `r`.
            TypeError: If the base layer is not an `nn.Linear`.
        """
        if r <= 0:
            raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")

        self.bone_r[adapter_name] = r

        # Determine shape of Bone weights
        base_layer = self.get_base_layer()
        if isinstance(base_layer, nn.Linear):
            self.bone_block[adapter_name] = nn.Parameter(torch.zeros(r, self.out_features), requires_grad=True)

        else:
            raise TypeError(f"Bone is not implemented for base layers of type {type(base_layer).__name__}")

        # Initialize weights
        if init_weights == "bat":
            if self.in_features % r != 0 or self.out_features % r != 0:
                raise ValueError("The weight matrix must be fully divisible into [r, r] blocks.")
            self.reset_bat_parameters(adapter_name, r)
        elif init_weights:
            self.reset_bone_parameters(adapter_name, r)
        else:
            self.reset_bone_parameters_random(adapter_name)
        # Move new weights to device
        self._move_adapter_to_device_of_base_layer(adapter_name)
        self.set_adapter(self.active_adapters, inference_mode=inference_mode)

    def reset_bone_parameters(self, adapter_name: str, r):
        """Replace the adapter block with zeros of shape `(r, out_features)`."""
        self.bone_block[adapter_name] = nn.Parameter(torch.zeros(r, self.out_features), requires_grad=True)

    def reset_bat_parameters(self, adapter_name: str, r):
        """Replace the adapter block with zeros of shape `(out_features // r, r, r)`."""
        self.bone_block[adapter_name] = nn.Parameter(torch.zeros(self.out_features // r, r, r), requires_grad=True)

    def reset_bone_parameters_random(self, adapter_name: str):
        """Fill the existing adapter block in place with `kaiming_uniform_` values."""
        nn.init.kaiming_uniform_(self.bone_block[adapter_name], a=math.sqrt(5))

    def scale_layer(self, scale: float) -> None:
        """Scaling is not supported for Bone: warn once per active Bone adapter unless `scale == 1`."""
        if scale == 1:
            return

        for active_adapter in self.active_adapters:
            if active_adapter not in self.bone_block.keys():
                continue

            warnings.warn("Scaling operation for Bone not supported! Automatically set scale to 1.")

    def unscale_layer(self, scale=None) -> None:
        """Unscaling is not supported for Bone: warn once per active Bone adapter."""
        for active_adapter in self.active_adapters:
            if active_adapter not in self.bone_block.keys():
                continue

            warnings.warn("Unscaling operation for Bone not supported! Keeping scale at 1.")


class BoneLinear(nn.Module, BoneLayer):
    """
    Bone implemented in a dense layer.
    """

    def __init__(
        self,
        base_layer,
        adapter_name: str,
        r: int = 0,
        init_weights: Union[bool, str] = True,
        **kwargs,
    ) -> None:
        super().__init__()
        BoneLayer.__init__(self, base_layer, **kwargs)
        self._active_adapter = adapter_name
        self.update_layer(adapter_name, r, init_weights, **kwargs)
        # The variant ("bat" vs. plain Bone) is recorded once per layer from the first adapter's `init_weights`.
        self.bone_fn = init_weights

    def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
        """
        Merge the active adapter weights into the base weights

        Args:
            safe_merge (`bool`, *optional*):
                If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs
                before merging the weights. This is useful if you want to check if the merge operation will produce
                NaNs. Defaults to `False`.
            adapter_names (`List[str]`, *optional*):
                The list of adapter names that should be merged. If `None`, all active adapters will be merged.
                Defaults to `None`.

        Raises:
            ValueError: If `safe_merge` is `True` and the merged weights contain non-finite values.
        """
        adapter_names = check_adapters_to_merge(self, adapter_names)
        if not adapter_names:
            # no adapter to merge
            return

        for active_adapter in adapter_names:
            if active_adapter in self.bone_block.keys():
                base_layer = self.get_base_layer()
                orig_dtype = base_layer.weight.dtype
                if safe_merge:
                    # Note that safe_merge will be slower than the normal merge
                    # because of the copy operation.
                    orig_weight = base_layer.weight.data.clone()
                    if self.bone_fn == "bat":
                        delta_weight = self.get_delta_weight(active_adapter, orig_weight)
                        orig_weight += delta_weight
                    else:
                        # `get_delta_weight_bone` writes into its argument when `in_features` is not a multiple of
                        # `r`, so it must be given the clone: the base weight has to stay untouched until the
                        # finiteness check below has passed.
                        orig_weight = self.get_delta_weight_bone(active_adapter, orig_weight)

                    if not torch.isfinite(orig_weight).all():
                        raise ValueError(
                            f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
                        )

                    base_layer.weight.data = orig_weight.to(orig_dtype)
                else:
                    if self.bone_fn == "bat":
                        delta_weight = self.get_delta_weight(active_adapter, base_layer.weight.data)
                        base_layer.weight.data += delta_weight.to(orig_dtype)
                    else:
                        # For plain Bone the helper returns the full merged weight, not a delta.
                        delta_weight = self.get_delta_weight_bone(active_adapter, base_layer.weight.data)
                        base_layer.weight.data = delta_weight.to(orig_dtype)
                self.merged_adapters.append(active_adapter)

    def unmerge(self) -> None:
        """
        This method unmerges all merged adapter layers from the base weights.
        """
        if not self.merged:
            warnings.warn("Already unmerged. Nothing to do.")
            return

        while len(self.merged_adapters) > 0:
            active_adapter = self.merged_adapters.pop()
            base_layer = self.get_base_layer()
            orig_dtype = base_layer.weight.dtype
            if active_adapter in self.bone_block.keys():
                orig_weight = self.get_base_layer().weight.data.clone()
                # With `re=True` both helpers return the restored base weight rather than a delta.
                if self.bone_fn == "bat":
                    delta_weight = self.get_delta_weight(active_adapter, orig_weight, re=True)
                else:
                    delta_weight = self.get_delta_weight_bone(active_adapter, orig_weight, re=True)

                base_layer.weight.data = delta_weight.to(orig_dtype)

    def get_delta_weight(self, adapter, orig_weight, re: bool = False) -> torch.Tensor:
        """
        Compute the "bat" weight update for the given adapter.

        `orig_weight` is viewed as a grid of `[r, r]` blocks `W`. With the adapter block `B`:

        - `re=False`: returns the delta `W @ B + B`, which the caller adds to the weight.
        - `re=True`: returns the restored weight `(W - B) @ inverse(I + B)`, i.e. the inverse of the merge.

        Args:
            adapter (str):
                The name of the adapter for which the delta weight should be computed.
            orig_weight (torch.Tensor):
                The weight matrix the adapter is applied to (merged weight when `re=True`).
            re (bool):
                Whether to revert a previous merge instead of computing the delta.
        """
        device = self.bone_block[adapter].device
        dtype = self.bone_block[adapter].dtype
        # In case users wants to merge the adapter weights that are in
        # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
        # (b)float16 because some CPUs have slow bf16/fp16 matmuls.
        cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16)

        weight_bone = self.bone_block[adapter]

        if cast_to_fp32:
            weight_bone = weight_bone.float()
        orig_weight = orig_weight.to(weight_bone.dtype)

        r = weight_bone.size(-1)
        if re:
            o = orig_weight.reshape(orig_weight.size(0) // r, r, orig_weight.size(1) // r, r).permute(2, 0, 1, 3)
            one = torch.eye(weight_bone.size(-1)).to(weight_bone.device)
            # inverse must be in float32, after that the dtype can be adjusted if needed
            inv_I_plus_b = torch.inverse(one + weight_bone)
            inv_I_plus_b = inv_I_plus_b.to(weight_bone.dtype)
            w = (o - weight_bone) @ inv_I_plus_b
            output_tensor = w.permute(1, 2, 0, 3).reshape(*orig_weight.shape)
        else:
            w = (
                orig_weight.reshape(orig_weight.size(0) // r, r, orig_weight.size(1) // r, r).permute(2, 0, 1, 3)
                @ weight_bone
                + weight_bone
            )
            output_tensor = w.permute(1, 2, 0, 3).reshape(*orig_weight.shape)

        if cast_to_fp32:
            output_tensor = output_tensor.to(dtype=dtype)

            # cast back the weights
            self.bone_block[adapter].data = weight_bone.to(dtype)

        return output_tensor

    def get_delta_weight_bone(self, adapter, orig_weight, re: bool = False) -> torch.Tensor:
        """
        Compute the merged (or, with `re=True`, the restored) weight for a plain Bone adapter.

        Despite the name, the return value is the full weight with the adapter block added (`re=False`) or
        subtracted (`re=True`), not a delta.

        NOTE: when `in_features` is not a multiple of `r`, `orig_weight` is modified in place and returned; pass a
        clone if the original tensor must be preserved.

        Args:
            adapter (str):
                The name of the adapter for which the delta weight should be computed.
            orig_weight (torch.Tensor):
                The weight matrix to apply the adapter to; its last dimension is treated as `in_features`.
            re (bool):
                Whether to subtract the adapter block (revert a merge) instead of adding it.
        """
        device = self.bone_block[adapter].device
        dtype = self.bone_block[adapter].dtype
        # In case users wants to merge the adapter weights that are in
        # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
        # (b)float16 because some CPUs have slow bf16/fp16 matmuls.
        cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16)

        weight_bone = self.bone_block[adapter]

        if cast_to_fp32:
            weight_bone = weight_bone.float()

        in_features = orig_weight.size(-1)
        r = weight_bone.size(0)
        if in_features % r != 0:
            # The last `last_size` input columns do not fill a whole block of width r; they are handled separately
            # using only the first `last_size` rows of the adapter block.
            last_size = in_features % r
            n_block = in_features // r
            n_block_size = n_block * r

            if re:
                orig_weight[:, :n_block_size] = (
                    (orig_weight[:, :n_block_size].reshape(-1, n_block, r).permute(1, 2, 0) - weight_bone)
                    .permute(2, 0, 1)
                    .reshape(*orig_weight[:, :n_block_size].shape)
                )
                orig_weight[:, n_block_size:] = (
                    orig_weight[:, n_block_size:] - (weight_bone.transpose(0, 1))[:, :last_size]
                )
            else:
                orig_weight[:, :n_block_size] = (
                    (orig_weight[:, :n_block_size].reshape(-1, n_block, r).permute(1, 2, 0) + weight_bone)
                    .permute(2, 0, 1)
                    .reshape(*orig_weight[:, :n_block_size].shape)
                )
                orig_weight[:, n_block_size:] = (
                    orig_weight[:, n_block_size:] + (weight_bone.transpose(0, 1))[:, :last_size]
                )
            output_tensor = orig_weight

        else:
            if re:
                w = orig_weight.reshape(-1, orig_weight.size(1) // r, r).permute(1, 2, 0) - weight_bone
                output_tensor = w.permute(2, 0, 1).reshape(*orig_weight.shape)
            else:
                w = orig_weight.reshape(-1, orig_weight.size(1) // r, r).permute(1, 2, 0) + weight_bone
                output_tensor = w.permute(2, 0, 1).reshape(*orig_weight.shape)

        if cast_to_fp32:
            output_tensor = output_tensor.to(dtype=dtype)

            # cast back the weights
            self.bone_block[adapter].data = weight_bone.to(dtype)

        return output_tensor

    def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
        """Run the base layer with the active Bone adapters applied.

        If adapters are disabled (unmerging first when necessary) or already merged, the base layer is called
        directly. Otherwise the "bat" variant builds an adapted copy of the weight and applies `F.linear`, while plain
        Bone adds `sum_over_blocks(x) @ bone_block` to the base output. The result is cast back to the input dtype.
        """
        previous_dtype = x.dtype

        if self.disable_adapters:
            if self.merged:
                self.unmerge()
            result = self.base_layer(x, *args, **kwargs)
        elif self.merged:
            result = self.base_layer(x, *args, **kwargs)
        else:
            if self.bone_fn == "bat":
                orig_weight = self.base_layer.weight.data.clone()
                for active_adapter in self.active_adapters:
                    if active_adapter not in self.bone_block.keys():
                        continue
                    delta_weight = self.get_delta_weight(active_adapter, orig_weight)
                    orig_weight = orig_weight + delta_weight

                x = self._cast_input_dtype(x, orig_weight.dtype)
                bias = self._cast_input_dtype(self.base_layer.bias, orig_weight.dtype)
                result = F.linear(input=x, weight=orig_weight, bias=bias)
            else:
                result = self.base_layer(x, *args, **kwargs)
                for active_adapter in self.active_adapters:
                    if active_adapter not in self.bone_block.keys():
                        continue
                    bone = self.bone_block[active_adapter]
                    r = bone.size(0)
                    if x.size(-1) % r != 0:
                        # Zero-pad the feature dimension up to a multiple of r; the padding adds nothing to the sum.
                        padding_size = (r - x.size(-1) % r) % r
                        x = F.pad(x, (0, padding_size))
                    x = self._cast_input_dtype(x, bone.dtype)
                    result = result + torch.sum(x.reshape(*x.shape[:-1], x.size(-1) // r, r), dim=-2) @ bone

        result = result.to(previous_dtype)
        return result

    def __repr__(self) -> str:
        rep = super().__repr__()
        return "bone." + rep
class BoneModel(BaseTuner):
    """
    Creates a Block Affine (Bone) model from a pretrained model. The method is described in
    https://huggingface.co/papers/2409.15371

    Args:
        model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached.
        config ([`BoneConfig`]): The configuration of the Bone model.
        adapter_name (`str`): The name of the adapter, defaults to `"default"`.
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            Create empty adapter weights on meta device. Useful to speed up the loading process.

    Returns:
        `torch.nn.Module`: The Bone model.

    Example:
        ```py
        >>> from diffusers import StableDiffusionPipeline
        >>> from peft import BoneModel, BoneConfig

        >>> config_te = BoneConfig(
        ...     r=8,
        ...     target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"],
        ...     init_weights=True,
        ... )
        >>> config_unet = BoneConfig(
        ...     r=8,
        ...     target_modules=[
        ...         "proj_in",
        ...         "proj_out",
        ...         "to_k",
        ...         "to_q",
        ...         "to_v",
        ...         "to_out.0",
        ...         "ff.net.0.proj",
        ...         "ff.net.2",
        ...     ],
        ...     init_weights=True,
        ... )

        >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        >>> model.text_encoder = BoneModel(model.text_encoder, config_te, "default")
        >>> model.unet = BoneModel(model.unet, config_unet, "default")
        ```

    **Attributes**:
        - **model** ([`~torch.nn.Module`]) -- The model to be adapted.
        - **peft_config** ([`BoneConfig`]): The configuration of the Bone model.
    """

    prefix: str = "bone_"
    tuner_layer_cls = BoneLayer
    target_module_mapping = TRANSFORMERS_MODELS_TO_BONE_TARGET_MODULES_MAPPING

    def _create_and_replace(
        self,
        bone_config,
        adapter_name,
        target,
        target_name,
        parent,
        current_key,
        **optional_kwargs,
    ):
        """Wrap `target` in a Bone layer, or add `adapter_name` to it if it already is one.

        Args:
            bone_config: Configuration object providing `r` and `init_weights`.
            adapter_name: Name of the adapter being added.
            target: The module found under `current_key`.
            target_name: Attribute name of `target` on `parent`.
            parent: Module that owns `target`.
            current_key: Key of `target` in the model; must not be `None`.

        Raises:
            ValueError: If `current_key` is `None`.
        """
        if current_key is None:
            raise ValueError("Current Key shouldn't be `None`")

        bias = hasattr(target, "bias") and target.bias is not None
        kwargs = {
            "r": bone_config.r,
            "init_weights": bone_config.init_weights,
        }
        kwargs["bias"] = bias

        # If it is not a BoneLayer, create a new module, else update it with new adapters
        if not isinstance(target, BoneLayer):
            new_module = self._create_new_module(bone_config, adapter_name, target, **kwargs)
            if adapter_name not in self.active_adapters:
                # adding an additional adapter: it is not automatically trainable
                new_module.requires_grad_(False)
            self._replace_module(parent, target_name, new_module, target)
        else:
            target.update_layer(
                adapter_name,
                r=bone_config.r,
                init_weights=bone_config.init_weights,
            )

    @staticmethod
    def _create_new_module(bone_config, adapter_name, target, **kwargs):
        """Build a `BoneLinear` wrapper around `target`.

        Raises:
            ValueError: If the base layer of `target` is not a `torch.nn.Linear`.
        """
        if isinstance(target, BaseTunerLayer):
            target_base_layer = target.get_base_layer()
        else:
            target_base_layer = target

        if isinstance(target_base_layer, torch.nn.Linear):
            new_module = BoneLinear(target, adapter_name, **kwargs)
        else:
            raise ValueError(
                f"Target module {target} is not supported. Currently, only `torch.nn.Linear` is supported."
            )

        return new_module
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from peft.utils import register_peft_method + +from .config import C3AConfig +from .layer import C3ALayer, C3ALinear +from .model import C3AModel + + +__all__ = ["C3AConfig", "C3ALayer", "C3ALinear", "C3AModel"] + +register_peft_method(name="c3a", model_cls=C3AModel, config_cls=C3AConfig) diff --git a/peft/src/peft/tuners/c3a/config.py b/peft/src/peft/tuners/c3a/config.py new file mode 100644 index 0000000000000000000000000000000000000000..2059c77a417f9f4eaa54f9d1f73943836646d328 --- /dev/null +++ b/peft/src/peft/tuners/c3a/config.py @@ -0,0 +1,137 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@dataclass
class C3AConfig(PeftConfig):
    """This is the configuration class to store the configuration of a [`C3AModel`].

    Args:
        block_size (`int`):
            Block size for C3A. Both the input size and the output size of every target layer must be divisible by
            it (the layer raises a `ValueError` otherwise). If you have no idea what block_size you should use, set
            it to the greatest common divisor of all input & output sizes of your target layers. Increasing this
            results in fewer parameters.
        target_modules (`Union[list[str],str]`): The names of the modules to apply C3A to.
        bias (`str`): Bias type for C3A. Can be 'none', 'all' or 'c3a_only'. If 'all' or 'c3a_only', the
            corresponding biases will be updated during training. Be aware that this means that, even when disabling
            the adapters, the model will not produce the same output as the base model would have without adaptation.
        modules_to_save (`list[str]`): List of modules apart from C3A layers to be set as trainable
            and saved in the final checkpoint.
        layers_to_transform (`Union[list[int],int]`):
            The layer indexes to transform. If this argument is specified, it will apply C3A on the layer indexes
            that are specified in this list. If a single integer is passed, it will apply C3A on the layer at this
            index.
        layers_pattern (`str`):
            The layer pattern name, used only if `layers_to_transform` is different from `None` and if the layer
            pattern is not in the common layers pattern.
        block_size_pattern (`dict`):
            The mapping from layer names or regexp expression to block_size which are different from the default
            specified. For example, `{"model.decoder.layers.0.encoder_attn.k_proj": 1280}`.
        init_weights (`Union[bool, Literal["gaussian", "kaiming_uniform", "xavier_uniform"]]`):
            Defaults to 'xavier_uniform'. Setting this to `False` also uses 'xavier_uniform'. To set the weights to
            zeros (thus making C3A a no-op), set the value to `True`.
    """

    block_size: int = field(
        default=256,
        metadata={
            "help": (
                "block size for C3A, must be divisible by both the input size and the output size of the target layer."
                " If you have no idea what block_size you should use, set it to the greatest common divisor of all"
                " input & output sizes of your target layers. Increasing this would result in less parameters."
            )
        },
    )
    target_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": (
                "list of module names or regex expression of the module names to replace with C3A."
                " For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
            )
        },
    )
    bias: str = field(default="none", metadata={"help": "Bias type for C3A. Can be 'none', 'all' or 'c3a_only'"})
    modules_to_save: Optional[list[str]] = field(
        default=None,
        metadata={
            "help": (
                "list of modules apart from C3A layers to be set as trainable and saved in the final checkpoint."
                " For example, in Sequence Classification or Token Classification tasks,"
                " the final layer `classifier/score` are randomly initialized"
                " and as such need to be trainable and saved."
            )
        },
    )
    layers_to_transform: Optional[Union[list[int], int]] = field(
        default=None,
        metadata={
            "help": (
                "The layer indexes to transform, is this argument is specified,"
                " PEFT will transform only the layers indexes that are specified inside this list."
                " If a single integer is passed, PEFT will transform only the layer at this index."
                " This only works when target_modules is a list of str."
            )
        },
    )
    layers_pattern: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": (
                "The layer pattern name, used only if `layers_to_transform` is different to None"
                " and if the layer pattern is not in the common layers pattern."
                " This only works when target_modules is a list of str."
            )
        },
    )
    # NOTE: the default is an empty dict (via `default_factory`), not `None`.
    block_size_pattern: Optional[dict] = field(
        default_factory=dict,
        metadata={
            "help": (
                "The mapping from layer names or regexp expression to block_size"
                " which are different from the default specified."
                " For example, `{model.decoder.layers.0.encoder_attn.k_proj: 1280`}"
            )
        },
    )
    init_weights: Optional[Union[bool, Literal["gaussian", "kaiming_uniform", "xavier_uniform"]]] = field(
        default="xavier_uniform",
        metadata={
            "help": (
                "Defaults to 'xavier_uniform'. Setting this to `False` also uses 'xavier_uniform'. To set the weights "
                "to zeros (thus making C3A a no-op), set the value to `True`."
            )
        },
    )

    def __post_init__(self):
        """Finalize the config: tag the PEFT type, normalize `target_modules` and validate option combinations.

        Raises:
            ValueError: If `target_modules` is a single string (regex) while `layers_to_transform` or
                `layers_pattern` is set.
        """
        super().__post_init__()
        self.peft_type = PeftType.C3A
        # A list of module names is stored as a set; a string (regex) or None is kept unchanged.
        self.target_modules = (
            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
        )
        # if target_modules is a regex expression, then layers_to_transform should be None
        if isinstance(self.target_modules, str) and self.layers_to_transform is not None:
            raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.")

        # if target_modules is a regex expression, then layers_pattern should be None
        if isinstance(self.target_modules, str) and self.layers_pattern is not None:
            raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.")
class C3ALayer(BaseTunerLayer):
    """Holds the per-adapter C3A state (block size and circulant kernel) for a wrapped base layer.

    Only `nn.Linear` base layers are supported; any other type raises a `ValueError` on construction.
    """

    # All names of layers that may contain (trainable) adapter weights
    adapter_layer_names = ("c3a_kernel",)
    # All names of other parameters that may contain adapter-related parameters
    other_param_names = ("block_size",)

    def __init__(self, base_layer: nn.Module, **kwargs) -> None:
        """Attach empty adapter containers to `base_layer` and record its feature sizes.

        Raises:
            ValueError: If the (unwrapped) base layer is not an `nn.Linear`.
        """
        self.base_layer = base_layer
        self.block_size = {}
        self.c3a_kernel = nn.ParameterDict({})
        # Mark the weight as unmerged
        self._disable_adapters = False
        self.merged_adapters = []
        self.kwargs = kwargs

        base_layer = self.get_base_layer()
        if isinstance(base_layer, nn.Linear):
            self.in_features, self.out_features = base_layer.in_features, base_layer.out_features
        else:
            raise ValueError(f"Unsupported layer type {type(base_layer)}")

    def get_delta_weight(self, adapter) -> torch.Tensor:
        """Return the dense weight update of `adapter`, in the dtype of the base layer weight.

        The kernel is expanded to its block-circulant matrix in float32 and divided by the size of the last
        dimension of the base weight, mirroring the scaling applied in `C3ALinear.forward`.

        Raises:
            ValueError: If `adapter` has no C3A kernel on this layer.
        """
        if adapter not in self.c3a_kernel.keys():
            raise ValueError(f"Adapter {adapter} not found.")
        base_layer_weight = self.get_base_layer().weight
        base_layer_weight_dtype = base_layer_weight.dtype
        c3a_kernel = self.c3a_kernel[adapter]

        # FFT is done in float32 (see the dtype note in `update_layer`), then cast back.
        delta_weight = get_circulant_fast(c3a_kernel.to(torch.float32)).to(base_layer_weight_dtype)
        return delta_weight / base_layer_weight.size(-1)

    def update_layer(self, adapter_name, block_size, init_weights, inference_mode: bool = False, **kwargs):
        """Create (or replace) the kernel of `adapter_name` and initialize it.

        The kernel has shape `(out_features // block_size, in_features // block_size, block_size)`.

        Raises:
            ValueError: If `block_size` is not positive or does not divide both `in_features` and `out_features`.
        """
        if block_size <= 0:
            raise ValueError(f"`block_size` should be a positive integer value but the value passed is {block_size}")
        if self.in_features % block_size != 0:
            raise ValueError(
                f"The block size should be a factor of the input size. However, the input size is {self.in_features} and the block size is {block_size}"
            )
        if self.out_features % block_size != 0:
            raise ValueError(
                f"The block size should be a factor of the output size. However, the output size is {self.out_features} and the block size is {block_size}"
            )

        self.block_size[adapter_name] = block_size

        weight = self.get_base_layer().weight
        self.c3a_kernel[adapter_name] = nn.Parameter(
            torch.zeros(
                self.out_features // block_size,
                self.in_features // block_size,
                block_size,
                # Currently, only fp32 is widely supported for FFT (fp16 is only supported on GPU with shapes of powers
                # of 2, bf16 lacks FFT support)
                dtype=torch.float32,
                device=weight.device,
            )
        )

        self.reset_c3a_parameters(adapter_name, init_weights)
        self._move_adapter_to_device_of_base_layer(adapter_name)
        self.set_adapter(self.active_adapters, inference_mode=inference_mode)

    @torch.no_grad()
    def reset_c3a_parameters(self, adapter_name, init_weights):
        """Initialize the kernel of `adapter_name` in place.

        `True` keeps the zero initialization (the adapter is a no-op); `False` and `"xavier_uniform"` use a
        Xavier-style uniform range computed from the layer's `in_features`/`out_features`; `"kaiming_uniform"`
        uses a range based on `in_features` only; `"gaussian"` draws from a standard normal.

        Raises:
            ValueError: If `init_weights` is none of the supported values.
        """
        if init_weights is True:
            return

        if adapter_name in self.c3a_kernel.keys():
            if init_weights == "gaussian":
                nn.init.normal_(self.c3a_kernel[adapter_name])
            elif init_weights in ["xavier_uniform", False]:
                fan_in, fan_out = self.in_features, self.out_features
                std = 1.0 * math.sqrt(2.0 / float(fan_in + fan_out))
                a = math.sqrt(3.0) * std
                nn.init.uniform_(self.c3a_kernel[adapter_name], -a, a)
            elif init_weights == "kaiming_uniform":
                fan_in = self.in_features
                a = 1.0 * math.sqrt(1.0 / float(fan_in))
                nn.init.uniform_(self.c3a_kernel[adapter_name], -a, a)
            else:
                raise ValueError(f"Unknown init_weights: {init_weights}")


class C3ALinear(nn.Module, C3ALayer):
    """C3A implemented in a dense (`nn.Linear`) layer."""

    def __init__(
        self,
        base_layer,
        adapter_name: str,
        block_size: int,
        init_weights: bool | Literal["gaussian", "kaiming_uniform", "xavier_uniform"],
        **kwargs,
    ) -> None:
        super().__init__()
        C3ALayer.__init__(self, base_layer, **kwargs)
        self._active_adapter = adapter_name
        self.update_layer(adapter_name, block_size, init_weights)

    def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
        """
        Merge the active adapter weights into the base weights

        Args:
            safe_merge (`bool`, *optional*):
                If True, the merge operation will be performed in a copy of the original weights and check for NaNs
                before merging the weights. This is useful if you want to check if the merge operation will produce
                NaNs. Defaults to `False`.
            adapter_names (`list[str]`, *optional*):
                The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
                to `None`.

        Raises:
            ValueError: If `safe_merge` is set and the merged weights contain non-finite values.
        """
        adapter_names = check_adapters_to_merge(self, adapter_names)
        if not adapter_names:
            # no adapter to merge
            return

        for active_adapter in adapter_names:
            if active_adapter in self.c3a_kernel.keys():
                base_layer = self.get_base_layer()
                if safe_merge:
                    # Note that safe_merge will be slower than the normal merge
                    # because of the copy operation.
                    orig_weights = base_layer.weight.data.clone()
                    delta_weight = self.get_delta_weight(active_adapter)
                    orig_weights = orig_weights + delta_weight

                    if not torch.isfinite(orig_weights).all():
                        raise ValueError(
                            f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
                        )

                    base_layer.weight.data = orig_weights
                else:
                    delta_weight = self.get_delta_weight(active_adapter)
                    base_layer.weight.data = base_layer.weight.data + delta_weight

                self.merged_adapters.append(active_adapter)

    def unmerge(self) -> None:
        """
        This method unmerges all merged adapter layers from the base weights.
        """
        if not self.merged:
            warnings.warn("Already unmerged. Nothing to do.")
            return
        while len(self.merged_adapters) > 0:
            active_adapter = self.merged_adapters.pop()
            if active_adapter in self.c3a_kernel.keys():
                self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter)

    def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
        """Apply the base layer and add the contribution of every active, unmerged C3A adapter.

        Each adapter is applied to the same (float32-cast) input and its output is added to the base layer
        result, which matches what `merge` does to the weights (`W + delta_1 + delta_2 + ...`).
        """
        previous_dtype = x.dtype

        if self.disable_adapters:
            if self.merged:
                self.unmerge()
            result = self.base_layer(x, *args, **kwargs)
        elif self.merged:
            result = self.base_layer(x, *args, **kwargs)
        else:
            result = self.base_layer(x, *args, **kwargs)
            x = x.to(torch.float32)
            for active_adapter in self.active_adapters:
                if active_adapter not in self.c3a_kernel.keys():
                    continue
                c3a_kernel = self.c3a_kernel[active_adapter].to(torch.float32)
                # Keep `x` untouched: every adapter must see the layer input, not the previous adapter's delta.
                delta = BlockCircularConvolution.apply(x, c3a_kernel) / x.size(-1)
                result += delta.to(result.dtype)

        result = result.to(previous_dtype)
        return result

    def __repr__(self) -> str:
        rep = super().__repr__()
        return "c3a." + rep
class C3AModel(BaseTuner):
    """
    Creates C3A model from a pretrained transformers model.

    The method is described in detail in [TODO].

    Args:
        model ([`torch.nn.Module`]): The model to be adapted.
        config ([`C3AConfig`]): The configuration of the C3A model.
        adapter_name (`str`): The name of the adapter, defaults to `"default"`.

    Returns:
        `torch.nn.Module`: The C3A model.

    **Attributes**:
        - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted.
        - **peft_config** ([`C3AConfig`]): The configuration of the C3A model.
    """

    prefix: str = "c3a_"
    tuner_layer_cls = C3ALayer
    target_module_mapping = TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING

    def _create_and_replace(
        self,
        c3a_config,
        adapter_name,
        target,
        target_name,
        parent,
        current_key,
        **optional_kwargs,
    ):
        """Add the adapter `adapter_name` to `target`, wrapping it in a C3A layer if it is not one already.

        The block size is taken from `c3a_config.block_size_pattern` when one of its keys matches the end of
        `current_key`, and from `c3a_config.block_size` otherwise.

        Raises:
            ValueError: If `current_key` is `None`.
        """
        if current_key is None:
            raise ValueError("Current Key shouldn't be `None`")
        # Regexp matching - Find key which matches current target_name in patterns provided
        pattern_keys = list(chain(c3a_config.block_size_pattern.keys()))
        target_name_key = next(filter(lambda key: re.match(rf".*\.{key}$", current_key), pattern_keys), current_key)

        block_size = c3a_config.block_size_pattern.get(target_name_key, c3a_config.block_size)
        kwargs = {
            "block_size": block_size,
            "init_weights": c3a_config.init_weights,
        }

        if isinstance(target, C3ALinear):
            # Already wrapped: only register the additional adapter on the existing layer.
            target.update_layer(
                adapter_name,
                block_size,
                c3a_config.init_weights,
            )
        else:
            new_module = self._create_new_module(c3a_config, adapter_name, target, **kwargs)
            if adapter_name != self.active_adapter:
                # adding an additional adapter: it is not automatically trainable
                new_module.requires_grad_(False)
            self._replace_module(parent, target_name, new_module, target)

    @staticmethod
    def _create_new_module(c3a_config, adapter_name, target, **kwargs):
        """Wrap `target` in a `C3ALinear`.

        Raises:
            ValueError: If the (unwrapped) target is not a `torch.nn.Linear`. Previously this case fell through
                to an unbound local variable.
        """
        if isinstance(target, BaseTunerLayer):
            target_base_layer = target.get_base_layer()
        else:
            target_base_layer = target

        if not isinstance(target_base_layer, torch.nn.Linear):
            raise ValueError(
                f"Target module {target} is not supported. Currently, only `torch.nn.Linear` is supported."
            )

        return C3ALinear(target, adapter_name, **kwargs)
def get_circulant_fast(w):
    """Expand a block kernel into the dense matrix applied by `BlockCircularConvolution`.

    Args:
        w: Kernel of shape `(m, n, b)`.

    Returns:
        A real tensor `C` of shape `(m * b, n * b)` such that, for an input `x` whose last dimension is
        `n * b`, `BlockCircularConvolution.apply(x, w)` equals `x @ C.T`.
    """
    m, n, b = w.shape
    # Push the identity through the forward transform: row i of the result is the response to basis vector i.
    x = torch.eye(n * b, dtype=w.dtype, device=w.device)
    x = x.reshape(*x.shape[:-1], n, b)
    x = torch.einsum("...nb,mnb->...mb", ifft(x), fft(w))
    x = fft(x).real.flatten(start_dim=1).T
    return x


class BlockCircularConvolution(Function):
    """FFT-based block-circular linear map with a hand-written backward pass.

    With `w` of shape `(m, n, b)` and `x` of shape `(..., n * b)`, the output has shape `(..., m * b)`.
    For each pair of blocks the forward computes `y[k] = sum_q x[(k + q) % b] * w[q]`, summed over the
    `n` input blocks.
    """

    @staticmethod
    def forward(ctx, x, w):
        m, n, b = w.shape
        x = x.reshape(*x.shape[:-1], n, b)
        # The block-reshaped input is saved; backward flattens the input gradient back to `(..., n * b)`.
        ctx.save_for_backward(x, w)
        x = torch.einsum("...nb,mnb->...mb", ifft(x), fft(w))
        x = fft(x).real
        x = x.reshape(*x.shape[:-2], -1)
        return x

    @staticmethod
    def backward(ctx, grad_output):
        x, w = ctx.saved_tensors
        m, n, b = w.shape
        grad_output = grad_output.reshape(*grad_output.shape[:-1], m, b)
        grad_output_fft = fft(grad_output)
        # Input gradient is the transpose of the forward map: sum_k g[k] * w[(p - k) % b], i.e. a circular
        # convolution of grad and kernel, ifft(fft(g) * fft(w)). The previous fft(fft(g) * ifft(w)) evaluates
        # sum_k g[k] * w[(p + k) % b] instead, which only coincides for block sizes <= 2.
        x_grad = ifft(torch.einsum("...mb,mnb->...nb", grad_output_fft, fft(w))).real
        x_grad = x_grad.reshape(*x_grad.shape[:-2], -1)
        # Kernel gradient: sum_k g[k] * x[(k + q) % b], accumulated over all leading (batch) dimensions.
        w_grad = fft(torch.einsum("...mb,...nb->mnb", grad_output_fft, ifft(x))).real
        return x_grad, w_grad
@dataclass
class CPTConfig(PromptLearningConfig):
    """
    CPT Configuration class extending PromptLearningConfig for Context-aware Prompt Tuning (CPT).

    This class introduces additional parameters required for CPT, such as:
    - Token type masks
    - Prompt tuning initialization
    - Loss weighting
    - Projection settings

    For more details, see the paper: https://huggingface.co/papers/2410.17222
    """

    # Token-related configurations
    # All three token fields are lists that must end up with the same length (checked in `__post_init__`).
    cpt_token_ids: Optional[list[int]] = field(
        default=None, metadata={"help": "Tensor of token IDs used for CPT prompts."}
    )
    cpt_mask: Optional[list[int]] = field(default=None, metadata={"help": "Tensor mask applied to CPT tokens."})
    cpt_tokens_type_mask: Optional[list[int]] = field(
        default=None, metadata={"help": "Mask indicating the type of each CPT token."}
    )

    # Loss-related configurations
    opt_weighted_loss_type: Optional[Literal["none", "decay"]] = field(
        default="none", metadata={"help": "Type of weighted loss: 'none' or 'decay'."}
    )
    opt_loss_decay_factor: Optional[float] = field(
        default=1.0, metadata={"help": "Factor for exponential decay in loss weighting."}
    )

    # Projection-related configurations
    opt_projection_epsilon: Optional[float] = field(
        default=0.1, metadata={"help": "Epsilon value for input projection."}
    )
    opt_projection_format_epsilon: Optional[float] = field(
        default=0.1, metadata={"help": "Epsilon value for format projection."}
    )

    # Tokenizer configuration
    tokenizer_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The tokenizer to use for prompt tuning initialization. Only used if prompt_tuning_init is `TEXT`"
        },
    )
    # Need to define CPT-specific static attributes
    is_prompt_learning = True  # Indicates that CPT is a prompt-learning method.

    def __post_init__(self):
        """
        Post-initialization hook to set additional attributes after the config is initialized.

        Forces `task_type` to `CAUSAL_LM` (emitting a `FutureWarning` if another value was given), fills in
        single-token defaults for the token fields that were left as `None`, and derives `num_virtual_tokens`
        from the length of `cpt_token_ids`.

        Raises:
            ValueError: If `cpt_token_ids`, `cpt_mask` and `cpt_tokens_type_mask` differ in length.
        """
        # CPT-specific static attributes
        self.is_prompt_learning = True  # Indicates that CPT is a prompt-learning method.
        self.num_layers = None  # Number of layers (optional, not always required).
        self.token_dim = None  # Dimension of token embeddings.
        self.num_attention_heads = None  # Number of attention heads (if applicable).
        self.num_transformer_submodules = 1  # Number of transformer submodules used.
        self.peft_type = PeftType.CPT  # Specifies that the PEFT type is CPT.
        if self.task_type != TaskType.CAUSAL_LM:
            # TODO: adjust this to raise an error with PEFT v0.18.0
            warnings.warn(
                f"{self.__class__.__name__} only supports task_type = {TaskType.CAUSAL_LM.value}, "
                "setting it automatically. This will raise an error starting from PEFT v0.18.0.",
                FutureWarning,
            )
            self.task_type = TaskType.CAUSAL_LM  # Ensures task type is causal language modeling.

        # Without explicit token ids, fall back to a single virtual token with id 0.
        if self.cpt_token_ids is None:
            self.cpt_token_ids = [0]

        self.num_virtual_tokens = len(self.cpt_token_ids)

        # Missing masks default to all ones, one entry per CPT token.
        if self.cpt_mask is None:
            self.cpt_mask = [1 for _ in self.cpt_token_ids]

        if self.cpt_tokens_type_mask is None:
            self.cpt_tokens_type_mask = [1 for _ in self.cpt_token_ids]

        if not (
            len(self.cpt_token_ids) == len(self.cpt_mask) == len(self.cpt_tokens_type_mask) == self.num_virtual_tokens
        ):
            raise ValueError("cpt_token_ids, cpt_mask and cpt_tokens_type_mask must have the same length.")
class CPTEmbedding(torch.nn.Module):
    """
    Embedding module for Context-aware Prompt Tuning (CPT).

    It keeps a prompt embedding table (seeded from the base word embeddings outside inference mode) plus a
    zero-initialized delta table. The delta is norm-projected on every forward pass, and its gradient is masked
    according to `config.cpt_tokens_type_mask`.
    """

    def __init__(self, config, word_embeddings):
        """
        Build the prompt and delta embedding tables.

        Args:
            config (Namespace):
                Configuration object; a deep copy is stored on the module.
            word_embeddings (torch.nn.Embedding):
                Base word embedding layer used to seed the prompt embeddings when not in inference mode.
        """
        super().__init__()
        self.config = copy.deepcopy(config)
        n_tokens, dim = config.num_virtual_tokens, config.token_dim

        # Prompt table; its weights are replaced below with copies of the base word embeddings
        self.embedding = torch.nn.Embedding(n_tokens, dim)

        if not config.inference_mode:
            assert config.num_virtual_tokens == len(config.cpt_token_ids)

            target_device = word_embeddings.weight.device
            seed_ids = torch.LongTensor(config.cpt_token_ids).to(target_device)
            with gather_params_ctx(word_embeddings.parameters()):
                seed_weights = word_embeddings(seed_ids).detach().clone()
            self.embedding.weight = torch.nn.Parameter(seed_weights.to(torch.float32))

        # Delta table starts at zero, so the initial output equals the prompt embeddings
        self.delta_embedding = torch.nn.Embedding(n_tokens, dim)
        self.delta_embedding.weight.data = torch.zeros_like(self.delta_embedding.weight).to(torch.float32)

        # Register the gradient-masking hook on the delta weights
        self.set_updated_tokens()

    def forward(self, indices):
        """
        Return prompt embeddings plus projected delta embeddings for `indices`.

        Args:
            indices (torch.Tensor):
                Indices of the tokens to be embedded.

        Returns:
            torch.Tensor:
                Sum of prompt embeddings and delta embeddings.
        """
        with torch.no_grad():
            frozen_part = self.embedding(indices)

        # Re-project the delta weights before every lookup
        self.delta_embedding.weight.data = self.get_projection()

        return frozen_part + self.delta_embedding(indices)

    def set_updated_tokens(self):
        """
        Register a hook that zeroes delta gradients for tokens whose type id is a multiple of 4.
        """
        residue = torch.remainder(torch.Tensor(self.config.cpt_tokens_type_mask).long(), 4)
        trainable_rows = ((residue == 1) | (residue == 2) | (residue == 3)).view(-1, 1)

        def backward_hook(grad):
            return grad * trainable_rows.to(grad.device)

        self.delta_embedding.weight.register_hook(backward_hook)

    def get_epsilon(self):
        """
        Build the per-token norm bound used by `get_projection`.

        Positive type ids with remainder 1 or 3 (mod 4) get the format epsilon, remainder 2 gets the input
        epsilon, and every other token gets a tiny constant. Both epsilons are scaled by
        `sqrt(token_dim / 2048)`.
        """
        cfg = self.config

        MIN_VALUE = 1e-10

        dim_scale = torch.sqrt(torch.Tensor([cfg.token_dim / 2048]))
        format_eps = cfg.opt_projection_format_epsilon * dim_scale
        input_eps = cfg.opt_projection_epsilon * dim_scale

        raw_mask = torch.Tensor(cfg.cpt_tokens_type_mask)
        epsilon = torch.ones_like(raw_mask).to(torch.float32) * MIN_VALUE

        type_ids = raw_mask.long()
        positive = type_ids > 0
        residue = torch.remainder(type_ids, 4)

        epsilon[positive & (residue == 1)] = format_eps
        epsilon[positive & (residue == 3)] = format_eps
        epsilon[positive & (residue == 2)] = input_eps

        return epsilon

    def get_projection(self):
        """
        Return a copy of the delta weights with every non-zero row rescaled so its L2 norm is at most its epsilon.
        """
        with torch.no_grad():
            weight = self.delta_embedding.weight
            projected = weight.clone().to(weight.device)
            norms = torch.norm(projected, p=2, dim=1)

            nonzero = norms > 0
            if torch.any(nonzero):
                eps = self.get_epsilon().to(weight.device)[nonzero]
                # Rows already within the bound get a factor of 1 thanks to the clamp
                scale = eps / norms[nonzero].clamp(min=eps)
                projected[nonzero] *= scale.view(-1, 1)
        return projected

    @staticmethod
    def calculate_loss(base_model_output, labels, cpt_type_mask, config):
        """
        Compute the (optionally decay-weighted) next-token loss and store it on the model output.

        Args:
            base_model_output (ModelOutput):
                Output from the base model containing logits.
            labels (torch.Tensor):
                Ground-truth labels for the input tokens; positions equal to -100 are ignored.
            cpt_type_mask (torch.Tensor):
                Token type mask; positive multiples of 4 mark the label groups used for decay weighting.
            config (Namespace):
                Configuration object providing `opt_loss_decay_factor` and `opt_weighted_loss_type`.

        Returns:
            ModelOutput:
                The same `base_model_output` object with its `loss` attribute set.
        """
        logits = base_model_output.logits
        device = logits.device
        labels = labels.to(device)

        # Shift so that position t predicts token t + 1
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        shift_types = cpt_type_mask[..., 1:].contiguous()

        valid = (shift_labels.clone().detach() != -100).bool()
        batch_size, seq_length, vocab_size = shift_logits.shape

        # Per-token cross-entropy, reshaped back to (batch, seq)
        criterion = CrossEntropyLoss(reduction="none", ignore_index=-100)
        flat_loss = criterion(
            shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length)
        )
        per_token = flat_loss.view(batch_size, seq_length)

        weights = valid.clone().detach().float()

        for row in range(batch_size):
            row_types = shift_types[row]
            is_label = (row_types > 0) & (row_types % 4 == 0)
            label_ids = row_types[is_label].unique()

            # The largest label id gets weight 1; each smaller one is multiplied by the decay factor once more
            decay = torch.ones_like(row_types).to(device=device).float()
            factor = 1
            for label_id in torch.flip(label_ids, [0]):
                decay[row_types == label_id] = factor
                factor *= config.opt_loss_decay_factor
            if config.opt_weighted_loss_type == "decay":
                weights[row] *= decay

        # Mean of the weighted losses over the valid positions
        base_model_output.loss = (per_token[valid] * weights[valid]).mean()

        return base_model_output
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from peft.utils import register_peft_method + +from .config import FourierFTConfig +from .layer import FourierFTLayer, FourierFTLinear +from .model import FourierFTModel + + +__all__ = ["FourierFTConfig", "FourierFTLayer", "FourierFTLinear", "FourierFTModel"] + +register_peft_method(name="fourierft", model_cls=FourierFTModel, config_cls=FourierFTConfig) diff --git a/peft/src/peft/tuners/fourierft/config.py b/peft/src/peft/tuners/fourierft/config.py new file mode 100644 index 0000000000000000000000000000000000000000..dbbb80d8e080e9237b711473b44d55f6ff186239 --- /dev/null +++ b/peft/src/peft/tuners/fourierft/config.py @@ -0,0 +1,206 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Optional, Union + +from peft.config import PeftConfig +from peft.utils import PeftType + + +@dataclass +class FourierFTConfig(PeftConfig): + """ + This is the configuration class to store the configuration of a [`FourierFTModel`]. + + Args: + n_frequency (`int`): + Num of learnable frequencies for the Discrete Fourier Transform. 'n_frequency' is an integer that is + greater than 0 and less than or equal to d^2 (assuming the weight W has dimensions of d by d). + Additionally, it is the number of trainable parameters required to update each delta W weight. + 'n_frequency' will affect the performance and efficiency for PEFT. Specifically, it has little impact on + training speed, but higher values of it (typically) result in larger GPU memory costs and better accuracy. + With the same `target_modules`, the number of parameters of LoRA is (2*d*r/n_frequency) times that of + FourierFT. The following examples of settings regarding 'n_frequency' can be used as reference for users. + For NLU tasks with the RoBERTa-large model, adopting 'n_frequency': 1000 can almost achieve similar results + as 'r': 8 in LoRA. At this time, the number of parameters of LoRA is about 16 times that of FourierFT. For + image classification tasks with Vit-large models, adopting 'n_frequency': 3000 can almost achieve similar + results as 'r': 16 in LoRA, where the number of parameters of LoRA is about 11 times that of FourierFT. + scaling (`float`): + The scaling value for the delta W matrix. This is an important hyperparameter used for scaling, similar to + the 'lora_alpha' parameter in the LoRA method. 'scaling' can be determined during the hyperparameter search + process. However, if users want to skip this process, one can refer to the settings in the following + scenarios. 
This parameter can be set to 100.0 or 150.0 for both RoBERTa-base and RoBERTa-large models + across all NLU (GLUE) tasks. This parameter can be set to 300.0 for both LLaMA family models for all + instruction tuning. This parameter can be set to 300.0 for both ViT-base and ViT-large models across all + image classification tasks. + random_loc_seed (`int`): + Seed for the random location of the frequencies, i.e., the spectral entry matrix. + target_modules (`Union[list[str],str]`): + List of module names or regex expression of the module names to replace with FourierFT. For example, ['q', + 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. Only linear layers are supported. + exclude_modules (`Optional[Union[List[str], str]]`): + The names of the modules to not apply the adapter. When passing a string, a regex match will be performed. + When passing a list of strings, either an exact match will be performed or it is checked if the name of the + module ends with any of the passed strings. + fan_in_fan_out (`bool`): + Set this to True if the layer to replace stores weight like (fan_in, fan_out). + bias (`str`): + Bias type for FourierFT. Can be 'none', 'all' or 'fourier_only'. + modules_to_save (`list[str]`): + List of modules apart from FourierFT layers to be set as trainable and saved in the final checkpoint. For + example, in Sequence Classification or Token Classification tasks, the final layer `classifier/score` are + randomly initialized and as such need to be trainable and saved. + layers_to_transform (`Union[list[int],int]`): + The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes + that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at + this index. + layers_pattern (`Optional[Union[List[str], str]]`): + The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is + not in the common layers pattern. 
This should target the `nn.ModuleList` of the model, which is often + called `'layers'` or `'h'`. + n_frequency_pattern (`dict`): + The mapping from layer names or regexp expression to n_frequency which are different from the default + specified. For example, `{model.decoder.layers.0.encoder_attn.k_proj: 1000`}. + init_weights (`bool`): + The initialization of the Fourier weights. Set this to False (the default) if the spectrum are initialized + to a standard normal distribution. Set this to True if the spectrum are initialized to zeros. + """ + + n_frequency: int = field( + default=1000, + metadata={ + "help": ( + "Num of learnable frequencies for the Discrete Fourier Transform. 'n_frequency' is an integer that is" + "greater than 0 and less than or equal to d^2 (assuming the weight W has dimensions of d by d)." + "Additionally, it is the number of trainable parameters required to update each delta W weight." + "'n_frequency' will affect the performance and efficiency for PEFT. Specifically, it has little impact on" + "training speed, but higher values of it (typically) result in larger GPU memory costs and better accuracy." + "With the same `target_modules`, the number of parameters of LoRA is (2*d*r/n_frequency) times that of FourierFT." + "The following examples of settings regarding 'n_frequency' can be used as reference for users. For NLU" + "tasks with the RoBERTa-large model, adopting 'n_frequency': 1000 can almost achieve similar results as" + "'r': 8 in LoRA. At this time, the number of parameters of LoRA is about 16 times that of FourierFT." + "For image classification tasks with Vit-large models, adopting 'n_frequency': 3000 can almost achieve" + "similar results as 'r': 16 in LoRA, where the number of parameters of LoRA is about 11 times that of FourierFT." + ) + }, + ) + scaling: float = field( + default=150.0, + metadata={ + "help": ( + "The scaling value for the delta W matrix. 
This is an important hyperparameter used for scaling, similar to the" + "'lora_alpha' parameter in the LoRA method. 'scaling' can be determined during the hyperparameter search process." + "However, if users want to skip this process, one can refer to the settings in the following scenarios." + "This parameter can be set to 100.0 or 150.0 for both RoBERTa-base and RoBERTa-large models across all NLU (GLUE) tasks." + "This parameter can be set to 300.0 for both LLaMA family models for all instruction tuning." + "This parameter can be set to 300.0 for both ViT-base and ViT-large models across all image classification tasks." + ) + }, + ) + random_loc_seed: Optional[int] = field( + default=777, metadata={"help": "Seed for the random location of the frequencies."} + ) + fan_in_fan_out: bool = field( + default=False, + metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, + ) + target_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": ( + "List of module names or regex expression of the module names to replace with FourierFT." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. " + "Only linear layers are supported." + ) + }, + ) + exclude_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={"help": "List of module names or regex expression of the module names to exclude from fourierft."}, + ) + bias: str = field( + default="none", metadata={"help": "Bias type for FourierFT. Can be 'none', 'all' or 'fourier_only'."} + ) + modules_to_save: Optional[list[str]] = field( + default=None, + metadata={ + "help": ( + "List of modules apart from FourierFT layers to be set as trainable and saved in the final checkpoint. For" + " example, in Sequence Classification or Token Classification tasks, the final layer" + " `classifier/score` are randomly initialized and as such need to be trainable and saved." 
+ ) + }, + ) + layers_to_transform: Optional[Union[list[int], int]] = field( + default=None, + metadata={ + "help": ( + "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers" + " indexes that are specified inside this list. If a single integer is passed, PEFT will transform only" + " the layer at this index." + ) + }, + ) + layers_pattern: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": ( + "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer" + " pattern is not in the common layers pattern. This should target the `nn.ModuleList` of the " + "model, which is often called `'layers'` or `'h'`." + ) + }, + ) + n_frequency_pattern: Optional[dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The mapping from layer names or regexp expression to n_frequency which are different from the default specified." + "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 500`}." + ) + }, + ) + init_weights: bool = field( + default=False, + metadata={ + "help": ( + "The initialization of the Fourier weights. Set this to False (the default) if the spectrum should be " + "initialized to a standard normal distribution. Set this to True if the spectrum should be initialized " + "to zeros." 
+ ) + }, + ) + + def __post_init__(self): + super().__post_init__() + self.peft_type = PeftType.FOURIERFT + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) + self.exclude_modules = ( + set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules + ) + # if target_modules is a regex expression, then layers_to_transform should be None + if isinstance(self.target_modules, str) and self.layers_to_transform is not None: + raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.") + + # if target_modules is a regex expression, then layers_pattern should be None + if isinstance(self.target_modules, str) and self.layers_pattern is not None: + raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.") + # check for layers_to_transform and layers_pattern + if self.layers_pattern and not self.layers_to_transform: + raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ") diff --git a/peft/src/peft/tuners/fourierft/layer.py b/peft/src/peft/tuners/fourierft/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..a03a57f118052d15e5f9cc4b53dc5c06cecd139e --- /dev/null +++ b/peft/src/peft/tuners/fourierft/layer.py @@ -0,0 +1,193 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import warnings +from typing import Any, Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers.pytorch_utils import Conv1D + +from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge + + +class FourierFTLayer(BaseTunerLayer): + # All names of layers that may contain (trainable) adapter weights + adapter_layer_names = ("fourierft_spectrum",) + # All names of other parameters that may contain adapter-related parameters + other_param_names = ("fourierft_n_frequency", "fourierft_scaling", "fourierft_random_loc_seed") + + def __init__(self, base_layer: nn.Module, **kwargs) -> None: + self.base_layer = base_layer + self.fourierft_n_frequency = {} + self.fourierft_scaling = {} + self.fourierft_spectrum = nn.ParameterDict({}) + self.indices = {} + self.fourierft_random_loc_seed = {} + # Mark the weight as unmerged + self._disable_adapters = False + self.merged_adapters = [] + self.kwargs = kwargs + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + self.in_features, self.out_features = base_layer.in_features, base_layer.out_features + elif isinstance(base_layer, Conv1D): + self.in_features, self.out_features = ( + base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape + ) + else: + raise ValueError(f"Unsupported layer type {type(base_layer)}") + + def update_layer( + self, adapter_name, n_frequency, scaling, init_weights, random_loc_seed, inference_mode: bool = False, **kwargs + ): + if n_frequency <= 0: + raise ValueError(f"`n_frequency` should be a positive integer value but the value passed is {n_frequency}") + if n_frequency > self.in_features * self.out_features: + raise ValueError( + f"`n_frequency` should be less than or equal to the product of the input and output dimensions " + f"but the value passed is {n_frequency} and the product is {self.in_features * self.out_features}" + ) + 
self.fourierft_n_frequency[adapter_name] = n_frequency + self.fourierft_random_loc_seed[adapter_name] = random_loc_seed + self.indices[adapter_name] = torch.randperm( + self.out_features * self.in_features, + generator=torch.Generator().manual_seed(self.fourierft_random_loc_seed[adapter_name]), + )[:n_frequency] + self.indices[adapter_name] = torch.stack( + [self.indices[adapter_name] // self.in_features, self.indices[adapter_name] % self.in_features], dim=0 + ) + self.fourierft_scaling[adapter_name] = scaling + # Actual trainable parameters + self.fourierft_spectrum[adapter_name] = nn.Parameter(torch.randn(n_frequency), requires_grad=True) + + if init_weights: + self.reset_fourier_parameters(adapter_name) + + self._move_adapter_to_device_of_base_layer(adapter_name) + self.set_adapter(self.active_adapters, inference_mode=inference_mode) + + @torch.no_grad() + def reset_fourier_parameters(self, adapter_name): + if adapter_name in self.fourierft_spectrum.keys(): + nn.init.zeros_(self.fourierft_spectrum[adapter_name]) + + def get_delta_weight(self, adapter) -> torch.Tensor: + # careful: ifft2 does not work with float16 or bfloat16 + spectrum = self.fourierft_spectrum[adapter] + indices = self.indices[adapter].to(spectrum.device) + dense_spectrum = torch.zeros(self.out_features, self.in_features, device=spectrum.device) + dense_spectrum[indices[0, :], indices[1, :]] = spectrum.float() + delta_weight = torch.fft.ifft2(dense_spectrum).real * self.fourierft_scaling[adapter] + return delta_weight.to(spectrum.dtype) + + +class FourierFTLinear(nn.Module, FourierFTLayer): + # FourierFT implemented in a dense layer + def __init__( + self, + base_layer, + adapter_name: str, + n_frequency: int = 1000, + scaling: float = 150.0, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + init_weights: Union[bool, str] = False, + random_loc_seed: int = 777, + **kwargs, + ) -> None: + super().__init__() + 
FourierFTLayer.__init__(self, base_layer, **kwargs) + self.fan_in_fan_out = fan_in_fan_out + self._active_adapter = adapter_name + self.update_layer(adapter_name, n_frequency, scaling, init_weights, random_loc_seed) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.fourierft_spectrum.keys(): + base_layer = self.get_base_layer() + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weights = base_layer.weight.data.clone() + orig_weights += self.get_delta_weight(active_adapter) + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weights + else: + base_layer.weight.data += self.get_delta_weight(active_adapter) + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. 
Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.fourierft_spectrum.keys(): + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + + def get_delta_weight(self, adapter) -> torch.Tensor: + return super().get_delta_weight(adapter) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + for active_adapter in self.active_adapters: + if active_adapter not in self.fourierft_spectrum.keys(): + continue + + delta_w = self.get_delta_weight(active_adapter) + x = x.to(delta_w.dtype) + result = result + F.linear(x, delta_w) + + result = result.to(previous_dtype) + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "fourierft." + rep diff --git a/peft/src/peft/tuners/fourierft/model.py b/peft/src/peft/tuners/fourierft/model.py new file mode 100644 index 0000000000000000000000000000000000000000..5347d90b1777fcef235215c15999d6b39fd09ede --- /dev/null +++ b/peft/src/peft/tuners/fourierft/model.py @@ -0,0 +1,128 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import re +import warnings +from itertools import chain + +import torch +from transformers.pytorch_utils import Conv1D + +from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer +from peft.utils import ( + TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING, +) + +from .layer import FourierFTLayer, FourierFTLinear + + +class FourierFTModel(BaseTuner): + """ + Creates FourierFT model from a pretrained transformers model. + + The method is described in detail in https://huggingface.co/papers/2405.03003. + + Args: + model ([`torch.nn.Module`]): The model to be adapted. + config ([`FourierFTConfig`]): The configuration of the FourierFT model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the loading process. + + Returns: + `torch.nn.Module`: The FourierFT model. + + **Attributes**: + - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted. + - **peft_config** ([`FourierFTConfig`]): The configuration of the Fourier model. 
+ """ + + prefix: str = "fourierft_" + tuner_layer_cls = FourierFTLayer + target_module_mapping = TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING + + def _create_and_replace( + self, + fourierft_config, + adapter_name, + target, + target_name, + parent, + current_key, + **optional_kwargs, + ): + if current_key is None: + raise ValueError("Current Key shouldn't be `None`") + # Regexp matching - Find key which matches current target_name in patterns provided + pattern_keys = list(chain(fourierft_config.n_frequency_pattern.keys())) + target_name_key = next(filter(lambda key: re.match(rf".*\.{key}$", current_key), pattern_keys), current_key) + + n_frequency = fourierft_config.n_frequency_pattern.get(target_name_key, fourierft_config.n_frequency) + scaling = fourierft_config.scaling + random_loc_seed = fourierft_config.random_loc_seed + bias = hasattr(target, "bias") and target.bias is not None + kwargs = { + "n_frequency": n_frequency, + "scaling": scaling, + "fan_in_fan_out": fourierft_config.fan_in_fan_out, + "init_weights": fourierft_config.init_weights, + "random_loc_seed": fourierft_config.random_loc_seed, + } + kwargs["bias"] = bias + if isinstance(target, FourierFTLayer): + target.update_layer( + adapter_name, + n_frequency, + scaling, + fourierft_config.init_weights, + random_loc_seed, + ) + else: + new_module = self._create_new_module(fourierft_config, adapter_name, target, **kwargs) + if adapter_name != self.active_adapter: + # adding an additional adapter: it is not automatically trainable + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + + @staticmethod + def _create_new_module(fourierft_config, adapter_name, target, **kwargs): + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if isinstance(target_base_layer, torch.nn.Linear): + if kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target 
module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." + ) + kwargs["fan_in_fan_out"] = fourierft_config.fan_in_fan_out = False + elif isinstance(target_base_layer, Conv1D): + kwargs["is_target_conv_1d_layer"] = True + if not kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. Setting fan_in_fan_out to True." + ) + kwargs["fan_in_fan_out"] = fourierft_config.fan_in_fan_out = True + else: + raise ValueError( + f"Target module {target} is not supported. Currently, only the following modules are supported: " + "`torch.nn.Linear`." + ) + + new_module = FourierFTLinear(target, adapter_name, **kwargs) + + return new_module diff --git a/peft/src/peft/tuners/hra/__init__.py b/peft/src/peft/tuners/hra/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8f5f6a54435d0fcaa1bd275c623168b5491f2d9e --- /dev/null +++ b/peft/src/peft/tuners/hra/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from peft.utils import register_peft_method + +from .config import HRAConfig +from .layer import HRAConv2d, HRALayer, HRALinear +from .model import HRAModel + + +__all__ = ["HRAConfig", "HRAConv2d", "HRALayer", "HRALinear", "HRAModel"] + +register_peft_method(name="hra", config_cls=HRAConfig, model_cls=HRAModel) diff --git a/peft/src/peft/tuners/hra/config.py b/peft/src/peft/tuners/hra/config.py new file mode 100644 index 0000000000000000000000000000000000000000..9e6d87cc17c3ba003eac963467bb1253725ebe10 --- /dev/null +++ b/peft/src/peft/tuners/hra/config.py @@ -0,0 +1,133 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Optional, Union + +from peft.config import PeftConfig +from peft.utils import PeftType + + +@dataclass +class HRAConfig(PeftConfig): + """ + This is the configuration class to store the configuration of a [`HRAModel`]. + + Args: + r (`int`): + The rank of HRA across different layers. It is best to set 'r' to an even number; otherwise, the default + initialization method will not work. + apply_GS (`bool`): + Whether to apply Gram-Schmidt orthogonalization. + target_modules (`Optional[Union[List[str], str]]`): + The names of the modules to apply the adapter to. If this is specified, only the modules with the specified + names will be replaced. When passing a string, a regex match will be performed. 
When passing a list of + strings, either an exact match will be performed or it is checked if the name of the module ends with any + of the passed strings. If this is specified as 'all-linear', then all linear modules are chosen, excluding + the output layer. If this is not specified, modules will be chosen according to the model architecture. If + the architecture is not known, an error will be raised -- in this case, you should specify the target + modules manually. + exclude_modules (`Optional[Union[List[str], str]]`): + The names of the modules to not apply the adapter. When passing a string, a regex match will be performed. + When passing a list of strings, either an exact match will be performed or it is checked if the name of the + module ends with any of the passed strings. + init_weights (`bool`): + Whether to perform initialization of HRA weights. + layers_to_transform (`Union[List[int], int]`): + The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices + that are specified in this list. If a single integer is passed, it will apply the transformations on the + layer at this index. + layers_pattern (`Optional[Union[List[str], str]]`): + The layer pattern name, used only if `layers_to_transform` is different from `None`. This should target the + `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`. + modules_to_save (`List[str]`): + List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. 
+ """ + + r: int = field( + default=8, + metadata={ + "help": "The rank of HRA across different layers.", + "note": "It is best to set 'r' to an even number; otherwise, the default initialization method will not work.", + }, + ) + apply_GS: bool = field( + default=False, + metadata={"help": "Whether to apply Gram-Schmidt orthogonalization or not."}, + ) + target_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to replace with HRA.", + "example": "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' ", + }, + ) + exclude_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={"help": "List of module names or regex expression of the module names to exclude from HRA."}, + ) + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "Whether to initialize the weights of the HRA layers with their default initialization. Don't change " + "this setting, except if you know exactly what you're doing." + ), + }, + ) + layers_to_transform: Optional[Union[list[int], int]] = field( + default=None, + metadata={ + "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index." + }, + ) + layers_pattern: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern. " + "This should target the `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`." + }, + ) + bias: str = field(default="none", metadata={"help": "Bias type for HRA. 
Can be 'none', 'all' or 'hra_only'"}) + modules_to_save: Optional[list[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from HRA layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." + }, + ) + + def __post_init__(self): + super().__post_init__() + self.peft_type = PeftType.HRA + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) + self.exclude_modules = ( + set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules + ) + # if target_modules is a regex expression, then layers_to_transform should be None + if isinstance(self.target_modules, str) and self.layers_to_transform is not None: + raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.") + + # if target_modules is a regex expression, then layers_pattern should be None + if isinstance(self.target_modules, str) and self.layers_pattern is not None: + raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.") + + # check for layers_to_transform and layers_pattern + if self.layers_pattern and not self.layers_to_transform: + raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ") diff --git a/peft/src/peft/tuners/hra/layer.py b/peft/src/peft/tuners/hra/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..55ab6db69a6730f0ef3b4e46ee46071426091269 --- /dev/null +++ b/peft/src/peft/tuners/hra/layer.py @@ -0,0 +1,461 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import warnings +from typing import Any, Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge + + +class HRALayer(BaseTunerLayer): + # All names of layers that may contain (trainable) adapter weights + adapter_layer_names = ("hra_u",) + # All names of other parameters that may contain adapter-related parameters + other_param_names = ("hra_r", "hra_apply_GS") + + def __init__(self, base_layer: nn.Module, **kwargs) -> None: + self.base_layer = base_layer + self.hra_r = {} + self.hra_apply_GS = {} + self.hra_u = nn.ParameterDict({}) + # Mark the weight as unmerged + self._disable_adapters = False + self.merged_adapters = [] + # flag to enable/disable casting of input to weight dtype during forward call + self.cast_input_dtype_enabled = True + self.kwargs = kwargs + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + self.in_features, self.out_features = base_layer.in_features, base_layer.out_features + elif isinstance(base_layer, nn.Conv2d): + self.in_features, self.out_features = base_layer.in_channels, base_layer.out_channels + else: + raise ValueError(f"Unsupported layer type {type(base_layer)}") + + def update_layer( + self, + adapter_name: str, + r: int, + apply_GS: bool, + init_weights: bool, + inference_mode: bool = False, + **kwargs, + ) -> None: + """Internal function to create hra adapter + + Args: + adapter_name (`str`): Name for the adapter to add. + r (`int`): Rank for the added adapter. 
+ init_weights (`bool`): Whether to initialize weights. + apply_GS (`bool`): Whether to apply Gram-Schmidt orthogonalization or not. + """ + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.hra_r[adapter_name] = r + self.hra_apply_GS[adapter_name] = apply_GS + + # Determine shape of HRA weights + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + self.hra_u[adapter_name] = nn.Parameter(torch.empty(self.in_features, r), requires_grad=True) + elif isinstance(base_layer, nn.Conv2d): + self.hra_u[adapter_name] = nn.Parameter( + torch.empty(self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0], r), + requires_grad=True, + ) + else: + raise TypeError(f"HRA is not implemented for base layers of type {type(base_layer).__name__}") + + # Initialize weights + if init_weights: + self.reset_hra_parameters(adapter_name) + else: + self.reset_hra_parameters_random(adapter_name) + + # Move new weights to device + self._move_adapter_to_device_of_base_layer(adapter_name) + self.set_adapter(self.active_adapters, inference_mode=inference_mode) + + def reset_hra_parameters(self, adapter_name: str): + if self.hra_r[adapter_name] % 2 != 0: + warnings.warn("The symmetric initialization can NOT be performed when r is odd!") + nn.init.kaiming_uniform_(self.hra_u[adapter_name], a=math.sqrt(5)) + else: + shape = self.hra_u[adapter_name].shape + half_u = torch.zeros(shape[0], shape[1] // 2) + nn.init.kaiming_uniform_(half_u, a=math.sqrt(5)) + self.hra_u[adapter_name] = nn.Parameter(torch.repeat_interleave(half_u, 2, dim=1)) + + def reset_hra_parameters_random(self, adapter_name: str): + nn.init.kaiming_uniform_(self.hra_u[adapter_name], a=math.sqrt(5)) + + def scale_layer(self, scale: float) -> None: + if scale == 1: + return + + for active_adapter in self.active_adapters: + if active_adapter not in self.hra_u.keys(): + continue + + warnings.warn("Scaling operation for HRA not supported! 
Automatically set scale to 1.") + + def unscale_layer(self, scale=None) -> None: + for active_adapter in self.active_adapters: + if active_adapter not in self.hra_u.keys(): + continue + + warnings.warn("Unscaling operation for HRA not supported! Keeping scale at 1.") + + +class HRALinear(nn.Module, HRALayer): + """ + HRA implemented in a dense layer. + """ + + def __init__( + self, + base_layer, + adapter_name: str, + r: int = 0, + apply_GS: bool = False, + init_weights: Union[bool, str] = True, + **kwargs, + ) -> None: + super().__init__() + HRALayer.__init__(self, base_layer, **kwargs) + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, apply_GS, init_weights, **kwargs) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If `None`, all active adapters will be merged. + Defaults to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.hra_u.keys(): + base_layer = self.get_base_layer() + orig_dtype = base_layer.weight.dtype + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weight = base_layer.weight.data.clone() + delta_weight = self.get_delta_weight(active_adapter) + orig_weight = torch.mm(orig_weight.to(delta_weight.dtype), delta_weight) + + if not torch.isfinite(orig_weight).all(): + raise ValueError( + f"NaNs detected in the merged weights. 
The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weight.to(orig_dtype) + else: + delta_weight = self.get_delta_weight(active_adapter) + new_weight = torch.mm(base_layer.weight.data.to(delta_weight.dtype), delta_weight) + base_layer.weight.data = new_weight.to(orig_dtype) + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + base_layer = self.get_base_layer() + orig_dtype = base_layer.weight.dtype + if active_adapter in self.hra_u.keys(): + orig_weight = base_layer.weight.data.clone() + delta_weight = self.get_delta_weight(active_adapter, reverse=True) + new_weight = torch.mm(orig_weight.to(delta_weight.dtype), delta_weight) + base_layer.weight.data = new_weight.to(orig_dtype) + + def get_delta_weight(self, adapter_name: str, reverse: bool = False) -> torch.Tensor: + rank = self.hra_r[adapter_name] + apply_GS = self.hra_apply_GS[adapter_name] + opt_u = self.hra_u[adapter_name] + shape = opt_u.shape + + if apply_GS: + weight = [(opt_u[:, 0] / opt_u[:, 0].norm()).view(-1, 1)] + for i in range(1, rank): + ui = opt_u[:, i].view(-1, 1) + for j in range(i): + ui = ui - (weight[j].t() @ ui) * weight[j] + weight.append((ui / ui.norm()).view(-1, 1)) + weight = torch.cat(weight, dim=1) + weight = torch.eye(shape[0], device=opt_u.device, dtype=opt_u.dtype) - 2 * weight @ weight.t() + + else: + opt_u = opt_u / opt_u.norm(dim=0) + weight = torch.eye(shape[0], device=opt_u.device, dtype=opt_u.dtype) + if reverse: + indices = range(rank - 1, -1, -1) + else: + indices = range(rank) + + for i in indices: + ui = opt_u[:, i].view(-1, 1) + weight = weight - 2 * weight @ ui @ ui.t() + + return weight + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> 
torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + new_weight = torch.eye(self.in_features, device=x.device) + + for active_adapter in self.active_adapters: + if active_adapter not in self.hra_u.keys(): + continue + delta_weight = self.get_delta_weight(active_adapter) + new_weight = torch.mm(new_weight.to(delta_weight.dtype), delta_weight) + + orig_weight = self.get_base_layer().weight.data + orig_weight = self._cast_input_dtype(orig_weight, new_weight.dtype) + new_weight = torch.mm(orig_weight, new_weight) + bias = self._cast_input_dtype(self.base_layer.bias, new_weight.dtype) + + if self.cast_input_dtype_enabled: + x = self._cast_input_dtype(x, new_weight.dtype) + else: + x = x.to(self.get_base_layer().weight.data.dtype) + result = F.linear(input=x, weight=new_weight, bias=bias) + + result = result.to(previous_dtype) + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "hra." + rep + + +class HRAConv2d(nn.Module, HRALayer): + """HRA implemented in Conv2d layer""" + + def __init__( + self, + base_layer, + adapter_name: str, + r: int = 0, + apply_GS: bool = False, + init_weights: Union[bool, str] = True, + **kwargs, + ): + super().__init__() + HRALayer.__init__(self, base_layer) + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, apply_GS, init_weights, **kwargs) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. 
+ adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If `None`, all active adapters will be merged. + Defaults to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.hra_u.keys(): + base_layer = self.get_base_layer() + orig_dtype = base_layer.weight.dtype + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weight = base_layer.weight.data.clone() + orig_weight = orig_weight.view( + self.out_features, + self.in_features * base_layer.kernel_size[0] * self.base_layer.kernel_size[0], + ) + delta_weight = self.get_delta_weight(active_adapter) + orig_weight = torch.mm(orig_weight.to(delta_weight.dtype), delta_weight) + orig_weight = orig_weight.view( + self.out_features, + self.in_features, + base_layer.kernel_size[0], + base_layer.kernel_size[0], + ) + + if not torch.isfinite(orig_weight).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weight.to(orig_dtype) + else: + orig_weight = base_layer.weight.data + orig_weight = orig_weight.view( + self.out_features, + self.in_features * self.base_layer.kernel_size[0] * self.base_layer.kernel_size[0], + ) + delta_weight = self.get_delta_weight(active_adapter) + orig_weight = torch.mm(orig_weight.to(delta_weight.dtype), delta_weight) + orig_weight = orig_weight.view( + self.out_features, + self.in_features, + base_layer.kernel_size[0], + base_layer.kernel_size[0], + ) + + base_layer.weight.data = orig_weight.to(orig_dtype) + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. 
Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + base_layer = self.get_base_layer() + orig_dtype = base_layer.weight.dtype + if active_adapter in self.hra_u.keys(): + orig_weight = base_layer.weight.data.clone() + orig_weight = orig_weight.view( + self.out_features, + self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0], + ) + delta_weight = self.get_delta_weight(active_adapter, reverse=True) + orig_weight = torch.mm(orig_weight.to(delta_weight.dtype), delta_weight) + orig_weight = orig_weight.view( + self.out_features, self.in_features, base_layer.kernel_size[0], base_layer.kernel_size[0] + ) + + base_layer.weight.data = orig_weight.to(orig_dtype) + + def get_delta_weight(self, adapter_name: str, reverse: bool = False) -> torch.Tensor: + rank = self.hra_r[adapter_name] + apply_GS = self.hra_apply_GS[adapter_name] + opt_u = self.hra_u[adapter_name] + shape = opt_u.shape + + if apply_GS: + weight = [(opt_u[:, 0] / opt_u[:, 0].norm()).view(-1, 1)] + for i in range(1, rank): + ui = opt_u[:, i].view(-1, 1) + for j in range(i): + ui = ui - (weight[j].t() @ ui) * weight[j] + weight.append((ui / ui.norm()).view(-1, 1)) + weight = torch.cat(weight, dim=1) + weight = torch.eye(shape[0], device=opt_u.device, dtype=opt_u.dtype) - 2 * weight @ weight.t() + + else: + opt_u = opt_u / opt_u.norm(dim=0) + weight = torch.eye(shape[0], device=opt_u.device, dtype=opt_u.dtype) + if reverse: + indices = range(rank - 1, -1, -1) + else: + indices = range(rank) + + for i in indices: + ui = opt_u[:, i].view(-1, 1) + weight = weight - 2 * weight @ ui @ ui.t() + + return weight + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + new_weight = torch.eye( + 
self.in_features * self.base_layer.kernel_size[0] * self.base_layer.kernel_size[0], + device=x.device, + ) + for active_adapter in self.active_adapters: + if active_adapter not in self.hra_u.keys(): + continue + delta_weight = self.get_delta_weight(active_adapter) + new_weight = torch.mm(new_weight.to(delta_weight.dtype), delta_weight) + + orig_weight = self.base_layer.weight.data + orig_weight = orig_weight.view( + self.out_features, + self.in_features * self.base_layer.kernel_size[0] * self.base_layer.kernel_size[0], + ) + orig_weight = self._cast_input_dtype(orig_weight, new_weight.dtype) + bias = self._cast_input_dtype(self.base_layer.bias, new_weight.dtype) + + new_weight = torch.mm(orig_weight, new_weight) + new_weight = new_weight.view( + self.out_features, + self.in_features, + self.base_layer.kernel_size[0], + self.base_layer.kernel_size[0], + ) + + if self.cast_input_dtype_enabled: + x = self._cast_input_dtype(x, new_weight.dtype) + else: + x = x.to(self.get_base_layer().weight.data.dtype) + result = F.conv2d( + input=x, + weight=new_weight, + bias=bias, + padding=self.base_layer.padding[0], + stride=self.base_layer.stride[0], + ) + + result = result.to(previous_dtype) + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "hra." + rep diff --git a/peft/src/peft/tuners/hra/model.py b/peft/src/peft/tuners/hra/model.py new file mode 100644 index 0000000000000000000000000000000000000000..e210d226b5294053788188b9d3b307b73c7d9457 --- /dev/null +++ b/peft/src/peft/tuners/hra/model.py @@ -0,0 +1,131 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch + +from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer +from peft.utils import TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING + +from .layer import HRAConv2d, HRALayer, HRALinear + + +class HRAModel(BaseTuner): + """ + Creates Householder reflection adaptation (HRA) model from a pretrained model. The method is described in + https://huggingface.co/papers/2405.17484 + + Args: + model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached. + config ([`HRAConfig`]): The configuration of the HRA model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the loading process. + + Returns: + `torch.nn.Module`: The HRA model. + + Example: + ```py + >>> from diffusers import StableDiffusionPipeline + >>> from peft import HRAModel, HRAConfig + + >>> config_te = HRAConfig( + ... r=8, + ... target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + ... init_weights=True, + ... ) + >>> config_unet = HRAConfig( + ... r=8, + ... target_modules=[ + ... "proj_in", + ... "proj_out", + ... "to_k", + ... "to_q", + ... "to_v", + ... "to_out.0", + ... "ff.net.0.proj", + ... "ff.net.2", + ... ], + ... init_weights=True, + ... 
) + + >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> model.text_encoder = HRAModel(model.text_encoder, config_te, "default") + >>> model.unet = HRAModel(model.unet, config_unet, "default") + ``` + + **Attributes**: + - **model** ([`~torch.nn.Module`]) -- The model to be adapted. + - **peft_config** ([`HRAConfig`]): The configuration of the HRA model. + """ + + prefix: str = "hra_" + tuner_layer_cls = HRALayer + target_module_mapping = TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING + + def _create_and_replace( + self, + hra_config, + adapter_name, + target, + target_name, + parent, + current_key, + **optional_kwargs, + ): + if current_key is None: + raise ValueError("Current Key shouldn't be `None`") + + bias = hasattr(target, "bias") and target.bias is not None + kwargs = { + "r": hra_config.r, + "apply_GS": hra_config.apply_GS, + "init_weights": hra_config.init_weights, + } + kwargs["bias"] = bias + + # If it is not a HRALayer, create a new module, else update it with new adapters + if not isinstance(target, HRALayer): + new_module = self._create_new_module(hra_config, adapter_name, target, **kwargs) + if adapter_name not in self.active_adapters: + # adding an additional adapter: it is not automatically trainable + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + else: + target.update_layer( + adapter_name, + r=hra_config.r, + apply_GS=hra_config.apply_GS, + init_weights=hra_config.init_weights, + ) + + @staticmethod + def _create_new_module(hra_config, adapter_name, target, **kwargs): + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if isinstance(target_base_layer, torch.nn.Linear): + new_module = HRALinear(target, adapter_name, **kwargs) + elif isinstance(target_base_layer, torch.nn.Conv2d): + new_module = HRAConv2d(target, adapter_name, **kwargs) + else: + raise ValueError( + f"Target 
module {target} is not supported. " + "Currently, only `torch.nn.Linear` and `torch.nn.Conv2d` are supported." + ) + + return new_module diff --git a/peft/src/peft/tuners/ia3/__init__.py b/peft/src/peft/tuners/ia3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..21cab4d6d8a6437766f395e90079ce6ecd9e4f26 --- /dev/null +++ b/peft/src/peft/tuners/ia3/__init__.py @@ -0,0 +1,39 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from peft.import_utils import is_bnb_4bit_available, is_bnb_available +from peft.utils import register_peft_method + +from .config import IA3Config +from .layer import Conv2d, Conv3d, IA3Layer, Linear +from .model import IA3Model + + +__all__ = ["Conv2d", "Conv3d", "IA3Config", "IA3Layer", "IA3Model", "Linear"] + +register_peft_method(name="ia3", config_cls=IA3Config, model_cls=IA3Model, is_mixed_compatible=True) + + +def __getattr__(name): + if (name == "Linear8bitLt") and is_bnb_available(): + from .bnb import Linear8bitLt + + return Linear8bitLt + + if (name == "Linear4bit") and is_bnb_4bit_available(): + from .bnb import Linear4bit + + return Linear4bit + + raise AttributeError(f"module {__name__} has no attribute {name}") diff --git a/peft/src/peft/tuners/ia3/bnb.py b/peft/src/peft/tuners/ia3/bnb.py new file mode 100644 index 0000000000000000000000000000000000000000..628e3ce7229528a0b3157da349b2b34153573c51 --- /dev/null +++ b/peft/src/peft/tuners/ia3/bnb.py @@ -0,0 +1,129 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Any + +import torch + +from peft.import_utils import is_bnb_4bit_available, is_bnb_available + +from .layer import IA3Layer + + +if is_bnb_available(): + + class Linear8bitLt(torch.nn.Module, IA3Layer): + # (IA)^3 implemented in a dense layer + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + is_feedforward: bool, + init_ia3_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) + + # Freezing the pre-trained weight matrix + self.get_base_layer().weight.requires_grad = False + self._active_adapter = adapter_name + self.update_layer(adapter_name, init_ia3_weights) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + # note: no check for self.merged because merging is not supported (yet) + if self.disable_adapters: + return self.base_layer(x) + + ia3_scaling = 1 + for active_adapter in self.active_adapters: + if active_adapter not in self.ia3_l.keys(): + continue + ia3_scaling *= self.ia3_l[active_adapter].flatten() + + requires_conversion = (not torch.is_autocast_enabled()) and (x.dtype != torch.float32) + if requires_conversion: + x = x.float() + if self.is_feedforward: + result = self.base_layer(x * ia3_scaling) + expected_dtype = result.dtype + else: + result = self.base_layer(x) + expected_dtype = result.dtype + result = result * ia3_scaling + + if requires_conversion: + result = result.to(expected_dtype) + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "ia3." 
+ rep + + +if is_bnb_4bit_available(): + + class Linear4bit(torch.nn.Module, IA3Layer): + # IA3 implemented in a dense layer + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + is_feedforward: bool, + init_ia3_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) + + # Freezing the pre-trained weight matrix + self.get_base_layer().weight.requires_grad = False + self._active_adapter = adapter_name + self.update_layer(adapter_name, init_ia3_weights) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + # note: no check for self.merged because merging is not supported (yet) + if self.disable_adapters: + return self.base_layer(x) + + ia3_scaling = 1 + for active_adapter in self.active_adapters: + if active_adapter not in self.ia3_l.keys(): + continue + ia3_scaling *= self.ia3_l[active_adapter].flatten() + + requires_conversion = (not torch.is_autocast_enabled()) and (x.dtype != torch.float32) + if requires_conversion: + x = x.float() + if self.is_feedforward: + result = self.base_layer(x * ia3_scaling) + expected_dtype = result.dtype + else: + result = self.base_layer(x) + expected_dtype = result.dtype + result = result * ia3_scaling + + result = result.clone() + # adalora.py and lora.py both suggest that this is necessary for 4-bit training on older versions of Pytorch. + # This has been duplicated here. + + if requires_conversion: + result = result.to(expected_dtype) + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "ia3." + rep diff --git a/peft/src/peft/tuners/ia3/config.py b/peft/src/peft/tuners/ia3/config.py new file mode 100644 index 0000000000000000000000000000000000000000..1c4161e5ed97998017b800535d4315c66c9d14cd --- /dev/null +++ b/peft/src/peft/tuners/ia3/config.py @@ -0,0 +1,112 @@ +# Copyright 2023-present the HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Optional, Union + +from peft.config import PeftConfig +from peft.utils import PeftType + + +@dataclass +class IA3Config(PeftConfig): + """ + This is the configuration class to store the configuration of a [`IA3Model`]. + + Args: + target_modules (`Optional[Union[List[str], str]]`): + The names of the modules to apply the adapter to. If this is specified, only the modules with the specified + names will be replaced. When passing a string, a regex match will be performed. When passing a list of + strings, either an exact match will be performed or it is checked if the name of the module ends with any + of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen, + excluding the output layer. If this is not specified, modules will be chosen according to the model + architecture. If the architecture is not known, an error will be raised -- in this case, you should specify + the target modules manually. + exclude_modules (`Optional[Union[List[str], str]]`): + The names of the modules to not apply the adapter. When passing a string, a regex match will be performed. + When passing a list of strings, either an exact match will be performed or it is checked if the name of the + module ends with any of the passed strings. 
+ feedforward_modules (`Optional[Union[List[str], str]]`): + The names of the modules to be treated as feedforward modules, as in the original paper. These modules will + have (IA)³ vectors multiplied to the input, instead of the output. `feedforward_modules` must be a name or + a subset of names present in `target_modules`. + fan_in_fan_out (`bool`): + Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses + `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`. + modules_to_save (`Optional[List[str]]`): + List of modules apart from (IA)³ layers to be set as trainable and saved in the final checkpoint. + init_ia3_weights (`bool`): + Whether to initialize the vectors in the (IA)³ layers, defaults to `True`. Setting this to `False` is + discouraged. + """ + + target_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": ( + "List of module names or regex expression of the module names to replace with (IA)³." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'." + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." + "If not specified, modules will be chosen according to the model architecture, If the architecture is " + "not known, an error will be raised -- in this case, you should specify the target modules manually." 
+ ), + }, + ) + exclude_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={"help": "List of module names or regex expression of the module names to exclude from (IA)³."}, + ) + feedforward_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or a regex expression of module names which are feedforward" + "For example, ['output.dense']" + }, + ) + fan_in_fan_out: bool = field( + default=False, + metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, + ) + modules_to_save: Optional[list[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from (IA)^3 layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." + }, + ) + init_ia3_weights: bool = field( + default=True, + metadata={"help": "Whether to initialize the vectors in the (IA)^3 layers."}, + ) + + def __post_init__(self): + super().__post_init__() + self.peft_type = PeftType.IA3 + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) + self.exclude_modules = ( + set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules + ) + self.feedforward_modules = ( + set(self.feedforward_modules) if isinstance(self.feedforward_modules, list) else self.feedforward_modules + ) + + # check if feedforward_modules is a subset of target_modules. 
run the check only if both are sets + if isinstance(self.feedforward_modules, set) and isinstance(self.target_modules, set): + if not self.feedforward_modules.issubset(self.target_modules): + raise ValueError("`feedforward_modules` should be a subset of `target_modules`") diff --git a/peft/src/peft/tuners/ia3/layer.py b/peft/src/peft/tuners/ia3/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..48cb08ba46aa7af05f1d891507e39ee6300f10dc --- /dev/null +++ b/peft/src/peft/tuners/ia3/layer.py @@ -0,0 +1,330 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import warnings +from typing import Any, Optional + +import torch +import torch.nn as nn +from transformers.pytorch_utils import Conv1D + +from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft.utils import transpose + + +class IA3Layer(BaseTunerLayer): + # All names of layers that may contain adapter weights + adapter_layer_names = ("ia3_l",) + + def __init__(self, base_layer: nn.Module, is_feedforward: bool, **kwargs) -> None: + self.base_layer = base_layer + self.ia3_l = nn.ParameterDict({}) + # Mark the weight as unmerged + self._disable_adapters = False + self.merged_adapters = [] + self.is_feedforward = is_feedforward + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + elif isinstance(base_layer, (nn.Conv2d, nn.Conv3d)): + in_features, out_features = base_layer.in_channels, base_layer.out_channels + elif isinstance(base_layer, nn.Embedding): + in_features, out_features = base_layer.num_embeddings, base_layer.embedding_dim + elif isinstance(base_layer, Conv1D): + in_features, out_features = ( + base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape + ) + else: + raise ValueError(f"Unsupported layer type {type(base_layer)}") + self.in_features = in_features + self.out_features = out_features + + def update_layer(self, adapter_name, init_ia3_weights, inference_mode: bool = False, **kwargs): + # This code works for linear layers, override for other layer types + # Actual trainable parameters + if self.is_feedforward: + weight = torch.randn((1, self.in_features)) + else: + weight = torch.randn((self.out_features, 1)) + self.ia3_l[adapter_name] = nn.Parameter(weight) + if init_ia3_weights: + self.reset_ia3_parameters(adapter_name) + self._move_adapter_to_device_of_base_layer(adapter_name) + self.set_adapter(self.active_adapters, inference_mode=inference_mode) + + def 
reset_ia3_parameters(self, adapter_name): + if adapter_name in self.ia3_l.keys(): + # initialize learned vector with torch.ones + nn.init.constant_(self.ia3_l[adapter_name], 1.0) + + +class Linear(nn.Module, IA3Layer): + # (IA)^3 implemented in a dense layer + def __init__( + self, + base_layer: nn.Module, + adapter_name: str, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + is_feedforward: bool = False, # Set to True if the layer is treated as a feedforward layer + is_target_conv_1d_layer: bool = False, # whether target module is a conv1d layer. useful while unloading later + init_ia3_weights: bool = True, # whether to initialize IA3 weights + **kwargs, + ) -> None: + super().__init__() + IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) + self.fan_in_fan_out = fan_in_fan_out + self.is_target_conv_1d_layer = is_target_conv_1d_layer + self._active_adapter = adapter_name + self.update_layer(adapter_name, init_ia3_weights) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. 
+ """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.ia3_l.keys(): + base_layer = self.get_base_layer() + ia3_l = transpose(self.ia3_l[active_adapter].data, self.fan_in_fan_out) + orig_dtype = base_layer.weight.data.dtype + if safe_merge: + orig_weights = base_layer.weight.data + orig_weights = torch.mul(orig_weights, ia3_l) + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + base_layer.weight.data = orig_weights.to(orig_dtype) + else: + base_layer.weight.data = torch.mul(base_layer.weight.data, ia3_l).to(orig_dtype) + + if not self.is_feedforward and (base_layer.bias is not None): + scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) + orig_dtype = base_layer.bias.data.dtype + base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data).to(orig_dtype) + + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. 
Nothing to do.") + return + + warnings.warn("Unmerge result can be inaccurate for (IA)^3.") + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.ia3_l.keys(): + base_layer = self.get_base_layer() + # Add tolerace to avoid division by zero + ia3_l = transpose(self.ia3_l[active_adapter].data, self.fan_in_fan_out) + 1e-8 + orig_dtype = base_layer.weight.data.dtype + base_layer.weight.data = torch.div(base_layer.weight.data, ia3_l).to(orig_dtype) + + if not self.is_feedforward and (base_layer.bias is not None): + scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) + orig_dtype = base_layer.bias.data.dtype + base_layer.bias.data = torch.div(base_layer.bias.data, scaling.data + 1e-8).to(orig_dtype) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + dtype = previous_dtype = x.dtype + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + ia3_scaling = 1 + for active_adapter in self.active_adapters: + if active_adapter not in self.ia3_l.keys(): + continue + dtype = self.ia3_l[active_adapter].dtype + ia3_scaling *= self.ia3_l[active_adapter].flatten() + + if self.is_feedforward: + x = x.to(dtype) + # TODO: weight.dtype can be != self.ia3_l[self.active_adapters].dtype + # e.g. bf16 vs fp32. Is that okay? 
+ interm = (x * ia3_scaling).to(previous_dtype) + result = self.base_layer(interm, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + result_dtype = result.dtype + result = (result * ia3_scaling).to(result_dtype) + + return result + + +class _ConvNd(nn.Module, IA3Layer): + def __init__( + self, + base_layer: nn.Module, + adapter_name: str, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + is_feedforward: bool = False, # Set to True if the layer is treated as a feedforward layer + init_ia3_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) + self.fan_in_fan_out = fan_in_fan_out + self._active_adapter = adapter_name + self._kernel_dim = base_layer.weight.dim() + + self.update_layer(adapter_name, init_ia3_weights) + + def update_layer(self, adapter_name, init_ia3_weights, inference_mode: bool = False, **kwargs): + # Actual trainable parameters + num_features = self.in_features if self.is_feedforward else self.out_features + weights_size = (1, num_features) + (1,) * (self._kernel_dim - 2) + weight = torch.randn(weights_size) + self.ia3_l[adapter_name] = nn.Parameter(weight) + if init_ia3_weights: + self.reset_ia3_parameters(adapter_name) + self._move_adapter_to_device_of_base_layer(adapter_name) + self.set_adapter(self.active_adapters, inference_mode=inference_mode) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. 
If None, all active adapters will be merged. Defaults + to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.ia3_l.keys(): + base_layer = self.get_base_layer() + orig_dtype = base_layer.weight.data.dtype + ia3_scaling = self.ia3_l[active_adapter].data + if not self.is_feedforward: + ia3_scaling = ia3_scaling.transpose(0, 1) + + if safe_merge: + output_weight = torch.mul(base_layer.weight.data, ia3_scaling).clone() + + if not torch.isfinite(output_weight).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = output_weight.to(orig_dtype) + else: + base_layer.weight.data = torch.mul(base_layer.weight.data, ia3_scaling).to(orig_dtype) + + if not self.is_feedforward and (base_layer.bias is not None): + scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) + base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data).to(orig_dtype) + + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + + warnings.warn("Unmerge result can be inaccurate for (IA)^3.") + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.ia3_l.keys(): + base_layer = self.get_base_layer() + orig_dtype = base_layer.weight.data.dtype + # divide by (IA)^3 vector. 
Add tolerace to avoid division by zero + ia3_scaling = self.ia3_l[active_adapter].data + if not self.is_feedforward: + ia3_scaling = ia3_scaling.transpose(0, 1) + base_layer.weight.data = torch.div(base_layer.weight.data, ia3_scaling + 1e-8).to(orig_dtype) + + if not self.is_feedforward and (base_layer.bias is not None): + scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) + orig_dtype = base_layer.bias.data.dtype + base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data).to(orig_dtype) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + dtype = previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + ia3_scaling = 1 + for active_adapter in self.active_adapters: + if active_adapter not in self.ia3_l.keys(): + continue + dtype = self.ia3_l[active_adapter].dtype + ia3_scaling *= self.ia3_l[active_adapter] + + if self.is_feedforward: + x = x.to(dtype) + # TODO: weight.dtype can be != self.ia3_l[self.active_adapters].dtype + # e.g. bf16 vs fp32. Is that okay? 
+ interm = (x * ia3_scaling).to(self.get_base_layer().weight.dtype) + result = self.base_layer(interm, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + result = result.to(dtype) * ia3_scaling + + result = result.to(previous_dtype) + return result + + +class Conv2d(_ConvNd): + # IA3 implemented in a 2D convolutional layer + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if not self._kernel_dim == 4: + raise ValueError(f"Conv2d layer kernel must have 4 dimensions, not {self._kernel_dim}") + + +class Conv3d(_ConvNd): + # IA3 implemented in a 3D convolutional layer + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if not self._kernel_dim == 5: + raise ValueError(f"Conv2d layer kernel must have 5 dimensions, not {self._kernel_dim}") diff --git a/peft/src/peft/tuners/ia3/model.py b/peft/src/peft/tuners/ia3/model.py new file mode 100644 index 0000000000000000000000000000000000000000..e322b167fc53f4dadc8ba068c36a6e65d3e22f37 --- /dev/null +++ b/peft/src/peft/tuners/ia3/model.py @@ -0,0 +1,315 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import re +import warnings +from dataclasses import replace + +import torch +from transformers.pytorch_utils import Conv1D + +from peft.import_utils import is_bnb_4bit_available, is_bnb_available +from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer +from peft.utils import ( + TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING, + ModulesToSaveWrapper, + _freeze_adapter, + _get_submodules, +) + +from .layer import Conv2d, Conv3d, IA3Layer, Linear + + +class IA3Model(BaseTuner): + """ + Creates a Infused Adapter by Inhibiting and Amplifying Inner Activations ((IA)^3) model from a pretrained + transformers model. The method is described in detail in https://huggingface.co/papers/2205.05638 + + Args: + model ([`~transformers.PreTrainedModel`]): The model to be adapted. + config ([`IA3Config`]): The configuration of the (IA)^3 model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the loading process. + + Returns: + `torch.nn.Module`: The (IA)^3 model. + + Example: + + ```py + >>> from transformers import AutoModelForSeq2SeqLM, ia3Config + >>> from peft import IA3Model, IA3Config + + >>> config = IA3Config( + ... peft_type="IA3", + ... task_type="SEQ_2_SEQ_LM", + ... target_modules=["k", "v", "w0"], + ... feedforward_modules=["w0"], + ... ) + + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + >>> ia3_model = IA3Model(config, model) + ``` + + **Attributes**: + - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted. + - **peft_config** ([`ia3Config`]): The configuration of the (IA)^3 model. 
+ """ + + prefix: str = "ia3_" + tuner_layer_cls = IA3Layer + + @staticmethod + def _create_new_module(ia3_config, adapter_name, target, **kwargs): + # avoid eager bnb import + if is_bnb_available(): + import bitsandbytes as bnb + + from .bnb import Linear8bitLt + + if is_bnb_4bit_available(): + from .bnb import Linear4bit + + loaded_in_8bit = kwargs.pop("loaded_in_8bit", False) + loaded_in_4bit = kwargs.pop("loaded_in_4bit", False) + is_feedforward = kwargs.pop("is_feedforward", False) + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): + eightbit_kwargs = kwargs.copy() + eightbit_kwargs.update( + { + "has_fp16_weights": target_base_layer.state.has_fp16_weights, + "threshold": target_base_layer.state.threshold, + "index": target_base_layer.index, + } + ) + new_module = Linear8bitLt(target, adapter_name, is_feedforward=is_feedforward, **eightbit_kwargs) + elif loaded_in_4bit and isinstance(target_base_layer, bnb.nn.Linear4bit): + fourbit_kwargs = kwargs.copy() + fourbit_kwargs.update( + { + "compute_dtype": target_base_layer.compute_dtype, + "compress_statistics": target_base_layer.weight.compress_statistics, + "quant_type": target_base_layer.weight.quant_type, + } + ) + new_module = Linear4bit(target, adapter_name, is_feedforward=is_feedforward, **fourbit_kwargs) + elif isinstance(target, torch.nn.Conv2d): + new_module = Conv2d(target, adapter_name, is_feedforward=is_feedforward, **kwargs) + elif isinstance(target, torch.nn.Conv3d): + new_module = Conv3d(target, adapter_name, is_feedforward=is_feedforward, **kwargs) + elif isinstance(target_base_layer, torch.nn.Linear): + if kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." 
+ ) + kwargs["fan_in_fan_out"] = ia3_config.fan_in_fan_out = False + new_module = Linear(target, adapter_name, is_feedforward=is_feedforward, **kwargs) + elif isinstance(target_base_layer, Conv1D): + if not kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. Setting fan_in_fan_out to True." + ) + kwargs["fan_in_fan_out"] = ia3_config.fan_in_fan_out = True + new_module = Linear( + target, adapter_name, is_feedforward=is_feedforward, is_target_conv_1d_layer=True, **kwargs + ) + else: + raise ValueError( + f"Target module {target} is not supported. " + f"Currently, only `torch.nn.Linear`, `torch.nn.Conv2d`, and `Conv1D` are supported." + ) + return new_module + + def _create_and_replace( + self, + ia3_config, + adapter_name, + target, + target_name, + parent, + current_key, + ): + # check if target module is in feedforward_modules + is_feedforward = self._check_target_module_feedforward(ia3_config, current_key) + + kwargs = { + "fan_in_fan_out": ia3_config.fan_in_fan_out, + "init_ia3_weights": ia3_config.init_ia3_weights, + "is_feedforward": is_feedforward, + "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False), + "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False), + } + + if isinstance(target, IA3Layer): + target.update_layer( + adapter_name, + ia3_config.init_ia3_weights, + ) + else: + new_module = self._create_new_module(ia3_config, adapter_name, target, **kwargs) + if adapter_name not in self.active_adapters: + # adding an additional adapter: it is not automatically trainable + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + + @staticmethod + def _check_target_module_feedforward(ia3_config, key) -> bool: + """ + A helper private method that checks if the target module `key` matches with a feedforward module specified in + `ia3_config` + """ + if isinstance(ia3_config.feedforward_modules, str): + is_feedforward = 
bool(re.fullmatch(ia3_config.feedforward_modules, key)) + else: + is_feedforward = any(key.endswith(target_key) for target_key in ia3_config.feedforward_modules) + return is_feedforward + + @staticmethod + def _prepare_adapter_config(peft_config, model_config): + if peft_config.target_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING: + raise ValueError("Please specify `target_modules` in `peft_config`") + peft_config.target_modules = set( + TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING[model_config["model_type"]] + ) + if peft_config.feedforward_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING: + raise ValueError("Please specify `feedforward_modules` in `peft_config`") + peft_config.feedforward_modules = set( + TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING[model_config["model_type"]] + ) + return peft_config + + def _unload_and_optionally_merge(self, *args, **kwargs): + r""" + This method merges the (IA)^3 layers into the base model. This is needed if someone wants to use the base model + as a standalone model. + + Args: + safe_merge (`bool`, `optional`, defaults to `False`): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. 
+ """ + if getattr(self.model, "is_loaded_in_8bit", False): + raise ValueError("Cannot merge ia3 layers when the model is loaded in 8-bit mode") + + if getattr(self.model, "is_loaded_in_4bit", False): + raise ValueError("Cannot merge ia3 layers when the model is loaded in 4-bit mode") + + return super()._unload_and_optionally_merge(*args, **kwargs) + + def _check_add_weighted_adapter(self, adapters: list[str]) -> tuple[str, str]: + """ + Helper function to check if the arguments to add_weighted_adapter are valid and compatible with the underlying + model. + """ + # Validate existence of adapters + for adapter in adapters: + if adapter not in self.peft_config: + raise ValueError(f"Adapter {adapter} does not exist") + + # Check for conflicting modules_to_save + modules_to_save_wrappers = [module for module in self.modules() if isinstance(module, ModulesToSaveWrapper)] + if any( + sum(adapter in wrapper.modules_to_save for adapter in adapters) > 1 for wrapper in modules_to_save_wrappers + ): + raise ValueError("Cannot add weighted adapters targeting the same module with modules_to_save.") + + # Ensure all adapters have compatible target and feedforward module types + target_module_types = {type(self.peft_config[adapter].target_modules) for adapter in adapters} + feedforward_module_types = {type(self.peft_config[adapter].feedforward_modules) for adapter in adapters} + if len(target_module_types) > 1 or len(feedforward_module_types) > 1: + raise ValueError("All adapter configs should have the same type for target and feedforward modules.") + + # Combine target and feedforward modules + if str in target_module_types: + new_target_modules = "|".join(f"({self.peft_config[adapter].target_modules})" for adapter in adapters) + else: + new_target_modules = set.union(*(self.peft_config[adapter].target_modules for adapter in adapters)) + + if str in feedforward_module_types: + new_feedforward_modules = "|".join( + f"({self.peft_config[adapter].feedforward_modules})" for adapter 
in adapters + ) + else: + new_feedforward_modules = set.union( + *(self.peft_config[adapter].feedforward_modules for adapter in adapters) + ) + + return new_target_modules, new_feedforward_modules + + def add_weighted_adapter( + self, + adapters: list[str], + weights: list[float], + adapter_name: str, + ) -> None: + """ + This method adds a new adapter by merging the given adapters with the given weights. + + Args: + adapters (`list`): + List of adapter names to be merged. + weights (`list`): + List of weights for each adapter. + adapter_name (`str`): + Name of the new adapter. + """ + if adapter_name in list(self.peft_config.keys()): + return + + new_target_modules, new_feedforward_modules = self._check_add_weighted_adapter( + adapters=adapters, + ) + + self.peft_config[adapter_name] = replace( + self.peft_config[adapters[0]], + target_modules=new_target_modules, + feedforward_modules=new_feedforward_modules, + ) + self.inject_adapter(self.model, adapter_name) + + # Do we really need that? + _freeze_adapter(self.model, adapter_name) + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, IA3Layer): + if adapter_name in target.ia3_l: + target_ia3_l = target.ia3_l[adapter_name] + else: + continue + + target_ia3_l.data = target_ia3_l.data.zero_() + for adapter, weight in zip(adapters, weights): + if adapter in target.ia3_l: + current_adapter_ia3_l = target.ia3_l[adapter] + else: + continue + target_ia3_l.data += current_adapter_ia3_l.data * weight diff --git a/peft/src/peft/tuners/ln_tuning/__init__.py b/peft/src/peft/tuners/ln_tuning/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8f90a8fb058cf75450b89166bf2e562f07b835fd --- /dev/null +++ b/peft/src/peft/tuners/ln_tuning/__init__.py @@ -0,0 +1,23 @@ +# Copyright 2024-present the HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from peft.utils import register_peft_method + +from .config import LNTuningConfig +from .model import LNTuningModel + + +__all__ = ["LNTuningConfig", "LNTuningModel"] + +register_peft_method(name="ln_tuning", config_cls=LNTuningConfig, model_cls=LNTuningModel) diff --git a/peft/src/peft/tuners/ln_tuning/config.py b/peft/src/peft/tuners/ln_tuning/config.py new file mode 100644 index 0000000000000000000000000000000000000000..127ee9017333bce22706b58b13701049244b1da9 --- /dev/null +++ b/peft/src/peft/tuners/ln_tuning/config.py @@ -0,0 +1,70 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Optional, Union + +from peft.config import PeftConfig +from peft.utils import PeftType + + +@dataclass +class LNTuningConfig(PeftConfig): + """ + This is the configuration class to store the configuration of a :class:`~peft.tuners.LNTuningModel`. + + Args: + target_modules (`Optional[Union[List[str], str]]`): + List of module names or regex expression of the module names to replace with LNTuning. For example, + '.*decoder.*' or '.*encoder.*'. If this is not specified, modules will be chosen according to the model + architecture. If the architecture is not known, an error will be raised -- in this case, you should specify + the target modules manually. + exclude_modules (`Optional[Union[List[str], str]]`): + The names of the modules to not apply the adapter. When passing a string, a regex match will be performed. + When passing a list of strings, either an exact match will be performed or it is checked if the name of the + module ends with any of the passed strings. + modules_to_save (`Optional[Union[List[str], str]]`): + List of modules to be set as trainable and saved in the final checkpoint. For example, in Sequence + Classification or Token Classification tasks, the final layer `classifier/score` are randomly initialized + and as such need to be trainable and saved. + """ + + target_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": ( + "List of module names or regex expression of the module names to replace with LNTuning." + "For example, '.*decoder.*' or '.*encoder.*'. " + "If not specified, modules will be chosen according to the model architecture, If the architecture is " + "not known, an error will be raised -- in this case, you shoud specify the target modules manually." 
+ ), + }, + ) + exclude_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={"help": "List of module names or regex expression of the module names to exclude from LNTuning."}, + ) + modules_to_save: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": "List of modules to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." + }, + ) + + def __post_init__(self): + super().__post_init__() + self.peft_type = PeftType.LN_TUNING diff --git a/peft/src/peft/tuners/ln_tuning/layer.py b/peft/src/peft/tuners/ln_tuning/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..e29149f2cbbc023161873a91772fd642296052e0 --- /dev/null +++ b/peft/src/peft/tuners/ln_tuning/layer.py @@ -0,0 +1,123 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from copy import deepcopy +from typing import Optional + +import torch +import torch.nn as nn + +from peft.tuners.tuners_utils import BaseTunerLayer, _get_in_out_features, check_adapters_to_merge + + +class LNTuningLayer(nn.Module, BaseTunerLayer): + """ + Selects a layer from the model. 
+ """ + + adapter_layer_names = ("ln_tuning_layers",) + + def __init__(self, base_layer: nn.Module, adapter_name: str): + super().__init__() + self.base_layer = base_layer + self.ln_tuning_layers = nn.ModuleDict({}) + self.update_layer(self.base_layer, adapter_name) + self._active_adapter = adapter_name + self.merged_adapters = [] + + in_features, out_features = _get_in_out_features(self.get_base_layer()) + self.in_features = in_features + self.out_features = out_features + + def update_layer(self, layer: nn.Module, adapter_name: str, inference_mode: bool = False, **kwargs): + self.ln_tuning_layers[adapter_name] = deepcopy(layer) + self.set_adapter(adapter_name, inference_mode=inference_mode) + + def enable_adapters(self, enabled: bool) -> None: + """Toggle the enabling and disabling of adapters + + Takes care of setting the requires_grad flag for the adapter weights. + + Args: + enabled (bool): True to enable adapters, False to disable adapters + """ + if enabled: + self.set_adapter(self.active_adapters) + self._disable_adapters = False + else: + if self.merged: + self.unmerge() + # disable grads on all adapter layers + for layer_name in self.adapter_layer_names: + layer = getattr(self, layer_name) + layer.requires_grad_(False) + self._disable_adapters = True + + def merge(self, adapter_names: Optional[list[str]] = None, safe_merge: bool = False): + # note that there is no actual merging, so whether safe_merge is True or False is irrelevant + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + if len(adapter_names) > 1: + raise ValueError( + f"Trying to merge {len(adapter_names)} adapters, but LN " + f"tuning does not allow merging more than one adapter at a time" + ) + merged_adapters = set(self.merged_adapters) + if merged_adapters: + warnings.warn(f"Already merged with {merged_adapters}. 
Unmerging first.") + self.unmerge() + + self.base_layer, self.ln_tuning_layers[adapter_names[0]] = ( + self.ln_tuning_layers[adapter_names[0]], + self.base_layer, + ) + self.merged_adapters.append(adapter_names[0]) + + def unmerge(self): + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + # popping one element is sufficient because LN + # tuning does not allow merging more than one adapter at a time. + merged_name = self.merged_adapters.pop() + self.base_layer, self.ln_tuning_layers[merged_name] = ( + self.ln_tuning_layers[merged_name], + self.base_layer, + ) + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + if len(self.active_adapters) != 1: + raise ValueError( + f"Trying to run forward with {len(self.active_adapters)} active " + f"adapters, but LN tuning does not allow inference with more than one adapter at a time" + ) + active_adapter = self.active_adapters[0] + result = self.ln_tuning_layers[active_adapter](x, *args, **kwargs) + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "ln_tuning." + rep diff --git a/peft/src/peft/tuners/ln_tuning/model.py b/peft/src/peft/tuners/ln_tuning/model.py new file mode 100644 index 0000000000000000000000000000000000000000..e193d22da8e08216e0a3800b27dccc4d60298855 --- /dev/null +++ b/peft/src/peft/tuners/ln_tuning/model.py @@ -0,0 +1,132 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
class LNTuningModel(BaseTuner):
    """
    Creates LayerNorm tuning from a pretrained transformer model.

    The method is described in detail in https://huggingface.co/papers/2312.11420.

    Args:
        model ([`torch.nn.Module`]): The model to be adapted.
        config ([`LNTuningConfig`]): The configuration of the LN Tuning model.
        adapter_name (`str`): The name of the adapter, defaults to `"default"`.
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            This option has no effect on LN tuning but exists for consistency with other PEFT methods.

    Returns:
        `torch.nn.Module`: The adapted model with LayerNorm tuned on.

    Example:

        ```py
        >>> from transformers import AutoModelForCausalLM
        >>> from peft import get_peft_model, TaskType, LNTuningConfig

        >>> peft_config = LNTuningConfig(
        ...     task_type=TaskType.CAUSAL_LM,
        ... )

        >>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
        >>> model = get_peft_model(model, peft_config)
        >>> model.print_trainable_parameters()
        ```

    **Attributes**:
        - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted.
        - **peft_config** ([`LNTuningConfig`]): The configuration of the LN Tuning model.
    """

    prefix: str = "ln_tuning_"
    tuner_layer_cls = LNTuningLayer
    target_module_mapping = TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING

    def _create_and_replace(
        self,
        peft_config: PeftConfig,
        adapter_name: str,
        target: Module,
        target_name: str,
        parent: Module,
        current_key: str,
    ) -> None:
        """Wrap `target` in an LN tuning layer and install the wrapper on `parent` under `target_name`."""
        # replace the original module with a same new module
        wrapped = self._create_new_module(peft_config, target, adapter_name)
        # Only the currently active adapter is left trainable.
        if adapter_name != self.active_adapter:
            wrapped.requires_grad_(False)
        self._replace_module(parent, target_name, wrapped, target)

    def _create_new_module(
        self,
        peft_config: PeftConfig,
        target: Module,
        adapter_name: str,
    ) -> Module:
        """Return an `LNTuningLayer` for `target`, reusing `target` itself when it is already one."""
        if isinstance(target, LNTuningLayer):
            # Already wrapped: register the additional adapter on the existing wrapper.
            target.update_layer(target.base_layer, adapter_name)
            return target
        return LNTuningLayer(target, adapter_name)

    def _unloading_checks(self, adapter_names: Optional[list[str]]):
        """Raise `ValueError` if more than one of the considered adapters is unloaded while any of them sets
        `modules_to_save`.

        When `adapter_names` is empty or `None`, the active adapters are considered instead.
        """
        candidates = adapter_names or self.active_adapters
        any_modules_to_save = any(self.peft_config[name].modules_to_save for name in candidates)
        if any_modules_to_save and len(candidates) > 1:
            raise ValueError("Cannot unload multiple adapters that specify `modules_to_save`.")

    def _unload_and_optionally_merge(
        self,
        merge=True,
        progressbar: bool = False,
        safe_merge: bool = False,
        adapter_names: Optional[list[str]] = None,
    ):
        """Replace every wrapped module by its base layer, merging the adapters first when `merge` is truthy.

        Args:
            merge: Whether to call `merge(adapter_names)` on each wrapped module before unwrapping it.
            progressbar: Whether to display a tqdm progress bar.
            safe_merge: Accepted for signature compatibility; not used by this method.
            adapter_names: Adapters to merge; also forwarded to the unloading checks.

        Returns:
            The underlying model (`self.model`) after unwrapping.
        """
        self._unloading_checks(adapter_names)

        merging_part = "and merging " if merge else ""
        desc = f"Unloading adapters {merging_part}model"
        # Skip modules that live inside an adapter layer (their name contains the prefix).
        module_keys = [name for name, _ in self.model.named_modules() if self.prefix not in name]

        for key in tqdm(module_keys, disable=not progressbar, desc=desc):
            try:
                parent, target, target_name = _get_submodules(self.model, key)
            except AttributeError:
                # The module can no longer be resolved under this key; move on.
                continue

            if not hasattr(target, "base_layer"):
                continue
            if merge:
                target.merge(adapter_names)
            self._replace_module(parent, target_name, target.get_base_layer(), target)

        return self.model

    def _cast_adapter_dtype(self, adapter_name: str, autocast_adapter_dtype: bool = True) -> None:
        """Intentionally a no-op for LN Tuning."""
        # Note: LN Tuning does not add adapter layers, instead it creates copies of the original layer. For this
        # reason, we need to skip adapter autocasting, otherwise we would change the dtype of copies of the original
        # layer, resulting in dtype errors down the line.
        return None
@dataclass
class LoHaConfig(LycorisConfig):
    """
    This is the configuration class to store the configuration of a [`LoHaModel`].

    Args:
        r (`int`):
            LoHa rank.
        alpha (`int`):
            The alpha parameter for LoHa scaling.
        rank_dropout (`float`):
            The dropout probability for rank dimension during training.
        module_dropout (`float`):
            The dropout probability for disabling LoHa modules during training.
        use_effective_conv2d (`bool`):
            Use parameter effective decomposition for Conv2d (and Conv1d) with ksize > 1 ("Proposition 3" from FedPara
            paper).
        target_modules (`Optional[Union[List[str], str]]`):
            The names of the modules to apply the adapter to. If this is specified, only the modules with the specified
            names will be replaced. When passing a string, a regex match will be performed. When passing a list of
            strings, either an exact match will be performed or it is checked if the name of the module ends with any
            of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen,
            excluding the output layer. If this is not specified, modules will be chosen according to the model
            architecture. If the architecture is not known, an error will be raised -- in this case, you should specify
            the target modules manually.
        exclude_modules (`Optional[Union[List[str], str]]`):
            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
            module ends with any of the passed strings.
        init_weights (`bool`):
            Whether to perform initialization of adapter weights. This defaults to `True`, passing `False` is
            discouraged.
        layers_to_transform (`Union[List[int], int]`):
            The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices
            that are specified in this list. If a single integer is passed, it will apply the transformations on the
            layer at this index (index `0` is a valid value).
        layers_pattern (`Optional[Union[List[str], str]]`):
            The layer pattern name, used only if `layers_to_transform` is different from `None`. This should target the
            `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`.
        rank_pattern (`dict`):
            The mapping from layer names or regexp expression to ranks which are different from the default rank
            specified by `r`. For example, `{'^model.decoder.layers.0.encoder_attn.k_proj': 16}`.
        alpha_pattern (`dict`):
            The mapping from layer names or regexp expression to alphas which are different from the default alpha
            specified by `alpha`. For example, `{'^model.decoder.layers.0.encoder_attn.k_proj': 16}`.
        modules_to_save (`Optional[List[str]]`):
            List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint.

    Raises:
        ValueError: If `layers_pattern` is given without `layers_to_transform`.
    """

    r: int = field(default=8, metadata={"help": "LoHa rank"})
    alpha: int = field(default=8, metadata={"help": "LoHa alpha"})
    rank_dropout: float = field(
        default=0.0, metadata={"help": "The dropout probability for rank dimension during training"}
    )
    module_dropout: float = field(
        default=0.0, metadata={"help": "The dropout probability for disabling LoHa modules during training"}
    )
    use_effective_conv2d: bool = field(
        default=False,
        metadata={
            "help": (
                "Use parameter effective decomposition for Conv2d (and Conv1d) with ksize > 1 "
                '("Proposition 3" from FedPara paper)'
            )
        },
    )
    target_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            # The first fragment ends with a space so the sentences do not run together.
            "help": "List of module names or regex expression of the module names to replace with LoHa. "
            "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
            "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer."
        },
    )
    exclude_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={"help": "List of module names or regex expression of the module names to exclude from LoHa."},
    )
    init_weights: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether to initialize the weights of the LoHa layers with their default initialization. Don't change "
                "this setting, except if you know exactly what you're doing."
            ),
        },
    )
    layers_to_transform: Optional[Union[list[int], int]] = field(
        default=None,
        metadata={
            "help": "The layer indexes to transform, if this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index."
        },
    )
    layers_pattern: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern. "
            "This should target the `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`."
        },
    )
    modules_to_save: Optional[list[str]] = field(
        default=None,
        metadata={
            "help": "List of modules apart from LoHA layers to be set as trainable and saved in the final checkpoint. "
            "For example, in Sequence Classification or Token Classification tasks, "
            "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
        },
    )

    def __post_init__(self):
        """Set the PEFT type, normalise list-valued module selections to sets and validate the layer options."""
        super().__post_init__()
        self.peft_type = PeftType.LOHA
        self.target_modules = (
            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
        )
        self.exclude_modules = (
            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
        )
        # check for layers_to_transform and layers_pattern
        # A single integer index (including 0) counts as "specified"; only `None` or an empty
        # collection means that no layers were selected. A plain truthiness test would wrongly
        # reject `layers_to_transform=0`.
        layers = self.layers_to_transform
        layers_specified = isinstance(layers, int) or bool(layers)
        if self.layers_pattern and not layers_specified:
            raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ")
class LoHaLayer(nn.Module, LycorisLayer):
    """Base class of the LoHa adapter layers.

    Holds, per adapter name, the Hadamard factors `hada_w1_a`, `hada_w1_b`, `hada_w2_a`, `hada_w2_b` and, for the
    "effective conv" decomposition, the core tensors `hada_t1` and `hada_t2`. Subclasses provide
    `_get_delta_activations` for their layer type.
    """

    # All names of layers that may contain adapter weights
    adapter_layer_names = ("hada_w1_a", "hada_w1_b", "hada_w2_a", "hada_w2_b", "hada_t1", "hada_t2")
    # other_param_names is defined on parent class

    def __init__(self, base_layer: nn.Module):
        super().__init__()
        LycorisLayer.__init__(self, base_layer)

        # LoHa info
        self.hada_w1_a = nn.ParameterDict({})
        self.hada_w1_b = nn.ParameterDict({})
        self.hada_w2_a = nn.ParameterDict({})
        self.hada_w2_b = nn.ParameterDict({})
        self.hada_t1 = nn.ParameterDict({})
        self.hada_t2 = nn.ParameterDict({})

    @property
    def _available_adapters(self) -> set[str]:
        """Names of all adapters that own at least one LoHa parameter on this layer."""
        return {*self.hada_w1_a, *self.hada_w1_b, *self.hada_w2_a, *self.hada_w2_b, *self.hada_t1, *self.hada_t2}

    def create_adapter_parameters(self, adapter_name: str, r: int, shape: tuple[int, ...]):
        """Allocate (uninitialised) LoHa parameters for `adapter_name`.

        Args:
            adapter_name: Name of the adapter the parameters belong to.
            r: Rank of the decomposition.
            shape: Target weight shape. Four entries select the Conv2d layout, three entries the Conv1d layout
                (both with core tensors `hada_t1`/`hada_t2`), anything else the flat two-factor layout.
        """
        # https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L130C9-L143C75
        if len(shape) == 4:  # Conv2d
            self.hada_t1[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3]))
            self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0]))  # out_dim, 1-mode
            self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1]))  # in_dim , 2-mode

            self.hada_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3]))
            self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0]))  # out_dim, 1-mode
            self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1]))  # in_dim , 2-mode
        elif len(shape) == 3:  # Conv1d
            # The core tensor gets a trailing singleton axis so it has the same rank as in the Conv2d case.
            self.hada_t1[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], 1))
            self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0]))  # out_dim, 1-mode
            self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1]))  # in_dim , 2-mode

            self.hada_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], 1))
            self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0]))  # out_dim, 1-mode
            self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1]))  # in_dim , 2-mode
        else:  # Linear
            self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(shape[0], r))
            self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1]))

            self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0], r))
            self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1]))

    def _init_adapter_parameters(self, adapter_name: str, zero_w2_b: bool) -> None:
        """Kaiming-initialise the parameters of `adapter_name`; `hada_w2_b` is zeroed instead when `zero_w2_b`."""
        # Original implementation performs initialization with normal distribution
        # https://github.com/KohakuBlueleaf/LyCORIS/blob/3549fdef8f564761d68b695a08ef88b1122fdedc/lycoris/modules/loha.py#L158

        # FedPara paper proposes to perform He initialization, let's stick with it
        # It is enough to initialize only single matrix with zeros to make adapter do nothing after initialization
        if adapter_name in self.hada_w1_a:
            nn.init.kaiming_uniform_(self.hada_w1_a[adapter_name], a=math.sqrt(5))
            nn.init.kaiming_uniform_(self.hada_w1_b[adapter_name], a=math.sqrt(5))
            nn.init.kaiming_uniform_(self.hada_w2_a[adapter_name], a=math.sqrt(5))
            if zero_w2_b:
                nn.init.zeros_(self.hada_w2_b[adapter_name])
            else:
                nn.init.kaiming_uniform_(self.hada_w2_b[adapter_name], a=math.sqrt(5))
        if adapter_name in self.hada_t1:
            nn.init.kaiming_uniform_(self.hada_t1[adapter_name], a=math.sqrt(5))
            nn.init.kaiming_uniform_(self.hada_t2[adapter_name], a=math.sqrt(5))

    def reset_adapter_parameters(self, adapter_name: str):
        """Initialise `adapter_name` with `hada_w2_b` set to zero (see the note in `_init_adapter_parameters`)."""
        self._init_adapter_parameters(adapter_name, zero_w2_b=True)

    def reset_adapter_parameters_random(self, adapter_name: str):
        """Initialise every parameter of `adapter_name` randomly, including `hada_w2_b`."""
        self._init_adapter_parameters(adapter_name, zero_w2_b=False)

    def update_layer(
        self,
        adapter_name: str,
        r: int,
        alpha: float,
        rank_dropout: float,
        module_dropout: float,
        init_weights: bool,
        use_effective_conv2d: bool = False,
        inference_mode: bool = False,
        **kwargs,
    ) -> None:
        """Internal function to create loha adapter

        Args:
            adapter_name (`str`): Name for the adapter to add.
            r (`int`): Rank for the added adapter.
            alpha (`float`): Alpha for the added adapter.
            rank_dropout (`float`): The dropout probability for rank dimension during training.
            module_dropout (`float`): The dropout probability for disabling adapter during training.
            init_weights (`bool`): Whether to initialize weights.
            use_effective_conv2d (`bool`, *optional*, defaults to `False`):
                Use parameter effective decomposition for Conv2d with ksize > 1.
            inference_mode (`bool`, *optional*, defaults to `False`):
                Forwarded to `set_adapter` once the adapter has been created.

        Raises:
            ValueError: If `r` is not positive.
            TypeError: If the base layer is not an `nn.Linear`, `nn.Conv2d` or `nn.Conv1d`.
        """
        if r <= 0:
            raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")

        self.r[adapter_name] = r
        self.alpha[adapter_name] = alpha
        self.scaling[adapter_name] = alpha / r
        self.rank_dropout[adapter_name] = rank_dropout
        self.module_dropout[adapter_name] = module_dropout

        # Determine shape of LoHa weights
        base_layer = self.get_base_layer()
        if isinstance(base_layer, nn.Linear):
            shape = tuple(base_layer.weight.shape)
        elif isinstance(base_layer, nn.Conv2d):
            # For 1x1 convolutions, disable effective_conv2d to avoid unnecessary tensor reshaping overhead.
            # Since 1x1 convolutions are essentially pointwise operations (matrix multiplications),
            # they can be more efficiently handled with the flattened weight representation,
            # similar to how Linear layers work. This optimization reduces computational cost
            # without affecting the mathematical equivalence of the operation.
            use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size != (1, 1)
            if use_effective_conv2d:
                shape = (base_layer.out_channels, base_layer.in_channels, *base_layer.kernel_size)
            else:
                shape = (
                    base_layer.out_channels,
                    base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1],
                )
        elif isinstance(base_layer, nn.Conv1d):
            # For Conv1d with kernel_size=1, disable effective_conv2d for the same optimization reasons
            # as 1x1 Conv2d. Kernel size 1 means no spatial/temporal context, making it equivalent
            # to a Linear layer applied across the channel dimension. Using flattened representation
            # avoids unnecessary reshaping and improves computational efficiency.
            use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size[0] != 1
            if use_effective_conv2d:
                shape = (base_layer.out_channels, base_layer.in_channels, base_layer.kernel_size[0])
            else:
                shape = (
                    base_layer.out_channels,
                    base_layer.in_channels * base_layer.kernel_size[0],
                )
        else:
            raise TypeError(f"LoHa is not implemented for base layers of type {type(base_layer).__name__}")

        # Create weights with provided shape
        self.create_adapter_parameters(adapter_name, r, shape)

        # Initialize weights
        if init_weights:
            self.reset_adapter_parameters(adapter_name)
        else:
            self.reset_adapter_parameters_random(adapter_name)

        # Move new weights to device
        self._move_adapter_to_device_of_base_layer(adapter_name)
        self.set_adapter(self.active_adapters, inference_mode=inference_mode)

    def get_delta_weight(self, adapter_name: str) -> torch.Tensor:
        """Build the weight update of `adapter_name`, reshaped to the shape of the base layer's weight.

        During training with a non-zero rank dropout, whole rows (first axis) of the update are randomly zeroed and
        the surviving rows are rescaled by the inverse of the kept fraction.
        """
        # https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L178
        if adapter_name in self.hada_t1:
            weight = make_weight_cp(
                self.hada_t1[adapter_name],
                self.hada_w1_a[adapter_name],
                self.hada_w1_b[adapter_name],
                self.hada_t2[adapter_name],
                self.hada_w2_a[adapter_name],
                self.hada_w2_b[adapter_name],
                scale=torch.tensor(self.scaling[adapter_name]),
            )
        else:
            weight = make_weight(
                self.hada_w1_a[adapter_name],
                self.hada_w1_b[adapter_name],
                self.hada_w2_a[adapter_name],
                self.hada_w2_b[adapter_name],
                scale=torch.tensor(self.scaling[adapter_name]),
            )

        base_layer = self.get_base_layer()

        # Reshape to match base layer shape
        weight = weight.reshape(base_layer.weight.shape)

        # Perform rank dropout during training - drop rows of addition weights
        rank_dropout = self.rank_dropout[adapter_name]
        if self.training and rank_dropout:
            drop = (torch.rand(weight.size(0)) > rank_dropout).to(weight.dtype)
            drop = drop.view(-1, *[1] * len(weight.shape[1:])).to(weight.device)
            # TODO: Investigate if there should be a scaler like in normal dropout during training
            # Original implementation doesn't have it
            # https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L193
            kept_fraction = drop.mean()
            # If every row happened to be dropped, the mean is 0 and dividing would turn the whole
            # delta weight into NaN; in that case keep the all-zero mask as it is.
            drop = torch.where(kept_fraction > 0, drop / kept_fraction, drop)
            weight *= drop

        return weight

    def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        """Run the base layer and, unless adapters are disabled or merged, add the active adapters' outputs.

        The result is cast back to the dtype of `x`.
        """
        previous_dtype = x.dtype

        if self.disable_adapters:
            if self.merged:
                self.unmerge()
            result = self.base_layer(x, *args, **kwargs)
        elif self.merged:
            # Adapter weights are already part of the base layer's weight.
            result = self.base_layer(x, *args, **kwargs)
        else:
            result = self.base_layer(x, *args, **kwargs)

            # Execute all the adapters
            for active_adapter in self.active_adapters:
                if active_adapter not in self._available_adapters:
                    continue

                module_dropout = self.module_dropout[active_adapter]

                # Modify current execution weights
                # Module dropout is only sampled in training mode; in eval mode the adapter always runs.
                if (not self.training) or torch.rand(1) > module_dropout:
                    result = result + self._get_delta_activations(active_adapter, x, *args, **kwargs)

        result = result.to(previous_dtype)
        return result


class Linear(LoHaLayer):
    """LoHa implemented in Linear layer"""

    def __init__(
        self,
        base_layer: nn.Module,
        adapter_name: str = "default",
        r: int = 0,
        alpha: float = 0.0,
        rank_dropout: float = 0.0,
        module_dropout: float = 0.0,
        init_weights: bool = True,
        **kwargs,
    ):
        super().__init__(base_layer)

        # Create adapter and set it active
        self._active_adapter = adapter_name
        self.update_layer(adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, **kwargs)

    def _get_delta_activations(
        self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any
    ) -> torch.Tensor:
        """Apply the adapter's delta weight to `input` as a bias-free linear map."""
        delta_weight = self.get_delta_weight(adapter_name)
        input = self._cast_input_dtype(input, delta_weight.dtype)
        # don't add bias here, because the bias is already included in the output of the base_layer
        return F.linear(input, delta_weight)

    def __repr__(self) -> str:
        rep = super().__repr__()
        return "loha." + rep
class Conv2d(LoHaLayer):
    """LoHa implemented in Conv2d layer"""

    def __init__(
        self,
        base_layer: nn.Module,
        adapter_name: str = "default",
        r: int = 0,
        alpha: float = 0.0,
        rank_dropout: float = 0.0,
        module_dropout: float = 0.0,
        use_effective_conv2d: bool = False,
        init_weights: bool = True,
        **kwargs,
    ):
        super().__init__(base_layer)

        # Mark the adapter as active before creating it.
        self._active_adapter = adapter_name
        self.update_layer(
            adapter_name,
            r,
            alpha,
            rank_dropout,
            module_dropout,
            init_weights,
            use_effective_conv2d,
            **kwargs,
        )

    def _get_delta_activations(
        self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any
    ) -> torch.Tensor:
        """Convolve `input` with the adapter's delta weight, reusing the base layer's convolution geometry."""
        kernel = self.get_delta_weight(adapter_name)
        input = self._cast_input_dtype(input, kernel.dtype)
        conv = self.get_base_layer()
        # No bias term: the base layer's own output already contains the bias.
        return F.conv2d(
            input,
            kernel,
            stride=conv.stride,
            padding=conv.padding,
            dilation=conv.dilation,
            groups=conv.groups,
        )

    def __repr__(self) -> str:
        return "loha." + super().__repr__()


class Conv1d(LoHaLayer):
    """LoHa implemented in Conv1d layer"""

    def __init__(
        self,
        base_layer: nn.Module,
        adapter_name: str = "default",
        r: int = 0,
        alpha: float = 0.0,
        rank_dropout: float = 0.0,
        module_dropout: float = 0.0,
        use_effective_conv2d: bool = False,
        init_weights: bool = True,
        **kwargs,
    ):
        super().__init__(base_layer)

        # Mark the adapter as active before creating it.
        self._active_adapter = adapter_name
        self.update_layer(
            adapter_name,
            r,
            alpha,
            rank_dropout,
            module_dropout,
            init_weights,
            use_effective_conv2d,
            **kwargs,
        )

    def _get_delta_activations(
        self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any
    ) -> torch.Tensor:
        """Convolve `input` with the adapter's delta weight, reusing the base layer's convolution geometry."""
        kernel = self.get_delta_weight(adapter_name)
        input = self._cast_input_dtype(input, kernel.dtype)
        conv = self.get_base_layer()
        # No bias term: the base layer's own output already contains the bias.
        return F.conv1d(
            input,
            kernel,
            stride=conv.stride,
            padding=conv.padding,
            dilation=conv.dilation,
            groups=conv.groups,
        )

    def __repr__(self) -> str:
        return "loha." + super().__repr__()
# Below code is adapted from
# https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L9
# NOTE: `HadaWeightCP.backward` deviates from that source: the gradients of `w1a` / `w2a` are computed from the
# partially contracted core of their own branch (the referenced code used the other branch's core).


class HadaWeight(torch.autograd.Function):
    """Hadamard product of two low-rank products, `((w1a @ w1b) * (w2a @ w2b)) * scale`, with a manual backward."""

    @staticmethod
    def forward(ctx, w1a, w1b, w2a, w2b, scale=torch.tensor(1)):
        ctx.save_for_backward(w1a, w1b, w2a, w2b, scale)
        diff_weight = ((w1a @ w1b) * (w2a @ w2b)) * scale
        return diff_weight

    @staticmethod
    def backward(ctx, grad_out):
        """Return gradients for `w1a`, `w1b`, `w2a`, `w2b`; `scale` receives no gradient (`None`)."""
        (w1a, w1b, w2a, w2b, scale) = ctx.saved_tensors
        grad_out = grad_out * scale

        # Gradient w.r.t. the first product is grad_out times the second product ...
        temp = grad_out * (w2a @ w2b)
        grad_w1a = temp @ w1b.T
        grad_w1b = w1a.T @ temp

        # ... and vice versa.
        temp = grad_out * (w1a @ w1b)
        grad_w2a = temp @ w2b.T
        grad_w2b = w2a.T @ temp

        del temp
        return grad_w1a, grad_w1b, grad_w2a, grad_w2b, None


class HadaWeightCP(torch.autograd.Function):
    """Hadamard product of two core-tensor decompositions with a manual backward.

    Each branch is rebuilt as `rebuild[p, r, k, l] = sum_ij t[i, j, k, l] * wb[j, r] * wa[i, p]`; the result is
    `rebuild1 * rebuild2 * scale`.
    """

    @staticmethod
    def forward(ctx, t1, w1a, w1b, t2, w2a, w2b, scale=torch.tensor(1)):
        ctx.save_for_backward(t1, w1a, w1b, t2, w2a, w2b, scale)

        rebuild1 = torch.einsum("i j k l, j r, i p -> p r k l", t1, w1b, w1a)
        rebuild2 = torch.einsum("i j k l, j r, i p -> p r k l", t2, w2b, w2a)

        return rebuild1 * rebuild2 * scale

    @staticmethod
    def backward(ctx, grad_out):
        """Return gradients for `t1`, `w1a`, `w1b`, `t2`, `w2a`, `w2b`; `scale` receives no gradient (`None`)."""
        (t1, w1a, w1b, t2, w2a, w2b, scale) = ctx.saved_tensors
        grad_out = grad_out * scale

        # Partially contracted cores: temp_n[i, r, k, l] = sum_j t_n[i, j, k, l] * w_nb[j, r]
        temp1 = torch.einsum("i j k l, j r -> i r k l", t1, w1b)
        temp2 = torch.einsum("i j k l, j r -> i r k l", t2, w2b)

        # ---- branch 1: d(loss)/d(rebuild1) = grad_out * rebuild2 ----
        grad_w = torch.einsum("i j k l, i r -> r j k l", temp2, w2a) * grad_out

        # rebuild1 = temp1 contracted with w1a, so the gradient of w1a needs temp1 (its own branch).
        grad_w1a = torch.einsum("r j k l, i j k l -> r i", temp1, grad_w)
        grad_temp = torch.einsum("i j k l, i r -> r j k l", grad_w, w1a.T)
        del grad_w

        grad_w1b = torch.einsum("i r k l, i j k l -> r j", t1, grad_temp)
        grad_t1 = torch.einsum("i j k l, j r -> i r k l", grad_temp, w1b.T)
        del grad_temp

        # ---- branch 2: d(loss)/d(rebuild2) = grad_out * rebuild1 ----
        grad_w = torch.einsum("i j k l, i r -> r j k l", temp1, w1a) * grad_out

        # Likewise the gradient of w2a needs temp2.
        grad_w2a = torch.einsum("r j k l, i j k l -> r i", temp2, grad_w)
        grad_temp = torch.einsum("i j k l, i r -> r j k l", grad_w, w2a.T)
        del grad_w, temp1, temp2

        grad_w2b = torch.einsum("i r k l, i j k l -> r j", t2, grad_temp)
        grad_t2 = torch.einsum("i j k l, j r -> i r k l", grad_temp, w2b.T)
        del grad_temp
        return grad_t1, grad_w1a, grad_w1b, grad_t2, grad_w2a, grad_w2b, None


def make_weight(w1a, w1b, w2a, w2b, scale):
    """Build the LoHa weight update from the four flat factors via `HadaWeight`."""
    return HadaWeight.apply(w1a, w1b, w2a, w2b, scale)


def make_weight_cp(t1, w1a, w1b, t2, w2a, w2b, scale):
    """Build the LoHa weight update from core tensors and factors via `HadaWeightCP`."""
    return HadaWeightCP.apply(t1, w1a, w1b, t2, w2a, w2b, scale)
class LoHaModel(LycorisTuner):
    """
    Creates Low-Rank Hadamard Product model from a pretrained model. The method is partially described in
    https://huggingface.co/papers/2108.06098 Current implementation heavily borrows from
    https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py

    Args:
        model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached.
        config ([`LoHaConfig`]): The configuration of the LoHa model.
        adapter_name (`str`): The name of the adapter, defaults to `"default"`.
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            Create empty adapter weights on meta device. Useful to speed up the loading process.

    Returns:
        `torch.nn.Module`: The LoHa model.

    Example:
        ```py
        >>> from diffusers import StableDiffusionPipeline
        >>> from peft import LoHaModel, LoHaConfig

        >>> config_te = LoHaConfig(
        ...     r=8,
        ...     alpha=32,
        ...     target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"],
        ...     rank_dropout=0.0,
        ...     module_dropout=0.0,
        ...     init_weights=True,
        ... )
        >>> config_unet = LoHaConfig(
        ...     r=8,
        ...     alpha=32,
        ...     target_modules=[
        ...         "proj_in",
        ...         "proj_out",
        ...         "to_k",
        ...         "to_q",
        ...         "to_v",
        ...         "to_out.0",
        ...         "ff.net.0.proj",
        ...         "ff.net.2",
        ...     ],
        ...     rank_dropout=0.0,
        ...     module_dropout=0.0,
        ...     init_weights=True,
        ...     use_effective_conv2d=True,
        ... )

        >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        >>> model.text_encoder = LoHaModel(model.text_encoder, config_te, "default")
        >>> model.unet = LoHaModel(model.unet, config_unet, "default")
        ```

    **Attributes**:
        - **model** ([`~torch.nn.Module`]) -- The model to be adapted.
        - **peft_config** ([`LoHaConfig`]): The configuration of the LoHa model.
    """

    prefix: str = "hada_"
    tuner_layer_cls = LoHaLayer
    target_module_mapping = TRANSFORMERS_MODELS_TO_LOHA_TARGET_MODULES_MAPPING
    # Base layer type -> LoHa layer class that wraps it.
    layers_mapping: dict[type[torch.nn.Module], type[LoHaLayer]] = {
        torch.nn.Conv2d: Conv2d,
        torch.nn.Conv1d: Conv1d,
        torch.nn.Linear: Linear,
    }

    def _create_and_replace(
        self,
        config: LycorisConfig,
        adapter_name: str,
        target: Union[LoHaLayer, nn.Module],
        target_name: str,
        parent: nn.Module,
        current_key: str,
    ) -> None:
        """
        A private method to create and replace the target module with the adapter module.

        The rank and alpha passed on are the per-module overrides from `config.rank_pattern` /
        `config.alpha_pattern` for `current_key` when one matches, otherwise `config.r` / `config.alpha`.
        """
        rank_key = get_pattern_key(config.rank_pattern.keys(), current_key)
        scale_key = get_pattern_key(config.alpha_pattern.keys(), current_key)

        layer_kwargs = config.to_dict()
        layer_kwargs.update(
            r=config.rank_pattern.get(rank_key, config.r),
            alpha=config.alpha_pattern.get(scale_key, config.alpha),
        )

        if isinstance(target, LoHaLayer):
            # Already a LoHa layer: just register the additional adapter on it.
            target.update_layer(adapter_name, **layer_kwargs)
            return

        replacement = self._create_new_module(config, adapter_name, target, **layer_kwargs)
        self._replace_module(parent, target_name, replacement, target)
@dataclass
class LoKrConfig(LycorisConfig):
    """
    Configuration class of [`LoKrModel`].

    Args:
        r (`int`):
            LoKr rank.
        alpha (`int`):
            The alpha parameter for LoKr scaling.
        rank_dropout (`float`):
            The dropout probability for rank dimension during training.
        module_dropout (`float`):
            The dropout probability for disabling LoKr modules during training.
        use_effective_conv2d (`bool`):
            Use parameter effective decomposition for Conv2d (and Conv1d) with ksize > 1 ("Proposition 3" from FedPara
            paper).
        decompose_both (`bool`):
            Perform rank decomposition of left kronecker product matrix.
        decompose_factor (`int`):
            Kronecker product decomposition factor.
        rank_dropout_scale (`bool`):
            Whether to scale the rank dropout while training, defaults to `False`.
        target_modules (`Optional[Union[List[str], str]]`):
            The names of the modules to apply the adapter to. If this is specified, only the modules with the specified
            names will be replaced. When passing a string, a regex match will be performed. When passing a list of
            strings, either an exact match will be performed or it is checked if the name of the module ends with any
            of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen,
            excluding the output layer. If this is not specified, modules will be chosen according to the model
            architecture. If the architecture is not known, an error will be raised -- in this case, you should specify
            the target modules manually.
        exclude_modules (`Optional[Union[List[str], str]]`):
            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
            module ends with any of the passed strings.
        init_weights (`Union[bool, Literal["lycoris"]]`):
            Whether to perform initialization of adapter weights. This defaults to `True`. Use "lycoris" to initialize
            weights in the style of the LYCORIS repository. Passing `False` is discouraged.
        layers_to_transform (`Union[List[int], int]`):
            The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices
            that are specified in this list. If a single integer is passed, it will apply the transformations on the
            layer at this index.
        layers_pattern (`Optional[Union[List[str], str]]`):
            The layer pattern name, used only if `layers_to_transform` is different from `None`. This should target the
            `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`.
        rank_pattern (`dict`):
            The mapping from layer names or regexp expression to ranks which are different from the default rank
            specified by `r`. For example, `{'^model.decoder.layers.0.encoder_attn.k_proj': 16}`.
        alpha_pattern (`dict`):
            The mapping from layer names or regexp expression to alphas which are different from the default alpha
            specified by `alpha`. For example, `{'^model.decoder.layers.0.encoder_attn.k_proj': 16}`.
        modules_to_save (`Optional[List[str]]`):
            List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint.

    Raises:
        ValueError: If `layers_pattern` is given without `layers_to_transform`.
    """

    r: int = field(default=8, metadata={"help": "LoKr rank"})
    alpha: int = field(default=8, metadata={"help": "LoKr alpha"})
    rank_dropout: float = field(
        default=0.0, metadata={"help": "The dropout probability for rank dimension during training"}
    )
    module_dropout: float = field(
        default=0.0, metadata={"help": "The dropout probability for disabling LoKr modules during training"}
    )
    use_effective_conv2d: bool = field(
        default=False,
        metadata={
            "help": (
                "Use parameter effective decomposition for Conv2d (and Conv1d) with ksize > 1 "
                '("Proposition 3" from FedPara paper)'
            )
        },
    )
    decompose_both: bool = field(
        default=False,
        metadata={"help": "Perform rank decomposition of left kronecker product matrix."},
    )
    decompose_factor: int = field(default=-1, metadata={"help": "Kronecker product decomposition factor."})
    rank_dropout_scale: bool = field(default=False, metadata={"help": "Rank dropout scale"})
    target_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            # Each fragment ends with a space so the implicitly concatenated sentences do not run together.
            "help": "List of module names or regex expression of the module names to replace with LoKr. "
            "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
            "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer."
        },
    )
    exclude_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={"help": "List of module names or regex expression of the module names to exclude from LoKr."},
    )
    init_weights: Union[bool, Literal["lycoris"]] = field(
        default=True,
        metadata={
            "help": (
                "Whether to initialize the weights of the LoKr layers with their default initialization. "
                "Can be True, False or 'lycoris'. "
                "Default is True. Don't change this setting to False, except if you know exactly what you're doing."
            ),
        },
    )
    layers_to_transform: Optional[Union[list[int], int]] = field(
        default=None,
        metadata={
            "help": "The layer indexes to transform, if this argument is specified, PEFT will transform only the "
            "layers indexes that are specified inside this list. If a single integer is passed, PEFT will "
            "transform only the layer at this index."
        },
    )
    layers_pattern: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern. "
            "This should target the `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`."
        },
    )
    modules_to_save: Optional[list[str]] = field(
        default=None,
        metadata={
            "help": "List of modules apart from LoKr layers to be set as trainable and saved in the final checkpoint. "
            "For example, in Sequence Classification or Token Classification tasks, "
            "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
        },
    )

    def __post_init__(self):
        """Finalize the config: set the PEFT type, normalize module lists to sets and validate layer selection."""
        super().__post_init__()
        self.peft_type = PeftType.LOKR
        # Lists are turned into sets; strings (regex / 'all-linear') and None are kept untouched.
        self.target_modules = (
            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
        )
        self.exclude_modules = (
            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
        )
        # check for layers_to_transform and layers_pattern
        # A single integer index -- including 0 -- is a valid selection, so it must not be judged by truthiness.
        layers_selected = isinstance(self.layers_to_transform, int) or bool(self.layers_to_transform)
        if self.layers_pattern and not layers_selected:
            raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ")
class LoKrLayer(nn.Module, LycorisLayer):
    """
    Base class for LoKr adapter layers.

    The weight update of an adapter is built as a Kronecker product `kron(w1, w2)`. Each factor is stored either
    as a full parameter (`lokr_w1` / `lokr_w2`) or as a low-rank product (`lokr_w1_a @ lokr_w1_b`,
    `lokr_w2_a @ lokr_w2_b`); for convolutions `lokr_t2` additionally holds the core tensor of the "effective
    conv2d" decomposition. All parameters are kept in `nn.ParameterDict`s keyed by adapter name.
    """

    # All names of layers that may contain adapter weights
    adapter_layer_names = (
        "lokr_w1",
        "lokr_w1_a",
        "lokr_w1_b",
        "lokr_w2",
        "lokr_w2_a",
        "lokr_w2_b",
        "lokr_t2",
    )
    # other_param_names is defined on parent class

    def __init__(self, base_layer: nn.Module) -> None:
        super().__init__()
        LycorisLayer.__init__(self, base_layer)

        # LoKr info
        self.lokr_w1 = nn.ParameterDict({})
        self.lokr_w1_a = nn.ParameterDict({})
        self.lokr_w1_b = nn.ParameterDict({})
        self.lokr_w2 = nn.ParameterDict({})
        self.lokr_w2_a = nn.ParameterDict({})
        self.lokr_w2_b = nn.ParameterDict({})
        self.lokr_t2 = nn.ParameterDict({})

    @property
    def _available_adapters(self) -> set[str]:
        """Names of all adapters that own at least one LoKr parameter on this layer."""
        return {
            *self.lokr_w1,
            *self.lokr_w1_a,
            *self.lokr_w1_b,
            *self.lokr_w2,
            *self.lokr_w2_a,
            *self.lokr_w2_b,
            *self.lokr_t2,
        }

    def create_adapter_parameters(
        self,
        adapter_name: str,
        r: int,
        shape,
        use_w1: bool,
        use_w2: bool,
        use_effective_conv2d: bool,
    ):
        """
        Allocate the (uninitialized) LoKr parameters for `adapter_name`.

        Args:
            adapter_name (`str`): Name of the adapter the parameters belong to.
            r (`int`): Rank used for the low-rank factors.
            shape: `((a, b), (c, d))` for Linear, `((a, b), (c, d), k)` for Conv1d and `((a, b), (c, d), k1, k2)`
                for Conv2d, as built in `update_layer`.
            use_w1 (`bool`): Store the left factor as one full matrix instead of a low-rank product.
            use_w2 (`bool`): Store the right factor as one full tensor instead of a decomposition.
            use_effective_conv2d (`bool`): For convolutions without `use_w2`, use the core-tensor decomposition.
        """
        if use_w1:
            self.lokr_w1[adapter_name] = nn.Parameter(torch.empty(shape[0][0], shape[1][0]))
        else:
            self.lokr_w1_a[adapter_name] = nn.Parameter(torch.empty(shape[0][0], r))
            self.lokr_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][0]))

        # Handle both Conv2d and Conv1d
        if len(shape) == 4:  # Conv2d
            if use_w2:
                self.lokr_w2[adapter_name] = nn.Parameter(torch.empty(shape[0][1], shape[1][1], *shape[2:]))
            elif use_effective_conv2d:
                self.lokr_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3]))
                self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0][1]))  # b, 1-mode
                self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1]))  # d, 2-mode
            else:
                self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0][1], r))
                self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1] * shape[2] * shape[3]))
        elif len(shape) == 3:  # Conv1d
            if use_w2:
                self.lokr_w2[adapter_name] = nn.Parameter(torch.empty(shape[0][1], shape[1][1], shape[2]))
            elif use_effective_conv2d:  # Even for Conv1d, use the effective parameter for kernel dimension
                # We pass (r, r, kernel_size, 1) in order to be compatible with the 2d assumptions made
                # in make_weight_cp (only relevant for the effective conv2d case).
                self.lokr_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], 1))
                self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0][1]))  # b, 1-mode
                self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1]))  # d, 2-mode
            else:
                self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0][1], r))
                self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1] * shape[2]))
        else:
            # Linear
            if use_w2:
                self.lokr_w2[adapter_name] = nn.Parameter(torch.empty(shape[0][1], shape[1][1]))
            else:
                self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0][1], r))
                self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1]))

    def reset_adapter_parameters(self, adapter_name: str):
        """Default init: the left factor is zeroed (so the Kronecker product starts at zero), the rest is random."""
        if adapter_name in self.lokr_w1:
            nn.init.zeros_(self.lokr_w1[adapter_name])
        else:
            nn.init.zeros_(self.lokr_w1_a[adapter_name])
            nn.init.kaiming_uniform_(self.lokr_w1_b[adapter_name], a=math.sqrt(5))

        if adapter_name in self.lokr_w2:
            nn.init.kaiming_uniform_(self.lokr_w2[adapter_name], a=math.sqrt(5))
        else:
            nn.init.kaiming_uniform_(self.lokr_w2_a[adapter_name], a=math.sqrt(5))
            nn.init.kaiming_uniform_(self.lokr_w2_b[adapter_name], a=math.sqrt(5))

        if adapter_name in self.lokr_t2:
            nn.init.kaiming_uniform_(self.lokr_t2[adapter_name], a=math.sqrt(5))

    def reset_adapter_parameters_random(self, adapter_name: str):
        """Fully random init: every existing parameter of the adapter gets a Kaiming-uniform draw."""
        if adapter_name in self.lokr_w1:
            nn.init.kaiming_uniform_(self.lokr_w1[adapter_name], a=math.sqrt(5))
        else:
            nn.init.kaiming_uniform_(self.lokr_w1_a[adapter_name], a=math.sqrt(5))
            nn.init.kaiming_uniform_(self.lokr_w1_b[adapter_name], a=math.sqrt(5))

        if adapter_name in self.lokr_w2:
            nn.init.kaiming_uniform_(self.lokr_w2[adapter_name], a=math.sqrt(5))
        else:
            nn.init.kaiming_uniform_(self.lokr_w2_a[adapter_name], a=math.sqrt(5))
            nn.init.kaiming_uniform_(self.lokr_w2_b[adapter_name], a=math.sqrt(5))

        if adapter_name in self.lokr_t2:
            nn.init.kaiming_uniform_(self.lokr_t2[adapter_name], a=math.sqrt(5))

    # Initializes weight matrices similar to the way initialized in the LyCORIS repository.
    def reset_adapter_parameters_lycoris_way(self, adapter_name):
        """LyCORIS-style init: the left factor is random, the right factor (`lokr_w2` or `lokr_w2_b`) is zeroed."""
        if adapter_name in self.lokr_w1:
            nn.init.kaiming_uniform_(self.lokr_w1[adapter_name], a=math.sqrt(5))
        else:
            nn.init.kaiming_uniform_(self.lokr_w1_a[adapter_name], a=math.sqrt(5))
            nn.init.kaiming_uniform_(self.lokr_w1_b[adapter_name], a=math.sqrt(5))

        if adapter_name in self.lokr_w2:
            nn.init.zeros_(self.lokr_w2[adapter_name])
        else:
            nn.init.zeros_(self.lokr_w2_b[adapter_name])
            nn.init.kaiming_uniform_(self.lokr_w2_a[adapter_name], a=math.sqrt(5))

        if adapter_name in self.lokr_t2:
            nn.init.kaiming_uniform_(self.lokr_t2[adapter_name], a=math.sqrt(5))

    def update_layer(
        self,
        adapter_name: str,
        r: int,
        alpha: float,
        rank_dropout: float,
        module_dropout: float,
        init_weights: Union[bool, str],
        use_effective_conv2d: bool,
        decompose_both: bool,
        decompose_factor: int,
        inference_mode: bool = False,
        **kwargs,
    ) -> None:
        """Internal function to create lokr adapter

        Args:
            adapter_name (`str`): Name for the adapter to add.
            r (`int`): Rank for the added adapter.
            alpha (`float`): Alpha for the added adapter.
            rank_dropout (`float`): The dropout probability for rank dimension during training
            module_dropout (`float`): The dropout probability for disabling adapter during training.
            init_weights (`bool` or `"lycoris"`):
                Truthy values select the default initialization, the string `"lycoris"` selects the LyCORIS-style
                initialization, and a falsy value selects fully random initialization.
            use_effective_conv2d (`bool`): Use parameter effective decomposition for Conv2d with ksize > 1.
            decompose_both (`bool`): Perform rank decomposition of left kronecker product matrix.
            decompose_factor (`int`): Kronecker product decomposition factor.
            inference_mode (`bool`): Forwarded to `set_adapter`.
            **kwargs: Only `rank_dropout_scale` is read here (treated as `False` when absent).

        Raises:
            ValueError: If `r` is not positive.
            TypeError: If the base layer is not `nn.Linear`, `nn.Conv2d` or `nn.Conv1d`.
        """
        if r <= 0:
            raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")

        self.r[adapter_name] = r
        self.alpha[adapter_name] = alpha
        self.scaling[adapter_name] = alpha / r
        self.rank_dropout[adapter_name] = rank_dropout
        self.module_dropout[adapter_name] = module_dropout
        # Fall back to the config default instead of failing with a KeyError when the caller omits the flag.
        self.rank_dropout_scale[adapter_name] = kwargs.get("rank_dropout_scale", False)
        base_layer = self.get_base_layer()

        # Determine shape of LoKr weights
        if isinstance(base_layer, nn.Linear):
            in_dim, out_dim = base_layer.in_features, base_layer.out_features

            in_m, in_n = factorization(in_dim, decompose_factor)
            out_l, out_k = factorization(out_dim, decompose_factor)
            shape = ((out_l, out_k), (in_m, in_n))  # ((a, b), (c, d)), out_dim = a*c, in_dim = b*d

            use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2)
            use_w2 = r >= max(shape[0][1], shape[1][1]) / 2
            use_effective_conv2d = False
        elif isinstance(base_layer, nn.Conv2d):
            in_dim, out_dim = base_layer.in_channels, base_layer.out_channels
            k_size = base_layer.kernel_size

            in_m, in_n = factorization(in_dim, decompose_factor)
            out_l, out_k = factorization(out_dim, decompose_factor)
            shape = ((out_l, out_k), (in_m, in_n), *k_size)  # ((a, b), (c, d), *k_size)

            use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2)
            use_w2 = r >= max(shape[0][1], shape[1][1]) / 2
            # For 1x1 convolutions, disable effective_conv2d to avoid unnecessary tensor reshaping overhead.
            # Since 1x1 convolutions are essentially pointwise operations (matrix multiplications),
            # they can be more efficiently handled with the flattened weight representation,
            # similar to how Linear layers work. This optimization reduces computational cost
            # without affecting the mathematical equivalence of the operation.
            use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size != (1, 1)
        elif isinstance(base_layer, nn.Conv1d):
            in_dim, out_dim = base_layer.in_channels, base_layer.out_channels
            k_size = (base_layer.kernel_size[0],)  # Convert to a tuple with single element

            in_m, in_n = factorization(in_dim, decompose_factor)
            out_l, out_k = factorization(out_dim, decompose_factor)
            shape = ((out_l, out_k), (in_m, in_n), *k_size)  # ((a, b), (c, d), k)

            use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2)
            use_w2 = r >= max(shape[0][1], shape[1][1]) / 2
            # For Conv1d with kernel_size=1, disable effective_conv2d for the same optimization reasons
            # as 1x1 Conv2d. Kernel size 1 means no spatial/temporal context, making it equivalent
            # to a Linear layer applied across the channel dimension. Using flattened representation
            # avoids unnecessary reshaping and improves computational efficiency.
            use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size[0] != 1
        else:
            raise TypeError(f"LoKr is not implemented for base layers of type {type(base_layer).__name__}")

        # Create weights with provided shape
        self.create_adapter_parameters(adapter_name, r, shape, use_w1, use_w2, use_effective_conv2d)

        # Initialize weights
        if init_weights:
            if init_weights == "lycoris":
                self.reset_adapter_parameters_lycoris_way(adapter_name)
            else:
                self.reset_adapter_parameters(adapter_name)
        else:
            self.reset_adapter_parameters_random(adapter_name)

        # Move new weights to device
        self._move_adapter_to_device_of_base_layer(adapter_name)
        self.set_adapter(self.active_adapters, inference_mode=inference_mode)

    def get_delta_weight(self, adapter_name: str) -> torch.Tensor:
        """
        Build the weight update of `adapter_name`, shaped like the base layer's weight.

        In training mode with a non-zero rank dropout, whole rows (first dimension) of the update are randomly
        zeroed, and optionally rescaled by the fraction of rows kept.
        """
        # https://github.com/KohakuBlueleaf/LyCORIS/blob/e4259b870d3354a9615a96be61cb5d07455c58ea/lycoris/modules/lokr.py#L224
        if adapter_name in self.lokr_w1:
            w1 = self.lokr_w1[adapter_name]
        else:
            w1 = self.lokr_w1_a[adapter_name] @ self.lokr_w1_b[adapter_name]

        if adapter_name in self.lokr_w2:
            w2 = self.lokr_w2[adapter_name]
        elif adapter_name in self.lokr_t2:
            w2 = make_weight_cp(self.lokr_t2[adapter_name], self.lokr_w2_a[adapter_name], self.lokr_w2_b[adapter_name])
        else:
            w2 = self.lokr_w2_a[adapter_name] @ self.lokr_w2_b[adapter_name]

        # Make weights with Kronecker product
        weight = make_kron(w1, w2, self.scaling[adapter_name])

        # Get base layer for reshaping
        base_layer = self.get_base_layer()

        # Regular reshape to match base layer shape
        weight = weight.reshape(base_layer.weight.shape)

        # Perform rank dropout during training - drop rows of addition weights
        rank_dropout = self.rank_dropout[adapter_name]
        if self.training and rank_dropout:
            drop = (torch.rand(weight.size(0)) > rank_dropout).float()
            if self.rank_dropout_scale[adapter_name]:
                kept_fraction = drop.mean()
                # If every row was dropped the mean is 0; dividing would turn the mask (and the weight) into NaN.
                if kept_fraction > 0:
                    drop /= kept_fraction
            drop = drop.view(-1, *[1] * len(weight.shape[1:])).to(weight.device)
            weight *= drop

        return weight

    def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        """Run the base layer and, unless adapters are disabled or merged, add the active adapters' outputs."""
        previous_dtype = x.dtype

        if self.disable_adapters:
            if self.merged:
                self.unmerge()
            result = self.base_layer(x, *args, **kwargs)
        elif self.merged:
            result = self.base_layer(x, *args, **kwargs)
        else:
            result = self.base_layer(x, *args, **kwargs)

            # Execute all the adapters
            for active_adapter in self.active_adapters:
                if active_adapter not in self._available_adapters:
                    continue

                module_dropout = self.module_dropout[active_adapter]

                # Modify current execution weights; module dropout is only sampled in training mode.
                if not self.training or torch.rand(1) > module_dropout:
                    result = result + self._get_delta_activations(active_adapter, x, *args, **kwargs)

        result = result.to(previous_dtype)
        return result


class Linear(LoKrLayer):
    """LoKr implemented in Linear layer"""

    def __init__(
        self,
        base_layer: nn.Module,
        device: Optional[Union[str, torch.device]] = None,
        dtype: Optional[torch.dtype] = None,
        adapter_name: str = "default",
        r: int = 0,
        alpha: float = 0.0,
        rank_dropout: float = 0.0,
        module_dropout: float = 0.0,
        init_weights: bool = True,
        **kwargs,
    ):
        super().__init__(base_layer)

        # Create adapter and set it active
        self._active_adapter = adapter_name
        self.update_layer(adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, **kwargs)

    def _get_delta_activations(
        self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any
    ) -> torch.Tensor:
        """Apply the adapter's weight update to `input` as a bias-free linear map."""
        delta_weight = self.get_delta_weight(adapter_name)
        input = self._cast_input_dtype(input, delta_weight.dtype)
        # don't add bias here, because the bias is already included in the output of the base_layer
        return F.linear(input, delta_weight)

    def __repr__(self) -> str:
        rep = super().__repr__()
        return "lokr." + rep
class Conv2d(LoKrLayer):
    """LoKr implemented in Conv2d layer"""

    def __init__(
        self,
        base_layer: nn.Module,
        device: Optional[Union[str, torch.device]] = None,
        dtype: Optional[torch.dtype] = None,
        adapter_name: str = "default",
        r: int = 0,
        alpha: float = 0.0,
        rank_dropout: float = 0.0,
        module_dropout: float = 0.0,
        use_effective_conv2d: bool = False,
        init_weights: bool = True,
        **kwargs,
    ):
        super().__init__(base_layer)

        # Mark the new adapter as the active one, then build its parameters.
        self._active_adapter = adapter_name
        layer_args = (adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, use_effective_conv2d)
        self.update_layer(*layer_args, **kwargs)

    def _get_delta_activations(
        self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any
    ) -> torch.Tensor:
        """Convolve `input` with the adapter's weight update, reusing the base layer's convolution settings."""
        weight_update = self.get_delta_weight(adapter_name)
        casted = self._cast_input_dtype(input, weight_update.dtype)
        conv = self.get_base_layer()
        # No bias term: the base layer's own output already contains it.
        conv_settings = {
            "stride": conv.stride,
            "padding": conv.padding,
            "dilation": conv.dilation,
            "groups": conv.groups,
        }
        return F.conv2d(casted, weight_update, **conv_settings)

    def __repr__(self) -> str:
        return "lokr." + super().__repr__()
class Conv1d(LoKrLayer):
    """LoKr implemented in Conv1d layer"""

    def __init__(
        self,
        base_layer: nn.Module,
        device: Optional[Union[str, torch.device]] = None,
        dtype: Optional[torch.dtype] = None,
        adapter_name: str = "default",
        r: int = 0,
        alpha: float = 0.0,
        rank_dropout: float = 0.0,
        module_dropout: float = 0.0,
        use_effective_conv2d: bool = False,
        init_weights: bool = True,
        **kwargs,
    ):
        super().__init__(base_layer)

        # Mark the new adapter as the active one, then build its parameters.
        self._active_adapter = adapter_name
        layer_args = (adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, use_effective_conv2d)
        self.update_layer(*layer_args, **kwargs)

    def _get_delta_activations(
        self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any
    ) -> torch.Tensor:
        """Convolve `input` with the adapter's weight update, reusing the base layer's convolution settings."""
        weight_update = self.get_delta_weight(adapter_name)
        casted = self._cast_input_dtype(input, weight_update.dtype)
        conv = self.get_base_layer()
        # No bias term: the base layer's own output already contains it.
        conv_settings = {
            "stride": conv.stride,
            "padding": conv.padding,
            "dilation": conv.dilation,
            "groups": conv.groups,
        }
        return F.conv1d(casted, weight_update, **conv_settings)

    def __repr__(self) -> str:
        return "lokr." + super().__repr__()
# Below code is adapted from https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/lokr.py#L11


def factorization(dimension: int, factor: int = -1) -> tuple[int, int]:
    """Factorizes the provided number into the product of two numbers

    Args:
        dimension (`int`): The number that needs to be factorized.
        factor (`int`, optional):
            Factorization divider. The algorithm will try to output two numbers, one of each will be as close to the
            factor as possible. If -1 is provided, the decomposition algorithm would try to search dividers near the
            square root of the dimension. Defaults to -1.

    Returns:
        Tuple[`int`, `int`]: A tuple of two numbers, whose product is equal to the provided number. The first number is
        always less than or equal to the second.

    Example:
        ```py
        >>> factorization(256, factor=-1)
        (16, 16)

        >>> factorization(128, factor=-1)
        (8, 16)

        >>> factorization(127, factor=-1)
        (1, 127)

        >>> factorization(128, factor=4)
        (4, 32)

        >>> factorization(128, factor=64)
        (2, 64)
        ```
    """

    if factor > 0 and (dimension % factor) == 0:
        m = factor
        n = dimension // factor
        # Keep the documented ordering (first <= second) even when `factor` exceeds sqrt(dimension).
        if m > n:
            n, m = m, n
        return m, n
    if factor == -1:
        # No preferred factor: allow any divisor, so the search ends at the pair closest to the square root.
        factor = dimension
    m, n = 1, dimension
    length = m + n
    while m < n:
        # Advance to the next divisor of `dimension` above the current `m`.
        new_m = m + 1
        while dimension % new_m != 0:
            new_m += 1
        new_n = dimension // new_m
        if new_m + new_n > length or new_m > factor:
            break
        m, n = new_m, new_n
    # The loop may overshoot past the square root by one step; restore the ordering.
    if m > n:
        n, m = m, n
    return m, n


def make_weight_cp(t, wa, wb):
    """Contract the core tensor `t` with `wa` over its first axis and with `wb` over its second axis.

    For `t` of shape `(i, j, k, l)`, `wa` of shape `(i, p)` and `wb` of shape `(j, r)` the result has shape
    `(p, r, k, l)`.
    """
    rebuild2 = torch.einsum("i j k l, i p, j r -> p r k l", t, wa, wb)  # [c, d, k1, k2]
    return rebuild2


def make_kron(w1, w2, scale=1.0):
    """Return `scale * kron(w1, w2)`, aligning `w1` with `w2` on the leading dimensions.

    `torch.kron` pads the lower-dimensional operand with *leading* singleton dimensions. For convolution weights
    (`w2` with trailing kernel dimensions) the matrix `w1` has to pair with the first two dimensions of `w2`
    instead, so trailing singleton dimensions are appended to `w1` until both operands have the same rank. This
    covers the 4-D (Conv2d) as well as the 3-D (Conv1d) case.
    """
    for _ in range(w2.dim() - w1.dim()):
        w1 = w1.unsqueeze(-1)
    w2 = w2.contiguous()
    rebuild = torch.kron(w1, w2)

    return rebuild * scale
class LoKrModel(LycorisTuner):
    """
    Creates Low-Rank Kronecker Product model from a pretrained model. The original method is partially described in
    https://huggingface.co/papers/2108.06098 and in https://huggingface.co/papers/2309.14859 Current implementation
    heavily borrows from
    https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/lokr.py

    Args:
        model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached.
        config ([`LoKrConfig`]): The configuration of the LoKr model.
        adapter_name (`str`): The name of the adapter, defaults to `"default"`.
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            Create empty adapter weights on meta device. Useful to speed up the loading process.

    Returns:
        `torch.nn.Module`: The LoKr model.

    Example:
        ```py
        >>> from diffusers import StableDiffusionPipeline
        >>> from peft import LoKrModel, LoKrConfig

        >>> config_te = LoKrConfig(
        ...     r=8,
        ...     alpha=32,
        ...     target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"],
        ...     rank_dropout=0.0,
        ...     module_dropout=0.0,
        ...     init_weights=True,
        ... )
        >>> config_unet = LoKrConfig(
        ...     r=8,
        ...     alpha=32,
        ...     target_modules=[
        ...         "proj_in",
        ...         "proj_out",
        ...         "to_k",
        ...         "to_q",
        ...         "to_v",
        ...         "to_out.0",
        ...         "ff.net.0.proj",
        ...         "ff.net.2",
        ...     ],
        ...     rank_dropout=0.0,
        ...     module_dropout=0.0,
        ...     init_weights=True,
        ...     use_effective_conv2d=True,
        ... )

        >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        >>> model.text_encoder = LoKrModel(model.text_encoder, config_te, "default")
        >>> model.unet = LoKrModel(model.unet, config_unet, "default")
        ```

    **Attributes**:
        - **model** ([`~torch.nn.Module`]) -- The model to be adapted.
        - **peft_config** ([`LoKrConfig`]): The configuration of the LoKr model.
    """

    prefix: str = "lokr_"
    tuner_layer_cls = LoKrLayer
    target_module_mapping = TRANSFORMERS_MODELS_TO_LOKR_TARGET_MODULES_MAPPING
    # Maps each supported torch layer type to the LoKr layer class that wraps it.
    layers_mapping: dict[type[torch.nn.Module], type[LoKrLayer]] = {
        torch.nn.Conv2d: Conv2d,
        torch.nn.Conv1d: Conv1d,
        torch.nn.Linear: Linear,
    }

    def _create_and_replace(
        self,
        config: LycorisConfig,
        adapter_name: str,
        target: Union[LoKrLayer, nn.Module],
        target_name: str,
        parent: nn.Module,
        current_key: str,
    ) -> None:
        """
        A private method to create and replace the target module with the adapter module.

        `r` and `alpha` are taken from `config.rank_pattern` / `config.alpha_pattern` when `current_key` matches
        one of their keys and from the config defaults otherwise. A target that already is a `LoKrLayer` only
        gets the new adapter added; any other module is wrapped and replaced on `parent`.
        """
        r_key = get_pattern_key(config.rank_pattern.keys(), current_key)
        alpha_key = get_pattern_key(config.alpha_pattern.keys(), current_key)
        kwargs = config.to_dict()
        kwargs["r"] = config.rank_pattern.get(r_key, config.r)
        kwargs["alpha"] = config.alpha_pattern.get(alpha_key, config.alpha)
        kwargs["rank_dropout_scale"] = config.rank_dropout_scale

        if isinstance(target, LoKrLayer):
            target.update_layer(adapter_name, **kwargs)
        else:
            new_module = self._create_new_module(config, adapter_name, target, **kwargs)
            self._replace_module(parent, target_name, new_module, target)
def __getattr__(name):
    """
    Lazily resolve the optional quantized LoRA layer classes.

    `Linear8bitLt`, `Linear4bit` and `EetqLoraLinear` are only imported on first attribute access, and only when
    the matching availability check passes. Every other name, and a known name whose check fails, raises
    `AttributeError`.
    """
    if name == "Linear8bitLt":
        if is_bnb_available():
            from .bnb import Linear8bitLt

            return Linear8bitLt
    elif name == "Linear4bit":
        if is_bnb_4bit_available():
            from .bnb import Linear4bit

            return Linear4bit
    elif name == "EetqLoraLinear":
        if is_eetq_available():
            from .eetq import EetqLoraLinear

            return EetqLoraLinear

    raise AttributeError(f"module {__name__} has no attribute {name}")
class AqlmLoraLinear(torch.nn.Module, LoraLayer):
    """LoRA layer on top of an AQLM base layer; the adapter output is added at forward time (no merging)."""

    def __init__(
        self,
        base_layer,
        adapter_name: str,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        init_lora_weights: bool = True,
        use_rslora: bool = False,
        use_dora: bool = False,
        lora_bias: bool = False,
        **kwargs,
    ):
        # DoRA is rejected up front, before any module state is created.
        if use_dora:
            raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False")

        super().__init__()
        LoraLayer.__init__(self, base_layer)

        self._active_adapter = adapter_name
        adapter_options = {
            "lora_alpha": lora_alpha,
            "lora_dropout": lora_dropout,
            "init_lora_weights": init_lora_weights,
            "use_rslora": use_rslora,
            "use_dora": use_dora,
            "lora_bias": lora_bias,
        }
        self.update_layer(adapter_name, r, **adapter_options)

    def forward(self, x: torch.Tensor):
        """Return the base layer output plus the scaled output of every active adapter present on this layer."""
        # note: logic differs from default Linear because merging is not supported
        result = self.base_layer(x)
        if self.disable_adapters:
            return result

        for name in self.active_adapters:
            if name not in self.lora_A.keys():
                continue

            down_proj = self.lora_A[name]
            up_proj = self.lora_B[name]
            drop = self.lora_dropout[name]
            scale = self.scaling[name]

            # Outside of autocast the input is cast to the adapter dtype and the output cast back afterwards.
            cast_needed = not torch.is_autocast_enabled()
            if cast_needed:
                result_dtype = result.dtype
                x = self._cast_input_dtype(x, down_proj.weight.dtype)

            delta = up_proj(down_proj(drop(x)))
            if cast_needed:
                delta = delta.to(result_dtype)
            result += delta * scale

        return result

    def __repr__(self) -> str:
        return "lora." + super().__repr__()

    # TODO: Check if it is better as suggested by users https://github.com/PanQiWei/AutoGPTQ/pull/102
    # def reset_lora_parameters(self, adapter_name):
    #     if adapter_name in self.lora_A.keys():
    #         torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight)
    #         torch.nn.init.zeros_(self.lora_B[adapter_name].weight)


def dispatch_aqlm(
    target: torch.nn.Module,
    adapter_name: str,
    **kwargs: Any,
) -> Optional[torch.nn.Module]:
    """
    Wrap `target` in an `AqlmLoraLinear` when its base layer is an AQLM `QuantizedLinear`.

    Returns the new module, or `None` when AQLM is not available or the base layer is of another type. On a
    match, `target.qweight` is set to the base layer's `codes`.
    """
    base = target.get_base_layer() if isinstance(target, BaseTunerLayer) else target

    # `QuantizedLinear` is only imported when AQLM is available, so the availability check must come first.
    if not (is_aqlm_available() and isinstance(base, QuantizedLinear)):
        return None

    new_module = AqlmLoraLinear(target, adapter_name, **kwargs)
    target.qweight = base.codes
    return new_module
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import os +from typing import Any + +import torch +from torch import nn +from transformers import PreTrainedModel + +from .config import ArrowConfig + + +TASK_ADAPTER_PREFIX = "task_" +GKS_ADAPTER_PREFIX = "gks_" + + +class ArrowLoraLinearLayer(nn.Module): + """ + This class represent the main logic of the arrow routing algorithm for linear layers. + """ + + def __init__(self, in_features, arrow_config): + super().__init__() + # extra parameters needed for arrow + self.in_features = in_features + self._protos_ready = False + self.top_k = arrow_config.top_k + self.temperature = arrow_config.router_temperature + self.rng_seed = arrow_config.rng_seed + self.task_adapter_names = ( + arrow_config.task_adapter_names.copy() + ) # Set in create_arrow_model() with this format: task_0, task_1, ... + self.gks_adapter_names = ( + arrow_config.gks_adapter_names + ) # Set in create_arrow_model() with this format: gks_0, gks_1, ... + self.use_gks = arrow_config.use_gks + self.gks_done = False + self.gks_added_adapter_names = [] + self.in_features = in_features + self.cast_input_dtype_enabled = True + + @torch.no_grad() + def on_adapter_change(self, lora_A, lora_B): + """ + Called when adapters are added/removed/renamed so Arrow can refresh its internal state before the next forward + pass. 
def top_right_singular_vec_from_BA(self, A, B, iters=15, eps=1e-8):
    """
    Computes the top *right* singular vector of ΔW = B @ A without forming ΔW.

    Theory:
        For any matrix M, the right singular vectors are the eigenvectors of Mᵀ M. If ΔW = B @ A (with A ∈
        ℝ^{r×in}, B ∈ ℝ^{out×r}), then
            ΔWᵀ ΔW = (B @ A)ᵀ (B @ A) = Aᵀ (Bᵀ B) A ∈ ℝ^{in×in}.
        Therefore, the dominant right singular vector of ΔW is the dominant eigenvector of Aᵀ (Bᵀ B) A. We find it
        by *power iteration* on the linear operator
            v ↦ Aᵀ (Bᵀ B) (A v),
        which avoids materializing ΔW (out×in) or ΔWᵀ ΔW (in×in). The result lives in the input space (size =
        in_features).

    Practical notes:
        - All iterations run in float32; the result is returned in float32 and any cast back to the LoRA dtype is
          left to the caller.
        - Exactly `iters` iterations are performed; there is no early-stopping tolerance.
        - The start vector is drawn from a private generator seeded with `self.rng_seed` when that is not None, so
          the result is reproducible and the global RNG state is left untouched in that case.
        - The returned vector is unique only up to sign (±); downstream code should be sign-invariant.

    Args:
        A: LoRA A weight, shape (r, in_features).
        B: LoRA B weight, shape (out_features, r).
        iters: Number of power iterations.
        eps: Added to the norms to avoid division by zero.

    Returns:
        A float32 tensor of shape (in_features,) on the device of `A`.
    """
    # A: (r, in), B: (out, r)
    A32 = A.to(torch.float32)
    B32 = B.to(torch.float32)
    C = B32.T @ B32  # (r, r)

    # Private RNG on exactly A's device. Using only the device *type* would drop the device index and could place
    # the generator on a different accelerator than the tensor it has to fill.
    gen = None
    if self.rng_seed is not None:
        gen = torch.Generator(device=A32.device)
        gen.manual_seed(int(self.rng_seed))

    # init vector in input space
    v = torch.randn(A32.size(1), dtype=A32.dtype, device=A32.device, generator=gen)
    v = v / (v.norm() + eps)

    for _ in range(iters):
        # w = (ΔWᵀΔW) v = Aᵀ (BᵀB) (A v)
        w = A32.T @ (C @ (A32 @ v))
        v = w / (w.norm() + eps)

    return v  # fp32
@torch.no_grad()
def gen_know_sub(self, lora_A, lora_B):
    """
    Perform General Knowledge Subtraction (GKS) in place on the task adapters.

    The A and B weights of all adapters listed in `self.gks_adapter_names` are averaged, and that average is
    subtracted from the A and B weights of the task adapters ("forgetting-via-negation", a task-arithmetic
    operation described in https://arxiv.org/abs/2212.04089).

    The first effective call processes every adapter in `self.task_adapter_names`. Later calls only process the
    names queued in `self.gks_added_adapter_names` and do nothing when that queue is empty. Nothing happens at all
    when `self.use_gks` is falsy.

    Args:
        lora_A: Mapping from adapter name to the module holding matrix A (its `.weight` is used).
        lora_B: Mapping from adapter name to the module holding matrix B (its `.weight` is used).
    """
    if not self.use_gks:
        return

    pending_names = self.gks_added_adapter_names
    if self.gks_done and not pending_names:
        # GKS already applied and no newly added experts are waiting for it.
        return

    # Average of the general-knowledge adapters: (r, in_features) for A and (out_features, r) for B.
    general_A = torch.stack([lora_A[n].weight for n in self.gks_adapter_names], dim=0).mean(0)
    general_B = torch.stack([lora_B[n].weight for n in self.gks_adapter_names], dim=0).mean(0)

    # First run: purify every task expert. Subsequent runs: only the experts added since the last run.
    names_to_purify = self.task_adapter_names if self.gks_done is False else pending_names
    for expert_name in names_to_purify:
        lora_A[expert_name].weight.data.sub_(general_A)
        lora_B[expert_name].weight.data.sub_(general_B)

    # Mark GKS as done and empty the queue so the same experts are not subtracted twice.
    self.gks_done = True
    self.gks_added_adapter_names = []
def check_loaded_lora_compatibility_arrow(model, adapter_names: list[str]):
    """
    After loading all adapters into `model`, check that they share:
        - the same LoRA rank (r)
        - identical sets of target_modules
        - identical weight shapes on every adapted module

    The first entry of `adapter_names` serves as the reference every other adapter is compared against.

    Args:
        model: Module whose `named_modules()` are scanned for submodules exposing `lora_A` / `lora_B`.
        adapter_names: Names of the adapters to compare.

    Returns:
        (sorted list of target module names, agreed rank r). A target module name is the last component of the
        dotted module path; r is the first dimension of the `lora_A` weight of the last matching module visited.

    Raises:
        ValueError: If `adapter_names` is empty, if an adapter is not present on any module, or if the adapters
            disagree on rank, target modules or per-module weight shapes.
    """
    if not adapter_names:
        raise ValueError("`adapter_names` must contain at least one adapter name.")

    # Walk the model once; every adapter is then checked against the same list of LoRA-carrying modules.
    lora_modules = [(full_name, module) for full_name, module in model.named_modules() if hasattr(module, "lora_A")]

    reference = None  # {'r': …, 'shapes': {full_name: (A_shape, B_shape)}, 'modules': set([...])}

    for name in adapter_names:
        curr_modules = set()
        curr_shapes = {}
        curr_r = None

        for full_name, module in lora_modules:
            if name not in module.lora_A:
                continue
            A = module.lora_A[name].weight
            B = module.lora_B[name].weight
            curr_modules.add(full_name.split(".")[-1])
            # A has shape (r, in_features); B has shape (out_features, r)
            curr_r = A.shape[0]
            # Keep the shapes of *every* module, not just the last one, so a mismatch anywhere is detected.
            curr_shapes[full_name] = (tuple(A.shape), tuple(B.shape))

        if curr_r is None:
            raise ValueError(f"[{name}] adapter was not found on any LoRA module of the model.")

        if reference is None:
            reference = {"r": curr_r, "shapes": curr_shapes, "modules": curr_modules}
            continue

        if curr_r != reference["r"]:
            raise ValueError(f"[{name}] rank mismatch: {curr_r} != {reference['r']}")
        if curr_modules != reference["modules"]:
            raise ValueError(
                f"[{name}] target_modules mismatch:\n"
                f"  this adapter -> {sorted(curr_modules)}\n"
                f"  reference    -> {sorted(reference['modules'])}"
            )
        if curr_shapes != reference["shapes"]:
            ref_shapes = reference["shapes"]
            differing = sorted(
                key for key in curr_shapes.keys() | ref_shapes.keys() if curr_shapes.get(key) != ref_shapes.get(key)
            )
            details = ", ".join(f"{key}: {curr_shapes.get(key)} != {ref_shapes.get(key)}" for key in differing)
            raise ValueError(f"[{name}] shape mismatch: {details}")

    agreed_modules = sorted(reference["modules"])
    return agreed_modules, int(reference["r"])
+ """ + import torch.nn as nn + + Linear4bit = None + try: + import bitsandbytes as bnb # type: ignore + + Linear4bit = bnb.nn.Linear4bit + except ImportError: + pass + + HFConv1D = None + try: + from transformers.models.gpt2.modeling_gpt2 import Conv1D as HFConv1D # type: ignore + except ImportError: + pass + + allowed_types = (nn.Linear, nn.Conv1d) + if Linear4bit is not None: + allowed_types = allowed_types + (Linear4bit,) + if HFConv1D is not None: + allowed_types = allowed_types + (HFConv1D,) + + offenders = [] + + for full_name, module in model.named_modules(): + if hasattr(module, "lora_A"): + for name in adapter_names: + if name in getattr(module, "lora_A", {}): + base = getattr(module, "base_layer", None) or getattr(module, "original_module", None) + layer_to_check = base if base is not None else module + + if not isinstance(layer_to_check, allowed_types): + offenders.append((name, full_name, type(layer_to_check).__name__)) + + if offenders: + lines = [ + "LoRA adapters must only target Linear-like layers " + "(nn.Linear, nn.Conv1d, HF Conv1D, or bitsandbytes.nn.Linear4bit). Found:" + ] + for name, full_name, tname in offenders: + lines.append(f" - adapter '{name}' on module '{full_name}' of type {tname}") + raise TypeError("\n".join(lines)) + + +def _resolve_adapter_source(path: str) -> tuple[str, str | None]: + """ + Resolve a user-provided adapter `path` into (model_id, subfolder). + + Supports: + - Local path to a folder that contains `adapter_config.json` + - Hub path with subfolder, e.g. 
"user/repo/ts_expert_0[/more/...]", which becomes: + model_id="user/repo", subfolder="ts_expert_0[/more/...]" + - Plain Hub repo id "user/repo" (no subfolder) + """ + if os.path.isdir(path): + if not os.path.isfile(os.path.join(path, "adapter_config.json")): + raise ValueError(f"Local adapter path '{path}' does not contain 'adapter_config.json'.") + return path, None + + parts = path.strip("/").split("/") + if len(parts) >= 2: + model_id = "/".join(parts[:2]) + if len(parts) > 2: + subfolder = "/".join(parts[2:]) + return model_id, subfolder + return model_id, None + + return path, None + + +def create_arrow_model( + base_model: PreTrainedModel, + task_specific_adapter_paths: list[str], + arrow_config: ArrowConfig, + general_adapter_paths: list[str] | None = None, + **adapter_kwargs: Any, +): + if task_specific_adapter_paths is None or len(task_specific_adapter_paths) == 0: + raise ValueError("`task_specific_adapter_paths` should contain at least one adapter path") + + from peft import LoraConfig, PeftModel + + model_id0, sub0 = _resolve_adapter_source(task_specific_adapter_paths[0]) + initial_ts_expert_name = f"{TASK_ADAPTER_PREFIX}0" + + first_kwargs = dict(adapter_kwargs) + if sub0 is not None and "subfolder" not in first_kwargs: + first_kwargs["subfolder"] = sub0 + + model = PeftModel.from_pretrained( + base_model, + model_id=model_id0, + adapter_name=initial_ts_expert_name, + **first_kwargs, + ) + + for i in range(1, len(task_specific_adapter_paths)): + ts_expert_name = f"{TASK_ADAPTER_PREFIX}{i}" + mid, sub = _resolve_adapter_source(task_specific_adapter_paths[i]) + more_kwargs = dict(adapter_kwargs) + if sub is not None and "subfolder" not in more_kwargs: + more_kwargs["subfolder"] = sub + model.load_adapter( + model_id=mid, + adapter_name=ts_expert_name, + **more_kwargs, + ) + arrow_config.task_adapter_names = [f"{TASK_ADAPTER_PREFIX}{i}" for i in range(len(task_specific_adapter_paths))] + + if arrow_config.use_gks: + if general_adapter_paths is None or 
len(general_adapter_paths) == 0: + raise ValueError("You should provide general LoRA paths if you want to use GenKnowSub.") + for i in range(len(general_adapter_paths)): + gen_expert_name = f"{GKS_ADAPTER_PREFIX}{i}" + mid, sub = _resolve_adapter_source(general_adapter_paths[i]) + gks_kwargs = dict(adapter_kwargs) + if sub is not None and "subfolder" not in gks_kwargs: + gks_kwargs["subfolder"] = sub + model.load_adapter( + model_id=mid, + adapter_name=gen_expert_name, + **gks_kwargs, + ) + arrow_config.gks_adapter_names = [f"{GKS_ADAPTER_PREFIX}{i}" for i in range(len(general_adapter_paths))] + else: + arrow_config.gks_adapter_names = [] + + target_modules, r = check_loaded_lora_compatibility_arrow( + model, adapter_names=arrow_config.task_adapter_names + arrow_config.gks_adapter_names + ) + + ensure_adapters_target_linear_layers_only( + model, adapter_names=arrow_config.task_adapter_names + arrow_config.gks_adapter_names + ) + + router_cfg = LoraConfig( + arrow_config=arrow_config, + target_modules=target_modules, + r=r, + ) + model.add_adapter(adapter_name="arrow_router", peft_config=router_cfg) + model.set_adapter("arrow_router") + + return model diff --git a/peft/src/peft/tuners/lora/awq.py b/peft/src/peft/tuners/lora/awq.py new file mode 100644 index 0000000000000000000000000000000000000000..61eb487ad6756e1e31b612ec7a8e4649b860e4c5 --- /dev/null +++ b/peft/src/peft/tuners/lora/awq.py @@ -0,0 +1,121 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class AwqLoraLinear(torch.nn.Module, LoraLayer):
    """
    LoRA layer wrapped around an AWQ-quantized linear module.

    The quantized module is called unchanged and the low-rank update of every active adapter is added to its
    output.
    """

    def __init__(
        self,
        base_layer,
        adapter_name,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        init_lora_weights: bool = True,
        use_rslora: bool = False,
        use_dora: bool = False,
        lora_bias: bool = False,
        **kwargs,
    ):
        """
        Wrap `base_layer` and register the adapter `adapter_name` on it.

        Args:
            base_layer: The layer to wrap; passed to `LoraLayer.__init__`.
            adapter_name: Name of the adapter to create; it also becomes the active adapter.
            r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora, lora_bias:
                Forwarded unchanged to `update_layer`.
            **kwargs: Accepted for signature compatibility; not used by this constructor.

        Raises:
            ValueError: If `use_dora` is True.
        """
        if use_dora:
            raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False")

        super().__init__()
        LoraLayer.__init__(self, base_layer)

        # self.base_layer and self.quant_linear_module are the same; we need the former for consistency and the latter
        # for backwards compatibility
        self.quant_linear_module = base_layer

        self._active_adapter = adapter_name
        self.update_layer(
            adapter_name,
            r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            init_lora_weights=init_lora_weights,
            use_rslora=use_rslora,
            use_dora=use_dora,
            lora_bias=lora_bias,
        )

    def forward(self, x: torch.Tensor):
        """
        Run the quantized module and add the contribution of every active adapter.

        Args:
            x: Input passed to the quantized module and to the LoRA branches.

        Returns:
            The quantized module's output, plus `lora_B(lora_A(dropout(x))) * scaling` for each active adapter that
            exists on this layer. With adapters disabled the quantized module's output is returned as-is.
        """
        result = self.quant_linear_module(x)

        if self.disable_adapters:
            return result

        for active_adapter in self.active_adapters:
            if active_adapter not in self.lora_A.keys():
                continue
            lora_A = self.lora_A[active_adapter]
            lora_B = self.lora_B[active_adapter]
            dropout = self.lora_dropout[active_adapter]
            scaling = self.scaling[active_adapter]

            # Outside autocast the input is cast to the adapter dtype and the LoRA output cast back afterwards.
            requires_conversion = not torch.is_autocast_enabled()
            if requires_conversion:
                expected_dtype = result.dtype
                x = self._cast_input_dtype(x, lora_A.weight.dtype)

            output = lora_B(lora_A(dropout(x)))
            if requires_conversion:
                output = output.to(expected_dtype)
            output = output * scaling
            result = result + output
        return result

    def __repr__(self) -> str:
        rep = super().__repr__()
        return "lora." + rep


def dispatch_awq(
    target: torch.nn.Module,
    adapter_name: str,
    **kwargs: Any,
) -> Optional[torch.nn.Module]:
    """
    Build an `AwqLoraLinear` for `target` when its base layer is an AutoAWQ `WQLinear_GEMM`.

    Args:
        target: Module to adapt; if it is a `BaseTunerLayer`, its base layer is inspected instead.
        adapter_name: Name of the adapter to create.
        **kwargs: Forwarded to `AwqLoraLinear`.

    Returns:
        The new LoRA module, or None when AutoAWQ is unavailable or the base layer is not a `WQLinear_GEMM`.

    Raises:
        ImportError: If the installed `autoawq` is older than the minimum supported version (0.2.0).
    """
    new_module = None

    if isinstance(target, BaseTunerLayer):
        target_base_layer = target.get_base_layer()
    else:
        target_base_layer = target

    if is_auto_awq_available():
        from awq.modules.linear import WQLinear_GEMM

        if isinstance(target_base_layer, WQLinear_GEMM):
            # Raise the error only at the dispatch level
            AUTOAWQ_MINIMUM_VERSION = packaging.version.parse("0.2.0")
            version_autoawq = packaging.version.parse(importlib_metadata.version("autoawq"))

            # The minimum version itself is accepted, so the message must say "and above", not "above".
            if AUTOAWQ_MINIMUM_VERSION > version_autoawq:
                raise ImportError(
                    f"Found an incompatible version of auto-awq. Found version {version_autoawq}, "
                    f"but only versions {AUTOAWQ_MINIMUM_VERSION} and above are supported for PEFT."
                )

            new_module = AwqLoraLinear(target, adapter_name, **kwargs)
            # Expose the quantized weight on the target under the `qweight` attribute.
            target.qweight = target_base_layer.qweight

    return new_module
def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
    """
    Merge the active adapter weights into the base weights

    For each adapter the 8-bit base weight is dequantized, the adapter's delta is added (or the adapter's LoRA
    variant performs the merge), and the result is stored back as a new `bnb.nn.Int8Params`. If the adapter has a
    LoRA bias, it is added to the base layer's bias.

    Args:
        safe_merge (`bool`, *optional*):
            If True, the merged weight and bias are checked for non-finite values before anything is written to
            the base layer, and a `ValueError` is raised if any are found. Defaults to `False`.
        adapter_names (`list[str]`, *optional*):
            The list of adapter names that should be merged. If None, all active adapters will be merged.
            Defaults to `None`.

    Raises:
        ValueError: If `safe_merge` is True and the merged weight or bias contains non-finite values.
    """
    adapter_names = check_adapters_to_merge(self, adapter_names)
    if not adapter_names:
        # no adapter to merge
        return

    for active_adapter in adapter_names:
        if active_adapter not in self.lora_A.keys():
            continue

        warnings.warn("Merge lora module to 8-bit linear may get different generations due to rounding errors.")

        weight = self.get_base_layer().weight
        state = self.get_base_layer().state
        if state.SCB is None:
            state.SCB = weight.SCB

        # Dequantize the result of identity matrix and int8 weight because bitsandbytes does not support int8
        # dequantization directly
        output = dequantize_bnb_weight(weight, state=state)
        if active_adapter not in self.lora_variant:  # vanilla LoRA
            lora_data = self.get_delta_weight(active_adapter)
            w_data = output.to(lora_data.dtype).to(lora_data.device) + lora_data
        else:
            w_data = self.lora_variant[active_adapter].merge_safe(self, active_adapter, output)

        if safe_merge and not torch.isfinite(w_data).all():
            raise ValueError(f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken")

        # Compute and validate the merged bias *before* touching the base layer, so that a failed safe merge does
        # not leave the weight merged while the bias is not.
        bias_data = None
        if self.lora_bias[active_adapter]:
            bias_data = self.get_base_layer().bias.data + self.lora_B[active_adapter].bias
            # `torch.isfinite` is elementwise; reduce with `.all()` so the check works for multi-element biases.
            if safe_merge and not torch.isfinite(bias_data).all():
                raise ValueError(
                    f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
                )

        self.get_base_layer().weight = bnb.nn.Int8Params(
            w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights
        ).to(weight.device)

        if bias_data is not None:
            self.get_base_layer().bias.data = bias_data

        state.reset_grads()
        self.merged_adapters.append(active_adapter)
def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
    """
    Run the 8-bit base layer and apply the LoRA adapters according to the layer state.

    Dispatch order:
        1. Adapters disabled: unmerge first if anything is merged, then return the base layer output.
        2. `adapter_names` given: delegate to `_mixed_batch_forward`.
        3. Adapters merged: return the base layer output.
        4. Otherwise: add each active adapter's contribution (vanilla LoRA) or let the adapter's LoRA variant
           transform the running result.

    Args:
        x: Input tensor.
        *args: Extra positional arguments for the base layer.
        **kwargs: Extra keyword arguments. `adapter_names` and the keys in `VARIANT_KWARG_KEYS` are removed before
            the base layer is called.

    Returns:
        The layer output.
    """
    self._check_forward_args(x, *args, **kwargs)
    adapter_names = kwargs.pop("adapter_names", None)
    # don't pass these to base_layer
    variant_kwargs = {key: kwargs.pop(key, None) for key in VARIANT_KWARG_KEYS}

    if self.disable_adapters:
        if self.merged:
            self.unmerge()
        return self.base_layer(x, *args, **kwargs)

    if adapter_names is not None:
        return self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **variant_kwargs, **kwargs)

    if self.merged:
        return self.base_layer(x, *args, **kwargs)

    result = self.base_layer(x, *args, **kwargs)
    for active_adapter in self.active_adapters:
        if active_adapter not in self.lora_A.keys():
            continue
        lora_A = self.lora_A[active_adapter]
        lora_B = self.lora_B[active_adapter]
        dropout = self.lora_dropout[active_adapter]
        scaling = self.scaling[active_adapter]

        # Outside autocast the input is cast to the adapter dtype and the result cast back afterwards.
        requires_conversion = not torch.is_autocast_enabled()
        if requires_conversion:
            expected_dtype = result.dtype
            x = self._cast_input_dtype(x, lora_A.weight.dtype)

        if active_adapter in self.lora_variant:
            # The variant receives the running result and returns the updated one.
            result = self.lora_variant[active_adapter].forward(
                self,
                active_adapter=active_adapter,
                x=x,
                result=result,
                **variant_kwargs,
                **kwargs,
            )
            if requires_conversion:
                result = result.to(expected_dtype)
        else:  # vanilla LoRA
            low_rank = lora_A(dropout(x))
            output = lora_B(low_rank) * scaling
            if requires_conversion:
                output = output.to(expected_dtype)
            result = result + output

    return result
+ rep + + def dispatch_bnb_8bit(target: torch.nn.Module, adapter_name: str, **kwargs): + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + loaded_in_8bit = kwargs.get("loaded_in_8bit", False) + if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): + eightbit_kwargs = kwargs.copy() + eightbit_kwargs.update( + { + "has_fp16_weights": target.state.has_fp16_weights, + "threshold": target.state.threshold, + "index": target.index, + } + ) + new_module = Linear8bitLt(target, adapter_name, **eightbit_kwargs) + + return new_module + + +if is_bnb_4bit_available(): + + class Linear4bit(torch.nn.Module, LoraLayer): + # Lora implemented in a dense layer + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + use_rslora: bool = False, + use_dora: bool = False, + arrow_config: ArrowConfig = None, + lora_bias: bool = False, + **kwargs, + ) -> None: + super().__init__() + LoraLayer.__init__(self, base_layer) + self.fan_in_fan_out = False + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + lora_bias=lora_bias, + arrow_config=arrow_config, + ) + + def resolve_lora_variant( + self, *, arrow_config: ArrowConfig, use_dora: bool, use_alora: bool, **kwargs + ) -> Optional[LoraVariant]: + if arrow_config is not None: + from .variants import ArrowLinearVariant + + return ArrowLinearVariant() + + if not use_dora and not use_alora: + return None + + from .variants import ALoraLinearVariant, DoraLinearVariant + + if use_alora: + return ALoraLinearVariant() + else: + return DoraLinearVariant() + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + 
""" + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. + Defaults to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter not in self.lora_A.keys(): + continue + + warnings.warn( + "Merge lora module to 4-bit linear may get different generations due to rounding errors." + ) + # Refer to https://gist.github.com/ChrisHayduk/1a53463331f52dca205e55982baf9930 + weight = self.get_base_layer().weight + kwargs = weight.__dict__ + + output = dequantize_bnb_weight(weight, state=weight.quant_state) + if active_adapter not in self.lora_variant: # vanilla LoRA + lora_data = self.get_delta_weight(active_adapter) + w_data = output + lora_data + else: + w_data = self.lora_variant[active_adapter].merge_safe(self, active_adapter, output) + + if safe_merge and not torch.isfinite(w_data).all(): + raise ValueError( + f"NaNs detected in the merged weights. 
The adapter {active_adapter} seems to be broken" + ) + + if "bnb_quantized" in kwargs: + kwargs["bnb_quantized"] = False + kwargs["requires_grad"] = False + kwargs.pop("data", None) + # torch.compile can introduce attributes preceded by '_', remove them + kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")} + self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), **kwargs).to(weight.device) + + if self.lora_bias[active_adapter]: + bias_data = self.get_base_layer().bias.data + self.lora_B[active_adapter].bias + if safe_merge and not torch.isfinite(bias_data).all(): # reduce to one bool; negating a multi-element tensor raises + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + self.get_base_layer().bias.data = bias_data + + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter not in self.lora_A.keys(): + continue + warnings.warn( + "Unmerge lora module to 4-bit linear may get different generations due to rounding errors."
+ ) + + weight = self.get_base_layer().weight + kwargs = weight.__dict__ + output = dequantize_bnb_weight(weight, state=weight.quant_state) + + if active_adapter not in self.lora_variant: # vanilla LoRA + lora_data = self.get_delta_weight(active_adapter) + w_data = output - lora_data + else: + w_data = self.lora_variant[active_adapter].unmerge(self, active_adapter, output) + + if "bnb_quantized" in kwargs: + kwargs["bnb_quantized"] = False + kwargs["requires_grad"] = False + kwargs.pop("data", None) + self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), **kwargs).to(weight.device) + + if self.lora_bias[active_adapter]: + self.get_base_layer().bias.data -= self.lora_B[active_adapter].bias + + def get_delta_weight(self, adapter): + return ( + transpose( + self.lora_B[adapter].weight @ self.lora_A[adapter].weight, + False, + ) + * self.scaling[adapter] + ) + + def _mixed_batch_forward( + self, x: torch.Tensor, *args: Any, adapter_names: list[str], **kwargs: Any + ) -> torch.Tensor: + # This is a special method that handles the case when users pass the argument `adapter_names`. This is an + # extra argument that allows mixing different adapters in the same batch at inference time. 
+ variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer + result = self.base_layer(x, *args, **kwargs) + + unique_adapters = set(adapter_names) + sub_batch_indices_list = [] + for adapter in unique_adapters: + sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) + + for i, active_adapter in enumerate(unique_adapters): + if active_adapter == "__base__": + continue + if active_adapter not in self.lora_A.keys(): + continue + + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + x = self._cast_input_dtype(x, lora_A.weight.dtype) + + # getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the linear + # layer output + sub_batch = x[sub_batch_indices_list[i]] + if active_adapter not in self.lora_variant: # vanilla LoRA + output = lora_B(lora_A(dropout(sub_batch))) * scaling + if requires_conversion: + output = output.to(expected_dtype) + result[sub_batch_indices_list[i]] += output + else: + sub_variant_kwargs = dict(variant_kwargs) # per-adapter copy so the full-batch alora_offsets stay intact for later adapters + if sub_variant_kwargs.get("alora_offsets", None) is not None: + sub_variant_kwargs["alora_offsets"] = [variant_kwargs["alora_offsets"][j] for j in sub_batch_indices_list[i]] + output = self.lora_variant[active_adapter].forward( + self, + active_adapter=active_adapter, + x=sub_batch, + result=result[sub_batch_indices_list[i]], + **sub_variant_kwargs, + **kwargs, + ) + if requires_conversion: + output = output.to(expected_dtype) + result[sub_batch_indices_list[i]] = output + + return result + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + self._check_forward_args(x, *args, **kwargs) + adapter_names = kwargs.pop("adapter_names", None) + variant_kwargs = {k: kwargs.pop(k, None) for k in
VARIANT_KWARG_KEYS} # don't pass these to base_layer + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif adapter_names is not None: + result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **variant_kwargs, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + # As per Tim Dettmers, for 4bit, we need to defensively clone here. + # The reason is that in some cases, an error can occur that backprop + # does not work on a manipulated view. This issue may be solved with + # newer PyTorch versions but this would need extensive testing to be + # sure. + result = result.clone() + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + x = self._cast_input_dtype(x, lora_A.weight.dtype) + + if active_adapter not in self.lora_variant: # vanilla LoRA + output = lora_B(lora_A(dropout(x))) * scaling + if requires_conversion: + output = output.to(expected_dtype) + result = result + output + else: + result = self.lora_variant[active_adapter].forward( + self, + active_adapter=active_adapter, + x=x, + result=result, + **variant_kwargs, + **kwargs, + ) + if requires_conversion: + result = result.to(expected_dtype) + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." 
+ rep + + def dispatch_bnb_4bit(target: torch.nn.Module, adapter_name: str, **kwargs): + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + loaded_in_4bit = kwargs.get("loaded_in_4bit", False) + if loaded_in_4bit and is_bnb_4bit_available() and isinstance(target_base_layer, bnb.nn.Linear4bit): + fourbit_kwargs = kwargs.copy() + fourbit_kwargs.update( + { + "compute_dtype": target_base_layer.compute_dtype, + "compress_statistics": target_base_layer.weight.compress_statistics, + "quant_type": target_base_layer.weight.quant_type, + } + ) + new_module = Linear4bit(target, adapter_name, **fourbit_kwargs) + + return new_module diff --git a/peft/src/peft/tuners/lora/config.py b/peft/src/peft/tuners/lora/config.py new file mode 100644 index 0000000000000000000000000000000000000000..36ad31ceebe4385cf9be21efad8dd5ec854124bd --- /dev/null +++ b/peft/src/peft/tuners/lora/config.py @@ -0,0 +1,783 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import warnings +from dataclasses import dataclass, field +from typing import Literal, Optional, Union + +from torch import nn + +from peft.config import PeftConfig +from peft.utils import PeftType + + +@dataclass +class LoraRuntimeConfig: + """ + This is the sub-configuration class to store the runtime configurations for the model. 
+ + Args: + ephemeral_gpu_offload (`bool`): + Whether to use ephemeral GPU offloading for models partially kept in CPU memory. + """ + + ephemeral_gpu_offload: bool = field( + default=False, + metadata={ + "help": ( + "Whether to use ephemeral GPU offloading for models partially kept in CPU memory. Ephemeral GPU offloading result in " + "the data involved in intense operations being momentarily copied over to the GPU, and the results copied " + "back to CPU. There is a momentary VRAM overhead, but operations are generally orders of magnitude faster " + "compared to performing them on the CPU. This is useful when parts of the model and/or components (such " + "as adapters) are kept in CPU memory until they are needed. Rather than perform expensive operations on " + "small data, the data is transferred to the GPU on-demand, the operation(s) performed, and the results " + "moved back to CPU memory. Currently only affects DoRA initialization." + ) + }, + ) + + +@dataclass +class LoftQConfig: + """ + This is the sub-configuration class to store the configuration of a [`LoraModel`]. + + Args: + bits_pattern (`dict`): The mapping from layer names or regexp expression to bits which are different from the + default bits specified by `bits`. For example, `{model.decoder.layers.0.encoder_attn.k_proj: 2`}. + bits (`int`): Quantization bits for LoftQ. + iter (`int`): Alternating iterations for LoftQ. + fake (`bool`): True: use fp16/fp32; used for first time to save weights. False: use bitsandbytes 4bit linear + models. weights can't be saved. Recommend to set to True, save the weights and load the saved weights in 4 + bits. + """ + + loftq_bits: int = field(default=4, metadata={"help": "Quantization bits for LoftQ"}) + loftq_iter: int = field(default=1, metadata={"help": "Alternating iterations for LoftQ"}) + + +@dataclass +class ArrowConfig: + """ + This is the sub-configuration class to store the configuration for Arrow and GenKnowSub algorithm. 
Arrow is a + routing algorithm to combine the trained LoRA modules to solve new tasks, proposed in + 'https://arxiv.org/pdf/2405.11157'. GenKnowSub is a refinement on the trained modules before being combined via + Arrow, introduced in 'https://aclanthology.org/2025.acl-short.54/' + """ + + top_k: int = field( + default=3, + metadata={"help": "Number of top LoRA modules to combine in Arrow routing."}, + ) + + router_temperature: float = field( + default=1.0, + metadata={"help": "Softmax temperature for computing Arrow expert coefficients."}, + ) + + use_gks: bool = field( + default=False, + metadata={"help": "Enable GenKnowSub."}, + ) + + task_adapter_names: Optional[list[str]] = field( + default=None, + init=False, + metadata={"help": "list of task-specific LoRA adapter names. It will be set in create_arrow_model()."}, + ) + + gks_adapter_names: Optional[list[str]] = field( + default=None, + init=False, + metadata={ + "help": "list of general LoRA adapter names for GenKnowSub. It will be set in create_arrow_model()." + }, + ) + + rng_seed: Optional[int] = field( + default=None, + metadata={"help": "Optional RNG seed for reproducibility. If None, sampling is non-deterministic."}, + ) + + def __post_init__(self): + if self.top_k <= 0: + raise ValueError("top_k must be greater than 0.") + if self.router_temperature <= 0: + raise ValueError("router_temperature must be greater than 0.") + + +@dataclass +class EvaConfig: + """ + This is the sub-configuration class to store the configuration for a data-driven initialization via EVA. EVA was + introduced in Explained Variance Adaptation. + + Args: + rho (`float`): + Rho value for EVA redistribution (>= 1.0). The maximum rank for a layer is lora_r * rho. Default is 2.0, + meaning the maximum rank allowed for a layer is 2r. Increasing rho will allow for a higher degree of + redistribution of ranks across layers. Some pre-trained models might be more sensitive to a rank + redistribution.
It can therefore be beneficial to try rho=1.0 (no redistribution) if the performance is + lower than expected. + tau (`float`): + Cosine similarity threshold for early stopping. Compares the cosine similarity of right-singular vectors + between two consecutive SVD steps. If the cosine similarity is above this threshold, the SVD iteration is + stopped. Default is 0.99. + use_label_mask (`bool`): + Use label mask for EVA initialization. This means that positions where labels=label_mask_value are ignored + for the SVD computation. Setting use_label_mask=True is preferred in most cases and can be especially + beneficial for multi-turn conversations. The default value is True. Filtering out items based on the label + mask can sometimes lead to a small batch size and as a result instabilities in the SVD computation. For + cases where a large share of batch items would be filtered out, set use_label_mask=False. + label_mask_value (`int`): + If use_label_mask=True the value to look for to mask out ignored tokens. Default is -100. + whiten (`bool`): Apply whitening to singular vectors. Default is False. + Whitening has been shown to be beneficial for EVA in the vision domain. + adjust_scaling_factors (`bool`): + Adjust LoRA scaling factors after the rank redistribution. Setting this to True means the scaling factors + are adjusted so that all LoRA gradients have the same scale regardless of their rank. Default is True. 
+ """ + + rho: float = field(default=2.0, metadata={"help": "Rho value for EVA redistribution"}) + tau: float = field(default=0.99, metadata={"help": "Cosine similarity threshold for early stopping"}) + use_label_mask: bool = field(default=True, metadata={"help": "Use label mask for EVA initialization"}) + label_mask_value: int = field( + default=-100, metadata={"help": "if use_label_mask=True the value to look for to mask out ignored tokens"} + ) + whiten: bool = field(default=False, metadata={"help": "Apply whitening to singular vectors"}) + adjust_scaling_factors: bool = field( + default=True, + metadata={"help": "Adjust LoRA scaling factors after the rank redistribution"}, + ) + + def __post_init__(self): + if self.rho < 1.0: + raise ValueError("`rho` must be >= 1.0") + if self.tau < 0.0 or self.tau > 1.0: + raise ValueError("`tau` must be between 0.0 and 1.0.") + + +@dataclass +class CordaConfig: + """ + This is the sub-configuration class to store the configuration of a [`LoraModel`]. + + Args: + cache_file (`Optional[str]`): + File to store the SVD cache. The SVD cache is much smaller than the residual model (for example, residual + model of Llama-3-8b is 15GB, while SVD cache is 1.4GB), but with SVD cache and original model weights, + residual model weights can be built quickly. If you need to reuse residual model weights with limited + storage, you can store the SVD cache instead. + covariance_file (`Optional[str]`): + File to store the covariance matrix. If you wish to train multiple models with different ranks, but they + sample from the same dataset, you can store the covariance matrix and reuse it for different ranks. Note + that covariance file is usually large (comparable to model size), so you will need sufficient storage. + corda_method (`Literal["ipm", "kpm"]`): + Method to build adapter. 
The KPM (Knowledge-Preserved Mode) not only achieves better performance than LoRA + on fine-tuning tasks, but also mitigates the catastrophic forgetting of pre-trained world knowledge. When + preserving pre-trained knowledge is not a concern, the IPM (Instruction-Previewed Mode) is favored because + it can further accelerate convergence and enhance the fine-tuning performance. Defaults to `'ipm'`. + verbose (`bool`): + If true, prints the progress of CorDA initialization. Defaults to `False`. + use_float16_for_covariance (`bool`): + If true, uses float16 for the covariance matrix. This can reduce the memory usage of the covariance matrix + by half, but may lead to numerical instability. Defaults to `False`. + prune_temporary_fields (`bool`): + If true, temporary fields generated in CorDA preprocessing will be pruned. Defaults to `True`. + """ + + cache_file: Optional[str] = field( + default=None, + metadata={ + "help": ( + "File to store the SVD cache. The SVD cache is much smaller than the residual model (for example, " + "residual model of Llama-3-8b is 15GB, while SVD cache is 1.4GB), but with SVD cache and original model " + "weights, residual model weights can be built quickly. If you need to reuse residual model weights with " + "limited storage, you can store the SVD cache instead." + ) + }, + ) + covariance_file: Optional[str] = field( + default=None, + metadata={ + "help": ( + "File to store the covariance matrix. If you wish to train multiple models with different ranks, but " + "they sample from the same dataset, you can store the covariance matrix and reuse it for different ranks. " + "Note that covariance file is usually large (comparable to model size), so you will need sufficient storage." + ) + }, + ) + corda_method: Literal["ipm", "kpm"] = field( + default="ipm", + metadata={ + "help": ( + "Method to build adapter. 
The KPM not only achieves better performance than LoRA on fine-tuning tasks, but " + "also mitigates the catastrophic forgetting of pre-trained world knowledge. When preserving pre-trained " + "knowledge is not a concern, the IPM is favored because it can further accelerate convergence and enhance " + "the fine-tuning performance." + ) + }, + ) + verbose: bool = field(default=False, metadata={"help": "If true, prints the progress of CorDA initialization."}) + use_float16_for_covariance: bool = field( + default=False, + metadata={ + "help": ( + "If true, uses float16 for the covariance matrix. This can reduce the memory usage of the covariance matrix " + "by half, but may lead to numerical instability." + ) + }, + ) + prune_temporary_fields: bool = field( + default=True, metadata={"help": "If true, temporary fields generated in CorDA preprocessing will be pruned."} + ) + + +@dataclass +class LoraConfig(PeftConfig): + """ + This is the configuration class to store the configuration of a [`LoraModel`]. + + Args: + r (`int`): + Lora attention dimension (the "rank"). + target_modules (`Optional[Union[List[str], str]]`): + The names of the modules to apply the adapter to. If this is specified, only the modules with the specified + names will be replaced. When passing a string, a regex match will be performed. When passing a list of + strings, either an exact match will be performed or it is checked if the name of the module ends with any + of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen (if + the model is a PreTrainedModel, the output layer excluded). If this is not specified, modules will be + chosen according to the model architecture. If the architecture is not known, an error will be raised -- in + this case, you should specify the target modules manually. To avoid targeting any modules (because you want + to apply `target_parameters`), set `target_modules=[]`. 
+ exclude_modules (`Optional[Union[List[str], str]]`): + The names of the modules to not apply the adapter. When passing a string, a regex match will be performed. + When passing a list of strings, either an exact match will be performed or it is checked if the name of the + module ends with any of the passed strings. + lora_alpha (`int`): + The alpha parameter for Lora scaling. + lora_dropout (`float`): + The dropout probability for Lora layers. + fan_in_fan_out (`bool`): + Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses + `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`. + bias (`str`): + Bias type for LoRA. Can be 'none', 'all' or 'lora_only'. If 'all' or 'lora_only', the corresponding biases + will be updated during training. Be aware that this means that, even when disabling the adapters, the model + will not produce the same output as the base model would have without adaptation. + use_rslora (`bool`): + When set to True, uses [Rank-Stabilized LoRA](https://huggingface.co/papers/2312.03732) which sets the + adapter scaling factor to `lora_alpha/math.sqrt(r)`, since it was proven to work better. Otherwise, it will + use the original default value of `lora_alpha/r`. + modules_to_save (`List[str]`): + List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. + init_lora_weights (`bool` | `Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq", "orthogonal"]`): + How to initialize the weights of the adapter layers. Passing True (default) results in the default + initialization from the reference implementation from Microsoft, with the LoRA B weight being set to 0. + This means that without further training, the LoRA adapter will be a no-op. 
Setting the initialization to + False leads to random initialization of LoRA A and B, meaning that LoRA is not a no-op before training; + this setting is intended for debugging purposes. Passing 'gaussian' results in Gaussian initialization + scaled by the LoRA rank for linear and layers. Pass `'loftq'` to use LoftQ initialization. Passing `'eva'` + results in a data-driven initialization of Explained + Variance Adaptation. EVA initializes LoRA based on the SVD of layer input activations and achieves SOTA + performance due to its ability to adapt to the finetuning data. Pass `'olora'` to use OLoRA initialization. + Passing `'pissa'` results in the initialization of Principal Singular values and Singular vectors Adaptation (PiSSA), which converges more rapidly than + LoRA and ultimately achieves superior performance. Moreover, PiSSA reduces the quantization error compared + to QLoRA, leading to further enhancements. Passing `'pissa_niter_[number of iters]'` initiates + Fast-SVD-based PiSSA initialization, where `[number of iters]` indicates the number of subspace iterations + to perform FSVD, and must be a nonnegative integer. When `[number of iters]` is set to 16, it can complete + the initialization of a 7B model within seconds, and the training effect is approximately equivalent to + using SVD. Passing `'corda'` results in the initialization of Context-Oriented Decomposition Adaptation, which + converges even more rapidly than PiSSA in Instruction-Previewed Mode, and preserves world knowledge better + than LoRA in Knowledge-Preserved Mode. Passing `"orthogonal"` results in LoRA A and B being initialized + orthogonally; in this, it resembles `"olora"`, but the base weights are left untouched (requires `r` to be + even, only supported for linear layers for now). + layers_to_transform (`Union[List[int], int]`): + The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices + that are specified in this list.
If a single integer is passed, it will apply the transformations on the + layer at this index. + layers_pattern (`Optional[Union[List[str], str]]`): + The layer pattern name, used only if `layers_to_transform` is different from `None`. This should target the + `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`. + rank_pattern (`dict`): + The mapping from layer names or regexp expression to ranks which are different from the default rank + specified by `r`. For example, `{'^model.decoder.layers.0.encoder_attn.k_proj': 16}`. + alpha_pattern (`dict`): + The mapping from layer names or regexp expression to alphas which are different from the default alpha + specified by `lora_alpha`. For example, `{'^model.decoder.layers.0.encoder_attn.k_proj': 16}`. + megatron_config (`Optional[dict]`): + The TransformerConfig arguments for Megatron. It is used to create LoRA's parallel linear layer. You can + get it like this, `core_transformer_config_from_args(get_args())`, these two functions being from Megatron. + The arguments will be used to initialize the TransformerConfig of Megatron. You need to specify this + parameter when you want to apply LoRA to the ColumnParallelLinear and RowParallelLinear layers of megatron. + megatron_core (`Optional[str]`): + The core module from Megatron to use, defaults to `"megatron.core"`. + trainable_token_indices (`Optional[Union[List[int], dict[str, List[int]]]]`) + Lets you specify which token indices to selectively fine-tune without requiring to re-train the whole + embedding matrix using the `peft.TrainableTokensModel` method. You can specify token indices in two ways. + Either you specify a list of indices which will then target the model's input embedding layer (or, if not + found, `embed_tokens`). Alternatively, you can specify a dictionary where the key is the name of the + embedding module and the values are the list of token indices, e.g. `{'embed_tokens': [0, 1, ...]}`. 
Note + that training with FSDP requires `use_orig_params=True` to avoid issues with non-uniform `requires_grad`. + loftq_config (`Optional[LoftQConfig]`): + The configuration of LoftQ. If this is not None, then LoftQ will be used to quantize the backbone weights + and initialize Lora layers. Also pass `init_lora_weights='loftq'`. Note that you should not pass a + quantized model in this case, as LoftQ will quantize the model itself. + eva_config (`Optional[EvaConfig]`): + The configuration of EVA. At a minimum the dataset argument needs to be set (use the same dataset as for + finetuning). + corda_config (`Optional[CordaConfig]`): + The configuration of CorDA. If this is not None, then CorDA will be used to build the adapter layers. Also + pass `init_lora_weights='corda'`. + use_dora (`bool`): + Enable 'Weight-Decomposed Low-Rank Adaptation' (DoRA). This technique decomposes the updates of the weights + into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the magnitude is + handled by a separate learnable parameter. This can improve the performance of LoRA especially at low + ranks. Right now, DoRA only supports linear and Conv2D layers. DoRA introduces a bigger overhead than pure + LoRA, so it is recommended to merge weights for inference. For more information, see + https://huggingface.co/papers/2402.09353. + alora_invocation_tokens (`List[int]`): + If not None, enable 'Activated LoRA' (aLoRA), with + alora_invocation_tokens being the tokenized invocation string for the adapter (must be present in all model + input strings). This technique selectively activates the adapter weights only on tokens during and after + the alora_invocation_tokens. When used in a CausalLM, this means that the KV cache prior to invocation is + interchangeable with that of the base model (and other aLoRA adapters operating this way). As a result, in + inference pipelines involving switching between base model inference and adapter inference (e.g. 
agentic + pipelines, see paper for examples), significant savings are realized (relative to LoRA) by saving prefill + operations. Overall adapter inference speedups of an order of magnitude or more can occur on vLLM, + depending on the length of the shared context. Note that merging is not possible due to the selective + application of the weights. + layer_replication (`List[Tuple[int, int]]`): + Build a new stack of layers by stacking the original model layers according to the ranges specified. This + allows expanding (or shrinking) the model without duplicating the base model weights. The new layers will + all have separate LoRA adapters attached to them. + runtime_config (`LoraRuntimeConfig`): + Runtime configurations (which are not saved or restored). + lora_bias (`bool`): + Defaults to `False`. Whether to enable the bias term for the LoRA B parameter. Typically, this should be + disabled. The main use case for this is when the LoRA weights were extracted from fully fine-tuned + parameters so the bias of those parameters can be taken into account. + target_parameters (`List[str]`, *optional*) + List of parameter names or regex expression of the parameter names to replace with LoRA. This argument + behaves similarly to `target_modules`, except that the parameter name should be passed. Generally, you + should use `target_modules` to target the module (e.g. `nn.Linear`). However, in some circumstances, this + is not possible. E.g., in many mixture of expert (MoE) layers in HF Transformers, instead of using + `nn.Linear`, an `nn.Parameter` is used. PEFT normally overwrites the `forward` method for LoRA, but for + `nn.Parameter`, there is none. Therefore, to apply LoRA to that parameter, it needs to be targeted with + `target_parameters`. As an example, for Llama4, you can pass: + `target_parameters=['feed_forward.experts.gate_up_proj', 'feed_forward.experts.down_proj]`. Passing a + string for regex matching is not implemented yet. 
+ """ + + r: int = field(default=8, metadata={"help": "Lora attention dimension"}) + target_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": ( + "List of module names or regex expression of the module names to replace with LoRA. " + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. " + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D " + "(if the model is a PreTrainedModel, the output layer excluded). " + "If not specified, modules will be chosen according to the model architecture, If the architecture is " + "not known, an error will be raised -- in this case, you should specify the target modules manually. " + "To avoid targeting any modules (because you want to apply `target_parameters`), set " + "`target_modules=[]`." + ), + }, + ) + exclude_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={"help": "List of module names or regex expression of the module names to exclude from Lora."}, + ) + lora_alpha: int = field(default=8, metadata={"help": "Lora alpha"}) + lora_dropout: float = field(default=0.0, metadata={"help": "Lora dropout"}) + fan_in_fan_out: bool = field( + default=False, + metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, + ) + bias: Literal["none", "all", "lora_only"] = field( + default="none", metadata={"help": "Bias type for Lora. Can be 'none', 'all' or 'lora_only'"} + ) + use_rslora: bool = field( + default=False, + metadata={ + "help": ( + "When set to True, uses [Rank-Stabilized LoRA](https://huggingface.co/papers/2312.03732)" + " which sets the adapter scaling factor to `lora_alpha/math.sqrt(r)`, since it" + " was proven to work better. Otherwise, it will use the original default" + " value of `lora_alpha/r`." 
+ ) + }, + ) + modules_to_save: Optional[list[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." + }, + ) + init_lora_weights: ( + bool + | Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq", "orthogonal"] + ) = field( + default=True, + metadata={ + "help": ( + "How to initialize the weights of the LoRA layers. " + "Passing True (default) results in the default initialization from the reference implementation from " + "Microsoft, with the LoRA B weight being set to 0. This means that without further training, the LoRA " + "adapter will be a no-op. " + "Setting the initialization to False leads to random initialization of LoRA A and B, meaning that LoRA " + "is not a no-op before training; this setting is intended for debugging purposes. " + "Passing `'gaussian'` results in Gaussian initialization scaled by the LoRA rank for linear and layers. " + "Passing `'eva'` results in a data-driven initialization of Explained Variance Adaptation. " + "Passing `'olora'` results in OLoRA initialization. " + "Passing `'pissa'` results in PiSSA initialization. " + "Passing `'pissa_niter_[number of iters]'` initiates Fast-SVD-based PiSSA initialization, where " + "[number of iters] indicates the number of subspace iterations to perform fsvd, and must be a " + "nonnegative integer. " + "Passing `'corda'` results in CorDA initialization. " + "Pass `'loftq'` to use LoftQ initialization. " + "Pass `'orthogonal'` for orthogonal initialization of LoRA A and B." 
+ ), + }, + ) + layers_to_transform: Optional[Union[list[int], int]] = field( + default=None, + metadata={ + "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index. " + "This only works when target_modules is a list of str." + }, + ) + layers_pattern: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." + "This only works when target_modules is a list of str. This should target the `nn.ModuleList` of the " + "model, which is often called `'layers'` or `'h'`." + }, + ) + rank_pattern: Optional[dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. " + "For example, `{'^model.decoder.layers.0.encoder_attn.k_proj': 16}`." + ) + }, + ) + alpha_pattern: Optional[dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `lora_alpha`. " + "For example, `{'^model.decoder.layers.0.encoder_attn.k_proj': 16}`." + ) + }, + ) + megatron_config: Optional[dict] = field( + default=None, + metadata={ + "help": ( + "The TransformerConfig from Megatron. It is used to create LoRA's parallel linear layer." + "You can get it like this, `core_transformer_config_from_args(get_args())`, " + "these two functions being from Megatron." + "You need to specify this parameter when you want to apply LoRA to the ColumnParallelLinear and " + "RowParallelLinear layers of megatron." 
+ "It should be noted that we may not be able to use the `save_pretrained` and `from_pretrained` " + "functions, because TransformerConfig may not necessarily be serialized." + "But when using megatron, we can use `get_peft_model_state_dict` function and " + "megatron's framework, they can also save and load models and configurations." + ) + }, + ) + megatron_core: Optional[str] = field( + default="megatron.core", + metadata={ + "help": ( + "The core module from Megatron, it is used to create LoRA's parallel linear layer. " + "It only needs to be passed in when you need to use your own modified megatron core module. " + "Otherwise, it will use the default value `megatron.core`. " + ) + }, + ) + trainable_token_indices: Optional[Union[list[int], dict[str, list[int]]]] = field( + default=None, + metadata={ + "help": ( + "Lets you specify which token indices to selectively fine-tune without requiring to re-train the " + "whole embedding matrix using the `peft.TrainableTokensModel` method. You can specify token indices " + "in two ways. Either you specify a list of indices which will then target the model's input embedding " + "layer (or, if not found, `embed_tokens`). Alternatively, you can specify a dictionary where the key " + "is the name of the embedding module and the values are the list of token indices, e.g. " + "`{'embed_tokens': [0, 1, ...]}`. Note that training with FSDP requires `use_orig_params=True` to " + "avoid issues with non-uniform `requires_grad`." + ) + }, + ) + # dict type is used when loading config.json + loftq_config: Union[LoftQConfig, dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The configuration of LoftQ. If this is passed, then LoftQ will be used to quantize the backbone " + "weights and initialize Lora layers. Also set `init_lora_weights='loftq'` in this case." + ) + }, + ) + eva_config: Optional[EvaConfig] = field( + default=None, + metadata={ + "help": ( + "The configuration of EVA. 
If this is passed, then EVA will be used to initialize the LoRA layers. " + "Also set `init_lora_weights='eva'` in this case. " + ) + }, + ) + corda_config: Optional[CordaConfig] = field( + default=None, + metadata={ + "help": ( + "The configuration of CorDA. If this is passed, then CorDA will be used to build the adapter layers. " + "Also set `init_lora_weights='corda'` in this case." + ) + }, + ) + use_dora: bool = field( + default=False, + metadata={ + "help": ( + "Enable 'Weight-Decomposed Low-Rank Adaptation' (DoRA). This technique decomposes the updates of the " + "weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the " + "magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, " + "especially at low ranks. Right now, DoRA only supports linear and Conv2D layers. DoRA introduces a bigger" + "overhead than pure LoRA, so it is recommended to merge weights for inference." + ) + }, + ) + alora_invocation_tokens: Optional[list[int]] = field( + default=None, + metadata={ + "help": ( + "If not None, enable 'Activated LoRA' (aLoRA), with " + "alora_invocation_tokens being the tokenized invocation string for the adapter (must be present in all model " + "input strings). This technique selectively activates the adapter weights only on tokens during and after " + "the alora_invocation_tokens. When used in a CausalLM, this means that the KV cache prior to invocation is " + "interchangeable with that of the base model (and other aLoRA adapters operating this way). As a result, in " + "inference pipelines involving switching between base model inference and adapter inference (e.g. agentic " + "pipelines, see paper for examples), significant savings are realized (relative to LoRA) by saving prefill " + "operations. Overall adapter inference speedups of an order of magnitude or more can occur on vLLM, " + "depending on the length of the shared context. 
Note that merging is not possible due to the selective " + "application of the weights." + ) + }, + ) + use_qalora: bool = field( + default=False, + metadata={ + "help": ( + "It is only implemented in GPTQ for now. Enable Quantization-Aware Low-Rank Adaptation (QALoRA)." + "This technique combines quantization-aware training " + "with LoRA to improve performance for quantized models. This can improve the performance of LoRA, " + "especially at low ranks. Right now, QALoRA only supports linear layers." + ) + }, + ) + qalora_group_size: int = field( + default=16, + metadata={ + "help": ( + "Group size parameter for QALoRA pooling, controlling the dimension reduction factor. " + "Input dimensions are pooled into groups of this size, reducing the computational cost. " + "Higher values provide more compression but may reduce model quality. " + "This parameter determines how many original features are averaged together to create " + "one pooled feature. Only used when `use_qalora=True`." + ) + }, + ) + # Enables replicating layers in a model to expand it to a larger model. + layer_replication: Optional[list[tuple[int, int]]] = field( + default=None, + metadata={ + "help": ( + "This enables using LoRA to effectively expand a transformer model to a larger size by repeating some layers. " + "The transformation handles models (currently Llama, Bert or Falcon compatible architectures) with " + "a module list in the model which it modifies to expand the number of modules. " + "Base weights are shared so the memory usage is close to the original model. The intended use is these base weights " + "remain fixed during finetuning but each layer has a separate LoRA adapter so the layers can be specialed via " + "the adapter layers fit during fine tuning." + "The format is a list of [start, end) pairs which specify the layer ranges to stack. 
For example:\n" + " Original model has 5 layers labelled by their position in the model: `[0, 1, 2, 3, 4]`\n" + " layer_replication: `[[0, 4], [2, 5]]`\n" + " Final model will have this arrangement of original layers: `[0, 1, 2, 3, 2, 3, 4]`\n" + "This format is based on what is used for pass-through merges in mergekit. It makes it simple to select sequential " + "ranges of a model and stack them while reusing layers at either end of each sequence." + ) + }, + ) + runtime_config: LoraRuntimeConfig = field( + default_factory=LoraRuntimeConfig, metadata={"help": "Runtime configurations"} + ) + lora_bias: bool = field( + default=False, + metadata={ + "help": ( + "Whether to enable the bias term for the LoRA B parameter. Typically, this should be disabled. The " + "main use case for this is when the LoRA weights were extracted from fully fine-tuned parameters so " + "the bias of those parameters can be taken into account." + ) + }, + ) + target_parameters: Optional[list[str]] = field( + default=None, + metadata={ + "help": ( + "List of parameter names or regex expression of the parameter names to replace with LoRA. " + "This argument behaves similarly to `target_modules`, except that the parameter name should be passed. " + "Generally, you should use `target_modules` to target the module (e.g. `nn.Linear`). However, in some " + "circumstances, this is not possible. E.g., in many mixture of expert (MoE) layers in HF Transformers, " + "instead of using `nn.Linear`, an `nn.Parameter` is used. PEFT normally overwrites the `forward` " + "method for LoRA, but for `nn.Parameter`, there is none. Therefore, to apply LoRA to that parameter, " + "it needs to be targeted with `target_parameters`. As an example, for Llama4, you can pass: " + "`target_parameters=['feed_forward.experts.gate_up_proj', 'feed_forward.experts.down_proj]`. Passing a " + "string for regex matching is not implemented yet." 
+ ) + }, + ) + arrow_config: Optional[ArrowConfig] = field( + default=None, metadata={"help": "The necessary config to apply arrow routing on the model."} + ) + + def to_dict(self): + """ + Returns the configuration for your adapter model as a dictionary. Removes runtime configurations. + """ + rv = super().to_dict() + rv.pop("runtime_config") + return rv + + def __post_init__(self): + super().__post_init__() + self.peft_type = PeftType.LORA + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) + self.exclude_modules = ( + set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules + ) + if isinstance(self.target_parameters, str): + raise TypeError("`target_parameters` must be a list of strings or None.") + + # if target_modules is a regex expression, then layers_to_transform should be None + if isinstance(self.target_modules, str) and self.layers_to_transform is not None: + raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.") + + # if target_modules is a regex expression, then layers_pattern should be None + if isinstance(self.target_modules, str) and self.layers_pattern is not None: + raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.") + + # check for layers_to_transform and layers_pattern + if self.layers_pattern and not self.layers_to_transform: + raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ") + + if self.use_dora and self.megatron_config: + raise ValueError("DoRA does not support megatron_core, please set `use_dora=False`.") + + # handle init_lora_weights and loftq_config + if self.init_lora_weights == "loftq": + import importlib + + if not importlib.util.find_spec("scipy"): + raise ImportError("The required package 'scipy' is not installed. 
Please install it to continue.") + if not self.loftq_config: + raise ValueError("`loftq_config` must be specified when `init_lora_weights` is 'loftq'.") + if not isinstance(self.loftq_config, dict): + # convert loftq_config to dict + self.loftq_config = vars(self.loftq_config) + elif self.loftq_config: + self.loftq_config = {} + warnings.warn("`loftq_config` specified but will be ignored when `init_lora_weights` is not 'loftq'.") + + elif self.init_lora_weights == "eva" and self.eva_config is None: + warnings.warn("`init_lora_weights` is 'eva' but `eva_config` is not specified. Using default EVA config.") + self.eva_config = EvaConfig() + elif self.init_lora_weights != "eva" and self.eva_config is not None: + warnings.warn("`eva_config` specified but will be ignored when `init_lora_weights` is not 'eva'.") + + elif self.init_lora_weights == "corda" and self.corda_config is None: + warnings.warn( + "`init_lora_weights` is 'corda' but `corda_config` is not specified. Using default CorDA config." + ) + self.corda_config = CordaConfig() + elif self.init_lora_weights != "corda" and self.corda_config is not None: + warnings.warn("`corda_config` specified but will be ignored when `init_lora_weights` is not 'corda'.") + + if self.lora_bias: + if self.init_lora_weights not in (True, False): + raise ValueError( + f"The argument lora_bias=True is only supported with init_lora_weights=True or False, got " + f"init_lora_weights={self.init_lora_weights} instead." + ) + if self.use_dora: + raise ValueError("The argument lora_bias=True is not supported for DoRA, please pass use_dora=False") + + if self.alora_invocation_tokens is not None and self.task_type != "CAUSAL_LM": + warnings.warn("aLoRA is currently only supported for CAUSAL_LM task.") + + # Using post training conversion of modified base weights to restore their initial values PiSSA/CorDA/OLoRA cannot + # be correctly done when using rslora + rank_pattern/alpha_pattern. 
We can't really know if the user intends + # this when they'll eventually call save_pretrained (i.e. if they'll pass + # path_initial_model_for_weight_conversionl). Therefore, we only warn but don't raise an error here. + if ( + self.use_rslora + and (self.rank_pattern or self.alpha_pattern) + and ( + (isinstance(self.init_lora_weights, str) and (self.init_lora_weights.startswith("pissa"))) + or (self.init_lora_weights == "olora") + or (self.init_lora_weights == "corda") + ) + ): + msg = ( + "Using Rank-Stabilized LoRA with rank_pattern/alpha_pattern and post-training conversion of modified " + "base weights PiSSA/CorDA/OLoRA means that you won't be able to pass " + "`path_initial_model_for_weight_conversion` to `save_pretrained` to restore the initial values of the " + "base weights; if you intend to do this, please ensure not to use rslora or rank_pattern/alpha_pattern." + ) + warnings.warn(msg) + + self._custom_modules: Optional[dict[type[nn.Module], type[nn.Module]]] = None + + def _register_custom_module(self, mapping: dict[type[nn.Module], type[nn.Module]]) -> None: + """ + Experimental API to support providing custom LoRA layers. + + This API is subject to change, you should carefully read the docs before deciding to use it: + + https://huggingface.co/docs/peft/developer_guides/custom_models + + To register custom LoRA module types, call this method with a `mapping` argument that is a dict that maps from + the target layer type to the custom LoRA layer type. The dict can contain multiple items if you wish to target + multiple layer types. The target layer type can be any nn.Module that we currently don't support in PEFT, + whether that is an official PyTorch layer type or a custom layer type. The custom LoRA module class has to be + implemented by the user and follow the PEFT conventions for LoRA layers. 
+ + """ + if self._custom_modules is None: + self._custom_modules = {} + self._custom_modules.update(mapping) diff --git a/peft/src/peft/tuners/lora/corda.py b/peft/src/peft/tuners/lora/corda.py new file mode 100644 index 0000000000000000000000000000000000000000..d9eba35eb8a55ee76324c32e4bbfac127506633d --- /dev/null +++ b/peft/src/peft/tuners/lora/corda.py @@ -0,0 +1,360 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Reference code: https://github.com/iboing/CorDA/blob/main/cordalib/decomposition.py +# Reference paper: https://huggingface.co/papers/2406.05223 + +import os +from collections.abc import Iterable +from typing import Any, Callable, Optional + +import torch +import torch.nn as nn +from attr import dataclass +from tqdm import tqdm + +from peft.tuners.lora.config import LoraConfig +from peft.tuners.lora.model import LoraModel +from peft.utils.other import get_pattern_key + + +@dataclass +class CordaEigens: + S_WC: torch.Tensor + U_WC: torch.Tensor + V_WC: torch.Tensor + + +def target_modules(model: nn.Module, config: LoraConfig) -> Iterable[nn.Module]: + """ + Iterate over CorDA target name and modules of a model. A module is a target if its name is in + `config.target_modules` and is `nn.Linear`. 
+ """ + for name, module in model.named_modules(): + if LoraModel._check_target_module_exists(config, name) and isinstance(module, nn.Linear): + yield name, module + + +def get_model_device(model: nn.Module) -> str: + if hasattr(model, "module"): # Handle DeepSpeed/DataParallel + model = model.module + return next(iter(model.parameters())).device.type + + +@torch.no_grad() +def preprocess_corda( + model: nn.Module, + lora_config: LoraConfig, + run_model: Optional[Callable[[], None]] = None, + hooked_model: Optional[nn.Module] = None, +): + """ + Build necessary CorDA fields for a model. + + For each `M * N` linear layer, a `M * M` covariance matrix will be built temporarily during the preprocessing + process, consuming roughly another `2 * MODEL_SIZE` memory for typical LLMs if model weight is FP16 and covariance + is FP32. If that's too much, consider specifying `use_float16_for_covariance` in `lora_config.corda_config`. + + Args: + model (`nn.Module`): + Model to preprocess. + lora_config (`LoraConfig`): + Lora configuration of the model. `lora_config.corda_config` should be set. + run_model (`Optional[Callable[[], None]]`): + Callback to run the model when building covariance. Typically you should run model inference on your sample + dataset in this callback. Experiments have shown that when token count per sample is 2048, hidden dimension + is 4096, collecting 256 distinct samples is enough. If you collect too few or too repetitive samples, the + covariance matrix may be low-ranked and unstabilize preprocessing. You can estimate sample count as + `HIDDEN_DIM / TOKEN_PER_SAMPLE * 128`. `run_model` can be `None` only if covariance file in + `lora_config.corda_config` is already created. + hooked_model (`Optional[nn.Module]`): + Model to hook when building covariance. If none, original model will be hooked. This is only useful when + you want to hook a different model than the one you are training, typically you should leave this `None`. 
+ + Upon completion, the following fields are set for each target module: + eigens.S_WC (`torch.Tensor`): + Singular values of the weight matrix. + eigens.U_WC (`torch.Tensor`): + Left singular vectors of the weight matrix. + eigens.V_WC (`torch.Tensor`): + Right singular vectors of the weight matrix, multiplied by inverse of covariance matrix. + """ + cache_file = lora_config.corda_config.cache_file + covariance_file = lora_config.corda_config.covariance_file + corda_method = lora_config.corda_config.corda_method + verbose = lora_config.corda_config.verbose + prune_temporary_fields = lora_config.corda_config.prune_temporary_fields + + # If cache exists, skip building + if cache_file is not None and os.path.exists(cache_file) and os.path.getsize(cache_file) > 0: + cache = torch.load(cache_file, map_location=get_model_device(model)) + for name, module in target_modules(model, lora_config): + module.eigens = CordaEigens( + S_WC=cache[f"{name}.eigens.S_WC"], + U_WC=cache[f"{name}.eigens.U_WC"], + V_WC=cache[f"{name}.eigens.V_WC"], + ) + else: + # Specify CorDA method for each layer + if corda_method is None: + raise ValueError("corda_method is required when cache_file is not provided.") + for name, module in target_modules(model, lora_config): + module.corda_method = corda_method + + # Specify CorDA rank for each layer + for name, module in target_modules(model, lora_config): + r_key = get_pattern_key(lora_config.rank_pattern.keys(), name) + module.rank = lora_config.rank_pattern.get(r_key, lora_config.r) + + # Calculate covariance matrix + calib_cov_distribution(model, lora_config, run_model, hooked_model, covariance_file) + + # Calculate eigens + collect_eigens(model, lora_config, verbose) + + # Crop CorDA eigens so that there's less to save + crop_corda_eigens(model, lora_config) + + # Remove redundant fields if exist + if prune_temporary_fields: + for name, module in target_modules(model, lora_config): + if hasattr(module, "sample_count"): + del 
module.sample_count + if hasattr(module, "covariance_matrix"): + del module.covariance_matrix + if hasattr(module, "corda_method"): + del module.corda_method + if hasattr(module, "rank"): + del module.rank + + # Save cache to disk + if cache_file is not None: + cache: dict[str, Any] = {} + for name, module in target_modules(model, lora_config): + cache[f"{name}.eigens.S_WC"] = module.eigens.S_WC + cache[f"{name}.eigens.U_WC"] = module.eigens.U_WC + cache[f"{name}.eigens.V_WC"] = module.eigens.V_WC + + os.makedirs(os.path.dirname(cache_file), exist_ok=True) + torch.save(cache, cache_file) + + +@torch.no_grad() +def calib_cov_distribution( + model: nn.Module, + config: LoraConfig, + run_model: Optional[Callable[[], None]], + hooked_model: Optional[nn.Module], + covariance_file: Optional[str], +): + if covariance_file is not None and os.path.exists(covariance_file) and os.path.getsize(covariance_file) > 0: + all_covariance_matrix = torch.load(covariance_file, map_location=get_model_device(model)) + for name, module in target_modules(model, config): + module.covariance_matrix = all_covariance_matrix[name] + return + + if run_model is None: + raise ValueError("run_model must be specified when covariance file and cache file aren't built.") + if hooked_model is None: + hooked_model = model + hooked_model.eval() + + def hook(module, input, output): + input = input[0].detach().squeeze(0).data ## (context_length = 2048, dim) + if not config.corda_config.use_float16_for_covariance: + input = input.float() + input = input / torch.max(input).abs() + + # check if input is valid + if torch.isnan(input).any() or torch.isinf(input).any(): + raise ValueError("Invalid value found in input, please check your input data.") + + # calculate covariance and check if it's valid + covariance = input.t().matmul(input) + if torch.isnan(covariance).any() or torch.isinf(covariance).any(): + raise ValueError( + "Invalid value found in covariance. 
Please file an issue at https://github.com/huggingface/peft/issues." + ) + + # add to module + module.sample_count += 1 + module.covariance_matrix += covariance + + # free memory + del covariance, input + + handles = [] + for name, module in target_modules(hooked_model, config): + module.sample_count = 0 + module.covariance_matrix = 0 + handles.append(module.register_forward_hook(hook)) + + run_model() + + # Clear the hooks + for handle in handles: + handle.remove() + + # In some edge cases you might need to hook a model different from the model to add adapters, + # this case you would specify `hooked_model` and set it to a different model from `model`. + if hooked_model is not model: + targets = {} + for name, module in target_modules(model, config): + targets[name] = module + for name, module in target_modules(hooked_model, config): + # There can be modules used only in inference, but not training + # Exclude modules not in target model to prevent KeyError in this case + if name in targets: + targets[name].sample_count = module.sample_count + targets[name].covariance_matrix = module.covariance_matrix + + # Divide by sample count + for name, module in target_modules(model, config): + module.covariance_matrix /= module.sample_count + + # Save covariance to disk + if covariance_file is not None: + all_covariance_matrix = {} + for name, module in target_modules(model, config): + all_covariance_matrix[name] = module.covariance_matrix + os.makedirs(os.path.dirname(covariance_file), exist_ok=True) + torch.save(all_covariance_matrix, covariance_file) + + +@torch.no_grad() +def collect_eigens( + model: nn.Module, + config: LoraConfig, + verbose: bool, +): + """Call collect_eigens_for_layer and store result in key `eigens` of each layer.""" + linear_modules = [] + for name, module in target_modules(model, config): + linear_modules.append((name, module)) + if verbose: + linear_modules = tqdm(linear_modules, desc="Collecting eigens") + for name, module in linear_modules: + 
module.eigens = collect_eigens_for_layer(module, config)


@torch.no_grad()
def collect_eigens_for_layer(
    linear: nn.Linear,
    config: LoraConfig,
) -> CordaEigens:
    """
    Compute the covariance-oriented SVD factors for a single linear layer.

    The weight is right-multiplied by a damped copy of `linear.covariance_matrix`, the product is decomposed with
    `torch.linalg.svd`, and the right factor is multiplied by the inverse of the damped covariance. The damping
    factor starts at 0.01 and is doubled until the inverse reproduces the identity with an error below 0.05.

    Args:
        linear (nn.Linear): Layer whose `weight` and `covariance_matrix` attributes are read.
        config (LoraConfig): Not read by this function.

    Returns:
        CordaEigens: `S_WC` of shape `(r,)`, `U_WC` of shape `(out_dim, r)` and `V_WC` of shape `(in_dim, r)` with
        `r = min(in_dim, out_dim)`. `U_WC` and `V_WC` are moved to CPU.

    Raises:
        ValueError: If `linear` has no `covariance_matrix`, if that matrix contains non-finite values, or if one of
            the computed factors does not have the expected shape.
    """
    w = linear.weight.data.float()
    out_dim = w.size(0)
    in_dim = w.size(1)
    min_dim = min(in_dim, out_dim)

    if not hasattr(linear, "covariance_matrix"):
        raise ValueError(
            "Covariance matrix not found in linear module. Please do not call this function directly, "
            "instead call `preprocess_corda`. If your usage is correct but this error still encounters, "
            "please file an issue at https://github.com/huggingface/peft/issues."
        )
    covariance_matrix = linear.covariance_matrix.float()
    if not torch.isfinite(covariance_matrix).all():
        # NaN/inf entries can never pass the `inv_error < 0.05` test below, so the damping loop would not terminate.
        raise ValueError(
            "Covariance matrix contains NaN or inf values, "
            "please file an issue at https://github.com/huggingface/peft/issues."
        )

    # Loop invariants: the identity (created directly on the covariance device) and the mean of the diagonal.
    identity = torch.eye(covariance_matrix.size(0), device=covariance_matrix.device, dtype=covariance_matrix.dtype)
    mean_diag = torch.mean(torch.diag(covariance_matrix))

    damp = 0.01
    while True:
        fix_covariance_matrix = covariance_matrix + identity * (mean_diag * damp)
        cov_inv = torch.linalg.inv(fix_covariance_matrix)
        inv_error = torch.dist(fix_covariance_matrix @ cov_inv, identity).item()
        if inv_error < 0.05:
            break
        damp = damp * 2
    w = w @ fix_covariance_matrix  # w: out_dim, in_dim; covariance_matrix: in_dim, in_dim

    U, S, Vh = torch.linalg.svd(w, full_matrices=False)
    V = (Vh @ cov_inv).transpose(0, 1)

    # Sanity check, temporarily U and V are large, they will be cropped after rank search
    r = min_dim
    if U.size(0) != out_dim or U.size(1) != r:
        raise ValueError(
            f"Matrix U size mismatch: {U.size()} vs. ({out_dim}, {r}), "
            "please file an issue at https://github.com/huggingface/peft/issues."
        )
    if S.size(0) != r:
        raise ValueError(
            f"Matrix S size mismatch: {S.size()} vs. ({r},), "
            "please file an issue at https://github.com/huggingface/peft/issues."
        )
    if V.size(0) != in_dim or V.size(1) != r:
        raise ValueError(
            f"Matrix V size mismatch: {V.size()} vs. ({in_dim}, {r}), "
            "please file an issue at https://github.com/huggingface/peft/issues."
        )

    # Offload U and V to CPU, they consume too much memory
    U = U.cpu()
    V = V.cpu()
    return CordaEigens(
        S_WC=S,
        U_WC=U,
        V_WC=V,
    )


@torch.no_grad()
def crop_corda_eigens(model: nn.Module, config: LoraConfig):
    """
    Crop the stored factors of every target module to `module.rank` components.

    For `corda_method == "ipm"` the leading `rank` entries/columns are kept, for `"kpm"` the trailing ones. The
    cropped `U_WC` and `V_WC` are moved to the device returned by `get_model_device(model)`.

    Args:
        model (nn.Module): Model whose target modules carry `eigens`, `rank` and `corda_method` attributes.
        config (LoraConfig): Forwarded to `target_modules` to select the modules.

    Raises:
        ValueError: If `corda_method` is neither `"ipm"` nor `"kpm"`, or if a cropped factor has an unexpected
            shape.
    """
    device = get_model_device(model)
    for _name, module in target_modules(model, config):
        rank = module.rank
        eigens = module.eigens
        if module.corda_method == "ipm":
            keep = slice(None, rank)
        elif module.corda_method == "kpm":
            keep = slice(-rank, None)
        else:
            raise ValueError(f"Invalid corda_method found: {module.corda_method}, it should be 'ipm' or 'kpm'.")

        # We don't expect saving sliced tensor writes the whole tensor to disk,
        # so it's necessary to copy the tensors.
        # Reference: https://github.com/pytorch/pytorch/issues/40157
        eigens.S_WC = eigens.S_WC[keep].clone()
        eigens.U_WC = eigens.U_WC[:, keep].clone().to(device)
        eigens.V_WC = eigens.V_WC[:, keep].clone().to(device)

        # Sanity check
        if eigens.S_WC.size(0) != rank:
            raise ValueError(
                f"rank mismatch: {eigens.S_WC.size(0)} vs. {rank}, "
                "please file an issue at https://github.com/huggingface/peft/issues."
            )
        if eigens.U_WC.size(0) != module.weight.size(0):
            raise ValueError(
                f"U size mismatch: {eigens.U_WC.size(0)} vs. {module.weight.size(0)}, "
                "please file an issue at https://github.com/huggingface/peft/issues."
            )
        if eigens.U_WC.size(1) != rank:
            raise ValueError(
                f"U size mismatch: {eigens.U_WC.size(1)} vs. {rank}, "
                "please file an issue at https://github.com/huggingface/peft/issues."
            )
        if eigens.V_WC.size(0) != module.weight.size(1):
            raise ValueError(
                f"V size mismatch: {eigens.V_WC.size(0)} vs. {module.weight.size(1)}, "
                "please file an issue at https://github.com/huggingface/peft/issues."
            )
        if eigens.V_WC.size(1) != rank:
            raise ValueError(
                f"V size mismatch: {eigens.V_WC.size(1)} vs. {rank}, "
                "please file an issue at https://github.com/huggingface/peft/issues."
            )


# diff --git a/peft/src/peft/tuners/lora/dora.py b/peft/src/peft/tuners/lora/dora.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..f38a7df125651dca198286cd3bbde0dea148784b
# --- /dev/null
# +++ b/peft/src/peft/tuners/lora/dora.py
# @@ -0,0 +1,203 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from copy import deepcopy

import torch
import torch.nn.functional as F
from torch import nn

from peft.utils.integrations import dequantize_module_weight, gather_params_ctx
from peft.utils.other import transpose


class DoraLinearLayer(nn.Module):
    """
    Holds the DoRA magnitude parameter (`self.weight`) for a linear base layer and computes the DoRA correction that
    is added on top of the base layer output.
    """

    def __init__(self, fan_in_fan_out):
        super().__init__()
        self.fan_in_fan_out = fan_in_fan_out

    def get_weight_norm(self, weight, lora_weight, scaling) -> torch.Tensor:
        """Return the L2 norm over `dim=1` of `weight + scaling * lora_weight`, cast to the weight dtype."""
        # `transpose` receives `fan_in_fan_out` so that the stored weight layout matches `lora_weight`
        weight = transpose(weight, self.fan_in_fan_out)
        weight = weight + scaling * lora_weight
        weight_norm = torch.linalg.norm(weight, dim=1).to(weight.dtype)
        return weight_norm

    def update_layer(self, *, base_layer, lora_A, lora_B, scaling, place_on_cpu=False) -> None:
        """
        Initialize the magnitude parameter `self.weight` from the norm of the base weight plus the scaled LoRA
        product.

        Args:
            base_layer: Module whose (possibly quantized) weight is read via `dequantize_module_weight`.
            lora_A, lora_B: LoRA weight tensors.
            scaling: Factor applied to the LoRA product.
            place_on_cpu (bool): If True, the resulting parameter is moved to CPU.
        """
        # temporarily convert fp16 to fp32, as fp16 can cause trouble on CPU with PyTorch < 2.2
        dtype_is_fp16 = lora_A.dtype == torch.float16
        if dtype_is_fp16:
            lora_A = lora_A.float()
            lora_B = lora_B.float()

        with gather_params_ctx(base_layer.parameters()):
            if base_layer.__class__.__name__ == "Linear4bit":
                # We have to create a copy of the base layer, otherwise, FSDP will throw an error. 8bit does not work
                # yet because Int8Params cannot be correctly deep-copied (attributes vanish)
                base_layer = deepcopy(base_layer)

            weight = dequantize_module_weight(base_layer)
            if weight.data.ndim >= 3:  # For handling LoRAs applied to Conv layers.
                r = lora_A.shape[0]
                lora_weight = torch.mm(lora_B.view([-1, r]), lora_A.view([r, -1]))
                lora_weight = lora_weight.reshape(weight.shape)
            else:
                lora_weight = lora_B @ lora_A

            if dtype_is_fp16:
                lora_weight = lora_weight.half()
            weight_norm = self.get_weight_norm(weight.to(lora_A.device), lora_weight, scaling)

        if place_on_cpu:
            weight_norm = weight_norm.to("cpu")
        self.weight = nn.Parameter(weight_norm, requires_grad=True)

    def forward(self, x, *, lora_A, lora_B, scaling, base_layer, base_result=None):
        """
        For DoRA, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer
        output.
        """
        # Don't use `lora_weight = lora_B.weight @ lora_A.weight` because this causes errors with FSDP. Instead,
        # calculate the same but using forward.
        x_eye = torch.eye(lora_A.weight.shape[1], device=lora_A.weight.device, dtype=x.dtype)
        lora_weight = lora_B(lora_A(x_eye)).T

        magnitude = self.weight
        weight = dequantize_module_weight(base_layer)
        weight = weight.to(x.dtype)
        weight_norm = self.get_weight_norm(weight, lora_weight.detach(), scaling)
        # see section 4.3 of DoRA (https://huggingface.co/papers/2402.09353)
        # "[...] we suggest treating ||V +∆V ||_c in
        # Eq. (5) as a constant, thereby detaching it from the gradient
        # graph. This means that while ||V + ∆V ||_c dynamically
        # reflects the updates of ∆V , it won’t receive any gradient
        # during backpropagation"
        weight_norm = weight_norm.detach()
        mag_norm_scale = (magnitude / weight_norm).view(1, -1)

        lora_result = lora_B(lora_A(x))

        if base_result is not None:
            # a precomputed base output includes the bias; remove it before rescaling
            bias = base_layer.bias
            if bias is not None:
                base_result = base_result - bias
        else:
            base_result = F.linear(x, transpose(weight, self.fan_in_fan_out))

        result_dora = (mag_norm_scale - 1) * base_result + mag_norm_scale * lora_result * scaling

        return result_dora

    def __repr__(self) -> str:
        # Subclasses inherit this method. Overriding it with another wrapper that also prepends the prefix would
        # stack it ("lora.dora.lora.dora....").
        rep = super().__repr__()
        return "lora.dora." + rep


class DoraEmbeddingLayer(DoraLinearLayer):
    """DoRA magnitude holder for embedding layers; `forward` also returns the magnitude/norm scale."""

    def forward(self, x, *, lora_A, lora_B, scaling, base_layer, embed_fn):
        """
        For DoRA, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer
        output.

        Returns:
            tuple: `(mag_norm_scale, result_dora)`.
        """
        lora_weight = (lora_A @ lora_B).T
        magnitude = self.weight
        weight = base_layer.weight
        weight_norm = self.get_weight_norm(weight, lora_weight.detach(), scaling)
        # see section 4.3 of DoRA (https://huggingface.co/papers/2402.09353)
        # "[...] we suggest treating ||V +∆V ||_c in
        # Eq. (5) as a constant, thereby detaching it from the gradient
        # graph. This means that while ||V + ∆V ||_c dynamically
        # reflects the updates of ∆V , it won’t receive any gradient
        # during backpropagation"
        weight_norm = weight_norm.detach()
        mag_norm_scale = magnitude / weight_norm
        result_dora = mag_norm_scale * (embed_fn(x, lora_A) @ lora_B) * scaling
        return mag_norm_scale, result_dora


class _DoraConvNdLayer(DoraLinearLayer):
    """Shared DoRA logic for convolution layers; subclasses set `self.conv_fn` to the matching functional conv."""

    def get_weight_norm(self, weight, lora_weight, scaling) -> torch.Tensor:
        """Return the L2 norm of `weight + scaling * lora_weight` over all dims except dim 0, with dims 0/1 swapped."""
        weight = weight + scaling * lora_weight
        # the following is needed to have compatibility with the 4/5D weight tensors of Conv2D/3D
        dim = tuple(range(1, weight.dim()))
        weight_norm = weight.norm(p=2, dim=dim, keepdim=True).transpose(1, 0)
        return weight_norm

    def forward(self, x, *, lora_A, lora_B, scaling, base_layer, base_result=None):
        """
        For DoRA, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer
        output.
        """
        weight = base_layer.weight
        r = lora_A.weight.shape[0]
        lora_weight = torch.mm(lora_B.weight.view([-1, r]), lora_A.weight.view([r, -1]))
        lora_weight = lora_weight.reshape(weight.shape)
        magnitude = self.weight
        weight_norm = self.get_weight_norm(weight, lora_weight.detach(), scaling)
        # see section 4.3 of DoRA (https://huggingface.co/papers/2402.09353)
        # "[...] we suggest treating ||V +∆V ||_c in
        # Eq. (5) as a constant, thereby detaching it from the gradient
        # graph. This means that while ||V + ∆V ||_c dynamically
        # reflects the updates of ∆V , it won’t receive any gradient
        # during backpropagation"
        weight_norm = weight_norm.detach()
        mag_norm_scale = magnitude / weight_norm

        if base_result is None:
            base_result = self.conv_fn(
                x,
                weight,
                bias=None,
                stride=base_layer.stride,
                padding=base_layer.padding,
                dilation=base_layer.dilation,
                groups=base_layer.groups,
            )
        else:
            bias = base_layer.bias
            if bias is not None:
                # reshape bias to (1, -1, 1, ...)
                bias_shape = (1, -1) + (1,) * (base_result.dim() - 2)
                base_result = base_result - bias.view(*bias_shape)

        result_dora = (mag_norm_scale - 1) * base_result + mag_norm_scale * lora_B(lora_A(x)) * scaling
        return result_dora


class DoraConv1dLayer(_DoraConvNdLayer):
    def __init__(self, fan_in_fan_out):
        super().__init__(fan_in_fan_out)
        self.conv_fn = F.conv1d


class DoraConv2dLayer(_DoraConvNdLayer):
    def __init__(self, fan_in_fan_out):
        super().__init__(fan_in_fan_out)
        self.conv_fn = F.conv2d


class DoraConv3dLayer(_DoraConvNdLayer):
    def __init__(self, fan_in_fan_out):
        super().__init__(fan_in_fan_out)
        self.conv_fn = F.conv3d


# diff --git a/peft/src/peft/tuners/lora/eetq.py b/peft/src/peft/tuners/lora/eetq.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..b864d9fba2481bd00b238a099689cd6fe9cea57a
# --- /dev/null
# +++ b/peft/src/peft/tuners/lora/eetq.py
# @@ -0,0 +1,118 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Optional

import torch

from peft.import_utils import is_eetq_available
from peft.tuners.lora.layer import LoraLayer
from peft.tuners.tuners_utils import BaseTunerLayer


if is_eetq_available():
    from eetq import EetqLinear

    class EetqLoraLinear(torch.nn.Module, LoraLayer):
        """LoRA wrapper around an EETQ-quantized linear layer. DoRA, merging and unmerging are rejected."""

        def __init__(
            self,
            base_layer,
            adapter_name,
            r: int = 0,
            lora_alpha: int = 1,
            lora_dropout: float = 0.0,
            init_lora_weights: bool = True,
            use_rslora: bool = False,
            use_dora: bool = False,
            lora_bias: bool = False,
            **kwargs,
        ):
            if use_dora:
                raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False")

            super().__init__()
            LoraLayer.__init__(self, base_layer)

            # `quant_linear_module` is the same object as `self.base_layer`: the latter is the consistent name,
            # the former is kept for backwards compatibility
            self.quant_linear_module = base_layer
            self._active_adapter = adapter_name

            adapter_options = {
                "lora_alpha": lora_alpha,
                "lora_dropout": lora_dropout,
                "init_lora_weights": init_lora_weights,
                "use_rslora": use_rslora,
                "use_dora": use_dora,
                "lora_bias": lora_bias,
            }
            self.update_layer(adapter_name, r, **adapter_options)

        def forward(self, x: torch.Tensor):
            """Run the quantized layer and add the scaled output of every active adapter that has LoRA weights."""
            out = self.quant_linear_module(x)
            if self.disable_adapters:
                return out

            for adapter in self.active_adapters:
                if adapter not in self.lora_A.keys():
                    continue

                down_proj = self.lora_A[adapter]
                up_proj = self.lora_B[adapter]
                drop = self.lora_dropout[adapter]
                scale = self.scaling[adapter]

                # outside of autocast the input is cast to the adapter dtype and the delta is cast back
                cast_manually = not torch.is_autocast_enabled()
                if cast_manually:
                    out_dtype = out.dtype
                    x = self._cast_input_dtype(x, down_proj.weight.dtype)

                delta = up_proj(down_proj(drop(x)))
                if cast_manually:
                    delta = delta.to(out_dtype)
                out = out + delta * scale
            return out

        def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
            raise AttributeError("Merging LoRA layers is not supported for Eetq layers.")

        def unmerge(self) -> None:
            raise AttributeError("Unmerging LoRA layers is not supported for Eetq layers.")

        def __repr__(self) -> str:
            return "lora." + super().__repr__()


def dispatch_eetq(
    target: torch.nn.Module,
    adapter_name: str,
    **kwargs: Any,
) -> Optional[torch.nn.Module]:
    """
    Wrap `target` in an `EetqLoraLinear` when its base layer is an `EetqLinear`.

    Returns:
        The new wrapper module, or `None` when EETQ is unavailable or the base layer is not an `EetqLinear`.
    """
    base = target.get_base_layer() if isinstance(target, BaseTunerLayer) else target

    # `EetqLinear` only exists when eetq is installed; the availability check short-circuits first
    if not (is_eetq_available() and isinstance(base, EetqLinear)):
        return None

    wrapped = EetqLoraLinear(target, adapter_name, **kwargs)
    target.weight = base.weight
    if hasattr(target, "bias"):
        target.bias = base.bias
    return wrapped


# diff --git a/peft/src/peft/tuners/lora/eva.py b/peft/src/peft/tuners/lora/eva.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..1bc75453b1f35116ca307a3ea71c59d7d0efbf66
# --- /dev/null
# +++ b/peft/src/peft/tuners/lora/eva.py
# @@ -0,0 +1,739 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings
from collections import Counter, defaultdict
from collections.abc import Iterable, Mapping
from contextlib import nullcontext
from copy import deepcopy
from functools import partial
from itertools import cycle
from typing import Optional, Union

import torch
import torch.distributed as dist
from tqdm import tqdm
from transformers.pytorch_utils import Conv1D

from peft.tuners.tuners_utils import _find_minimal_target_modules, check_target_module_exists
from peft.utils.constants import MIN_TARGET_MODULES_FOR_OPTIMIZATION
from peft.utils.incremental_pca import IncrementalPCA
from peft.utils.other import _get_submodules, get_pattern_key

from .config import LoraConfig
from .layer import Embedding, LoraLayer, MultiheadAttention, _ConvNd


UNSUPPORTED_LORA_MODULES = (Embedding, MultiheadAttention, _ConvNd)


class _Hook:
    """
    A base class for hooks that prepares layer inputs for EVA.

    Args:
        name (str): Name of the layer to which this hook is attached.
        prepare_layer_inputs_fn (Optional[callable]):
            Called as `fn(layer_input, model_input, layer_name)`; defaults to `_prepare_layer_inputs_fn_default`.
        gather_distributed_inputs (bool):
            Whether `gather_layer_inputs` collects inputs from all ranks when torch.distributed is initialized.
    """

    def __init__(
        self,
        name: str,
        prepare_layer_inputs_fn: Optional[callable] = None,
        gather_distributed_inputs: bool = True,
    ):
        self.name = name
        self.gather_distributed_inputs = gather_distributed_inputs
        if prepare_layer_inputs_fn is None:
            self._prepare_layer_inputs_fn = self._prepare_layer_inputs_fn_default
        else:
            self._prepare_layer_inputs_fn = prepare_layer_inputs_fn
        # set from outside before each forward pass
        self.model_input = None

    @staticmethod
    def _prepare_layer_inputs_fn_default(layer_input, model_input, layer_name) -> torch.Tensor:
        """Take the first item of a tuple/list input and flatten all but the last dimension."""
        if isinstance(layer_input, torch.Tensor):
            pass
        elif isinstance(layer_input, (tuple, list)):
            layer_input = layer_input[0]
        else:
            raise ValueError(
                f"unsupported input type {type(layer_input)} for prepare_layer_inputs_fn in layer {layer_name}, "
                "please provide a custom prepare_layer_inputs_fn"
            )
        # if the input has more than 2 dimensions, we flatten all but the last dimension
        if layer_input.ndim > 2:
            # `reshape` instead of `view`: `view` raises for non-contiguous inputs
            layer_input = layer_input.reshape(-1, layer_input.size(-1))
        return layer_input

    @torch.no_grad()
    def prepare_layer_inputs(self, layer_input):
        return self._prepare_layer_inputs_fn(layer_input, self.model_input, self.name)

    def gather_layer_inputs(self, layer_input):
        """
        Concatenate `layer_input` from all ranks along dim 0 when torch.distributed is initialized and gathering is
        enabled; otherwise return `layer_input` unchanged.
        """
        if dist.is_initialized() and self.gather_distributed_inputs:
            world_size = dist.get_world_size()

            # First gather sizes from all processes more efficiently
            local_size = torch.tensor([layer_input.shape[0]], device=layer_input.device)
            all_sizes = torch.empty(world_size, dtype=local_size.dtype, device=layer_input.device)
            dist.all_gather_into_tensor(all_sizes, local_size)
            all_sizes = all_sizes.tolist()

            # Find maximum size and pad tensors
            padded_input = layer_input.new_zeros((max(all_sizes), *layer_input.shape[1:]))
            padded_input[: layer_input.shape[0]] = layer_input

            # Gather padded tensors
            gathered_inputs = [torch.zeros_like(padded_input) for _ in range(world_size)]
            dist.all_gather(gathered_inputs, padded_input.contiguous())

            # Remove padding for each gathered tensor
            gathered_inputs = [tensor[:size] for tensor, size in zip(gathered_inputs, all_sizes)]

            # Concatenate along batch dimension
            return torch.cat(gathered_inputs, dim=0)
        return layer_input


class SVDHook(_Hook):
    """
    A forward hook for calculating incremental SVD on layer inputs. The hook is designed to be registered to a PyTorch
    module using the `register_forward_hook` method.

    This hook performs a step of incremental Singular Value Decomposition (SVD) on the inputs of a specified layer
    during the forward pass of a neural network. The hook also tracks convergence of the computed components using
    cosine similarity between the current and previous components.

    Args:
        n_components (int): Number of principal components to compute.
        sim_thresh (Union[float, torch.Tensor]): Similarity threshold for convergence.
        **base_class_kwargs: Forwarded to `_Hook` (`name`, `prepare_layer_inputs_fn`, `gather_distributed_inputs`).
    """

    def __init__(
        self,
        n_components: int,
        sim_thresh: Union[float, torch.Tensor],
        **base_class_kwargs,
    ):
        super().__init__(**base_class_kwargs)
        self.n_components = n_components
        self.sim_thresh = sim_thresh
        if isinstance(sim_thresh, torch.Tensor) and len(sim_thresh.shape) > 0:
            check1 = sim_thresh.size(0) == n_components or sim_thresh.size(0) == 1
            check2 = len(sim_thresh.shape) == 1
            if not (check1 and check2):
                raise ValueError(
                    "if sim_thresh is a tensor with more than 0 dimensions it must have shape (n_components,) or (1,)"
                )
        self.svd = IncrementalPCA(
            n_components=n_components,
            copy=True,
            lowrank=True,
            lowrank_seed=42,
        )
        self.model_input = None
        self.converged = torch.zeros((n_components,), dtype=torch.bool)

    @torch.no_grad()
    def __call__(self, model, input, output):
        previous_components = None
        if hasattr(self.svd, "components_"):
            previous_components = self.svd.components_.clone().detach()
        states = self.prepare_layer_inputs(input)
        states = self.gather_layer_inputs(states)
        # check if batch sizes is more than the number of components
        if states.size(0) < self.n_components:
            warnings.warn(
                f"skipping SVD for {self.name} because there are less than {self.n_components} examples"
            )
            return
        self.svd.partial_fit(states.to(torch.float32))
        # add if statement to check if we are in the first step where previous_components is None
        if previous_components is None:
            return
        components = self.svd.components_
        if len(components.shape) == 1:
            components = components.reshape(1, -1)
            previous_components = previous_components.reshape(1, -1)
        # consider as converged if enough components have converged via cossim
        sim = torch.nn.functional.cosine_similarity(components, previous_components)
        self.converged = sim >= self.sim_thresh


# This is used to determine if inputs of two different layers are equal. For such cases, SVD
# needs to be done for only one of the equal inputs.
+class HashHook(_Hook): + """ + A forward hook for hashing layer inputs. The hook is designed to be registered to a PyTorch module using the + `register_forward_hook` method. + + This hook hashes the inputs of a specified layer during the forward pass of a neural network and stores the hash + values for later analysis or comparison. + + Args: + name (str): Name of the layer to which this hook is attached. hashed_inputs (list): List of hashed inputs. + prepare_layer_inputs_fn (Optional[callable]): Function to prepare layer inputs for hashing. + """ + + def __init__(self, **base_class_kwargs): + super().__init__(**base_class_kwargs) + self.hashed_inputs = [] + + @staticmethod + def hash_fn(tensor): + return hash(tuple(tensor.view(-1).tolist())) + + @torch.no_grad() + def __call__(self, model, input, output): + x = self.prepare_layer_inputs(input) + x = self.gather_layer_inputs(x) + self.hashed_inputs.append(self.hash_fn(x.cpu())) + + +def find_equal_values(dictionary: dict) -> dict: + """ + Find keys in a dictionary that have the same value. + + This function takes a dictionary and returns a new dictionary containing keys that have the same value. The keys in + the output dictionary are the values from the input dictionary, and the values are lists of keys that share the + same value. + """ + value_dict = defaultdict(list) + for k, v in dictionary.items(): + value_dict[v].append(k) + return {k: v for k, v in value_dict.items() if len(v) > 1} + + +def get_device_with_meta_params(model: torch.nn.Module) -> torch.device: + """ + Get the device of the model's parameters. Useful if some parameters are on meta device. + """ + devices = list({p.device for p in model.parameters() if p.device.type != "meta"}) + if len(devices) > 1: + warnings.warn(f"Could not determine device, model has multiple devices: {devices}") + return + return devices[0] + + +def move_inputs_to_device(inputs, device: Union[str, torch.device]): + """ + Move the inputs to the specified device. 
Adapted from hf.Trainer. + """ + if hasattr(inputs, "to"): + return inputs.to(device) + if isinstance(inputs, Mapping): + return type(inputs)({k: move_inputs_to_device(v, device) for k, v in inputs.items()}) + elif isinstance(inputs, (tuple, list)): + return type(inputs)(move_inputs_to_device(v, device) for v in inputs) + else: + warnings.warn(f"input of type {type(inputs)} could not be moved to the correct device") + return inputs + + +def prepare_model_inputs_fn_language_modeling(model_input, peft_config: LoraConfig): + """ + Get the indices of the items that should be used for SVD. + + Attributes: + model_input (dict): The model inputs. + peft_config (LoraConfig): The configuration for the LoRA layers. + """ + if not isinstance(model_input, dict): + raise ValueError("When using `prepare_model_inputs_fn_language_modeling` inputs must be a dictionary") + mask = model_input.get("attention_mask", torch.ones_like(model_input["input_ids"])).bool() + if peft_config.eva_config.use_label_mask and hasattr(model_input, "labels"): + mask = torch.logical_and(mask, model_input["labels"] != peft_config.eva_config.label_mask_value) + return mask.nonzero() + + +def prepare_layer_inputs_fn_language_modeling(layer_input, model_input, layer_name) -> torch.Tensor: + """ + if not all items in the input should be used for SVD, this function can be used to get the indices of the items + that should be used. + + Attributes: + layer_input (torch.Tensor): The layer inputs. + model_input (torch.Tensor): + The model inputs or if `prepare_model_inputs_fn` is not None the output of this function. + layer_name (str): The name of the layer. + + Returns: + torch.Tensor: The input to the SVD. 
+ """ + # if layer inputs are not a tensor, we simply get the first item + if isinstance(layer_input, torch.Tensor): + pass + elif isinstance(layer_input, (tuple, list)): + layer_input = layer_input[0] + else: + raise ValueError( + f"unsupported input type {type(layer_input)} for prepare_layer_inputs_fn in layer {layer_name}, " + "please provide a custom prepare_layer_inputs_fn" + ) + # in this case model_input is the output of `prepare_model_inputs_fn_language_modeling` + return layer_input[model_input.T.unbind()] + + +def forward_fn_dict(model, inputs): + return model(**inputs) + + +def _get_eva_state_dict( + model: torch.nn.Module, + dataloader: Iterable, + peft_config: Optional[LoraConfig], + target_module_check_fn: callable, + forward_fn: Optional[callable], + prepare_model_inputs_fn: Optional[callable], + prepare_layer_inputs_fn: Union[callable, dict[str, callable], None], + gather_distributed_inputs: bool, + show_progress_bar: bool, +) -> dict: + # Computes the rank distribution for each layer based on the explained variance ratio. 
+ # when rank_pattern flag is False, all values in max_components are the same + def _get_rank_distribution(hooks, layer_hook_map, equal_inputs_map, rank_budget, max_components): + exp_vars = {k: h[0].svd.explained_variance_ratio_[: max_components[k]] for k, h in hooks.items()} + keys, values = zip(*[(k, c) for k, name in layer_hook_map.items() for c in exp_vars[name]]) + idx = torch.stack(values).argsort(descending=True) + counts = Counter([keys[i] for i in idx[:rank_budget]]) + counts = {k: counts.get(k, 0) for k in layer_hook_map.keys()} # add layers with 0 rank + for k, k_hook in equal_inputs_map.items(): + # ensure hook layers have the highest rank if they are equal to another layer + rank, rank_hook = counts[k], counts[k_hook] + if rank_hook >= rank: + continue + counts[k_hook], counts[k] = rank, rank_hook + return counts + + # dataloader is not empty + if len(dataloader) == 0: + raise ValueError("dataloader is empty") + + # check if dist is initialized + if dist.is_initialized() and gather_distributed_inputs: + warnings.warn( + "torch.distributed is initialized and `gather_distributed_inputs` is True, " + "therefore EVA initialization will gather tensors from all ranks. " + "Ensure the model does not receive the same inputs on different ranks." 
+ ) + + # for unusually high rho values, define an upper limit + rho_threshold = 1000 + rho = peft_config.eva_config.rho + if rho > rho_threshold: + max_dim = max(max(p.shape) for p in model.parameters()) + rho_ceil = max_dim // peft_config.r + rho = min(rho, rho_ceil) + + training = model.training + device = get_device_with_meta_params(model) + model.eval() + + # get model inputs + inputs = next(iter(dataloader)) + if device is not None: + inputs = move_inputs_to_device(inputs, device) + if prepare_model_inputs_fn is not None: + model_inputs_for_hooks = prepare_model_inputs_fn(inputs, peft_config) + else: + model_inputs_for_hooks = deepcopy(inputs) + + hooks = {} + max_components = {} + rank_budget = 0 + for name, module in model.named_modules(): + if not target_module_check_fn(name, module): + continue + if isinstance(prepare_layer_inputs_fn, Mapping): + fn = prepare_layer_inputs_fn.pop(name, None) + else: + fn = prepare_layer_inputs_fn + hook = HashHook(name=name, prepare_layer_inputs_fn=fn, gather_distributed_inputs=gather_distributed_inputs) + hook.model_input = model_inputs_for_hooks + handle = module.register_forward_hook(hook) + hooks[name] = (hook, handle) + layer_rank = peft_config.rank_pattern.get( + get_pattern_key(peft_config.rank_pattern.keys(), name), peft_config.r + ) + max_components[name] = round(layer_rank * rho) + rank_budget += layer_rank + if isinstance(prepare_layer_inputs_fn, Mapping) and len(prepare_layer_inputs_fn) > 0: + raise ValueError( + "prepare_layer_inputs_fn is a mapping but the following module names were not found in the model: " + f"{prepare_layer_inputs_fn.keys()}" + ) + + # forward for one batch to check which layer inputs are equal to avoid unneeded svd calculations + forward_fn(model, inputs) + hash_dict = {k: h[0].hashed_inputs[0] for k, h in hooks.items()} + # equal input maps groups layers which receive the same input. One layer is defined as the key and receives an svd + # hook. 
For the remaining layers the svd results can be skipped. + equal_inputs = list(find_equal_values(hash_dict).values()) + equal_inputs_map = {vv: v[0] for v in equal_inputs for vv in v[1:]} + # for layers with equal inputs we need to make sure that the max_components are the same + for names in equal_inputs: + max_value = max(max_components[n] for n in names) + for n in names: + max_components[n] = max_value + + # initialize svd hooks + for name in list(hooks.keys()): + hook, handle = hooks.pop(name) + handle.remove() + if name in equal_inputs_map: + continue + hook = SVDHook( + n_components=max_components[name], + sim_thresh=peft_config.eva_config.tau, + name=name, + prepare_layer_inputs_fn=hook._prepare_layer_inputs_fn, + gather_distributed_inputs=gather_distributed_inputs, + ) + module = model.get_submodule(name) + handle = module.register_forward_hook(hook) + hooks[name] = (hook, handle) # adding the old handle here so we dont get errors in the first forward pass + layer_hook_map = {**dict(zip(hooks.keys(), hooks.keys())), **equal_inputs_map} + + # start svd calculation + if show_progress_bar and (not dist.is_initialized() or dist.get_rank() == 0): + pbar = tqdm(iter(cycle(dataloader)), position=0, leave=False) + use_tqdm = True + else: + pbar = iter(cycle(dataloader)) + use_tqdm = False + convergence_dict = {k: False for k in hooks.keys()} + rank_dist = max_components.copy() + for inputs in pbar: + if device is not None: + inputs = move_inputs_to_device(inputs, device) + if prepare_model_inputs_fn is not None: + model_inputs_for_hooks = prepare_model_inputs_fn(inputs, peft_config) + else: + model_inputs_for_hooks = deepcopy(inputs) + + for name in list(hooks.keys()): + hook, handle = hooks[name] + # check if all components that are needed for the rank distribution have converged + converged = torch.all(hook.converged[: rank_dist[name]]) + # if a layer has switched from not converged to converged in the current step + if (not convergence_dict[name]) and converged 
and handle: + handle.remove() + handle = None + convergence_dict[name] = True + continue + # if a layer has switched from converged to not converged in the current step + elif convergence_dict[name] and not converged: + module = model.get_submodule(name) + handle = module.register_forward_hook(hook) + convergence_dict[name] = False + hook.model_input = model_inputs_for_hooks + hooks[name] = (hook, handle) + + if use_tqdm: + layer_converged = list(convergence_dict.values()) + [ + convergence_dict[v] for v in equal_inputs_map.values() + ] + pbar.set_description(f"{sum(layer_converged)}/{len(layer_converged)} layers have converged") + + if all(convergence_dict.values()): + break + + forward_fn(model, inputs) + + # in case some hooks have to skip the svd calculation because the number of tokens is less than the number of + # components + if not all(hasattr(h[0].svd, "components_") for h in hooks.values()): + continue + + rank_dist = _get_rank_distribution(hooks, layer_hook_map, equal_inputs_map, rank_budget, max_components) + + # check all custom hooks have been removed + remaining_hooks = {n for n, m in model.named_modules() for v in m._forward_hooks.values() if isinstance(v, _Hook)} + if len(remaining_hooks) > 0: + raise ValueError( + f"Found active hooks added by EVA that weren't properly removed: {remaining_hooks}. " + "Please report this issue at https://github.com/huggingface/peft/issues" + ) + + eva_state_dict = {} + for name, rank in rank_dist.items(): + hook = hooks[layer_hook_map[name]][0] + if not torch.all(hook.converged[:rank]): + raise ValueError( + f"Layer {name} has not converged but was assigned rank {rank}. 
" + "Please report this issue at https://github.com/huggingface/peft/issues" + ) + u = hook.svd.components_[:rank] + if peft_config.eva_config.whiten: + u /= hook.svd.singular_values_[:rank].sqrt().reshape(-1, 1) + eva_state_dict[name] = u + + # restore model state + model.train(training) + + # move tensors to device + if device is not None: + eva_state_dict = {k: v.to(device) for k, v in eva_state_dict.items()} + + return eva_state_dict + + +def _load_eva_state_dict( + model: torch.nn.Module, + eva_state_dict: dict, + adapter_name: str, +): + peft_config = model.peft_config[adapter_name] + update_layer_kwargs = { + "adapter_name": adapter_name, + "lora_dropout": peft_config.lora_dropout, + "use_rslora": peft_config.use_rslora, + "use_dora": peft_config.use_dora, + "lora_bias": peft_config.lora_bias, + } + missing_eva_inits = [] + new_target_modules = [] + other_module_names = [] + rank_pattern = {} + alpha_pattern = {} + for name, module in model.named_modules(): + name_in_base_model = name.replace("base_model.model.", "") + if not isinstance(module, LoraLayer): + other_module_names.append(name_in_base_model) + continue + # Regexp matching - Find key which matches current target_name in patterns provided + r = peft_config.rank_pattern.get(get_pattern_key(peft_config.rank_pattern.keys(), name), peft_config.r) + alpha = peft_config.alpha_pattern.get( + get_pattern_key(peft_config.alpha_pattern.keys(), name), peft_config.lora_alpha + ) + if name in eva_state_dict: + w = eva_state_dict.pop(name) + new_rank = w.size(0) + if new_rank == 0: + parent, _, target_name = _get_submodules(model, name) + setattr(parent, target_name, module.get_base_layer()) + continue + elif new_rank != r: + if peft_config.eva_config.adjust_scaling_factors: + alpha *= new_rank / r + if new_rank != r or module.lora_A[adapter_name].weight.device.type == "meta": + module.update_layer(r=new_rank, lora_alpha=alpha, init_lora_weights="eva", **update_layer_kwargs) + 
def _load_eva_state_dict(
    model: torch.nn.Module,
    eva_state_dict: dict,
    adapter_name: str,
):
    """
    Load a precomputed EVA state dict into the LoRA layers of `model` for the adapter `adapter_name`.

    For every `LoraLayer` found via `model.named_modules()`:

    - if the layer name is a key of `eva_state_dict`, the stored tensor is copied into `lora_A` and its first
      dimension becomes the new rank of the layer (the layer is re-created through `update_layer` when the rank
      changed or when `lora_A` lives on the meta device);
    - if the stored tensor has rank 0, the LoRA layer is replaced by its base layer, i.e. the adapter is removed;
    - if the layer name is missing from `eva_state_dict`, the layer is re-initialized with `init_lora_weights=True`
      and reported in a warning at the end.

    Afterwards `target_modules`, `rank_pattern` and `alpha_pattern` of `model.peft_config[adapter_name]` are
    overwritten so that the config reflects the redistributed ranks.

    Args:
        model (torch.nn.Module): Model exposing a `peft_config` mapping and containing `LoraLayer` modules.
        eva_state_dict (dict):
            Maps layer names (as returned by `model.named_modules()`) to the tensor that is copied into `lora_A`.
            Note that matched entries are popped, so the passed dict is mutated.
        adapter_name (str): Name of the adapter whose layers and config are updated.
    """
    peft_config = model.peft_config[adapter_name]
    # arguments that are identical for every `update_layer` call below
    update_layer_kwargs = {
        "adapter_name": adapter_name,
        "lora_dropout": peft_config.lora_dropout,
        "use_rslora": peft_config.use_rslora,
        "use_dora": peft_config.use_dora,
        "lora_bias": peft_config.lora_bias,
    }
    missing_eva_inits = []  # LoRA layers without an entry in eva_state_dict
    new_target_modules = []  # LoRA layers that received EVA weights
    other_module_names = []  # all non-LoRA modules, passed to `_find_minimal_target_modules` below
    rank_pattern = {}
    alpha_pattern = {}
    for name, module in model.named_modules():
        name_in_base_model = name.replace("base_model.model.", "")
        if not isinstance(module, LoraLayer):
            other_module_names.append(name_in_base_model)
            continue
        # Regexp matching - Find key which matches current target_name in patterns provided
        r = peft_config.rank_pattern.get(get_pattern_key(peft_config.rank_pattern.keys(), name), peft_config.r)
        alpha = peft_config.alpha_pattern.get(
            get_pattern_key(peft_config.alpha_pattern.keys(), name), peft_config.lora_alpha
        )
        if name in eva_state_dict:
            w = eva_state_dict.pop(name)
            new_rank = w.size(0)
            if new_rank == 0:
                # EVA assigned no rank to this layer: drop the adapter by putting the base layer back in place.
                # The `continue` also skips the pattern/target-module bookkeeping for this layer.
                parent, _, target_name = _get_submodules(model, name)
                setattr(parent, target_name, module.get_base_layer())
                continue
            elif new_rank != r:
                # rescale alpha proportionally to the rank change, only if requested in the eva config
                if peft_config.eva_config.adjust_scaling_factors:
                    alpha *= new_rank / r
            # the layer has to be re-created if its shape changes or if its weights are still on the meta device
            if new_rank != r or module.lora_A[adapter_name].weight.device.type == "meta":
                module.update_layer(r=new_rank, lora_alpha=alpha, init_lora_weights="eva", **update_layer_kwargs)
            module.lora_A[adapter_name].weight.copy_(w)
            new_target_modules.append(name_in_base_model)
        else:
            # no EVA weights available: fall back to the default LoRA initialization with the configured rank
            module.update_layer(r=r, lora_alpha=alpha, init_lora_weights=True, **update_layer_kwargs)
            missing_eva_inits.append(name_in_base_model)
            new_rank = r
        # update rank pattern and alpha pattern
        if new_rank != peft_config.r:
            rank_pattern[name_in_base_model] = new_rank
        if alpha != peft_config.lora_alpha:
            alpha_pattern[name_in_base_model] = alpha

    # update target modules if some lora layers have been removed due to their EVA rank being 0
    new_target_modules = new_target_modules + missing_eva_inits
    if len(new_target_modules) >= MIN_TARGET_MODULES_FOR_OPTIMIZATION:
        new_target_modules = _find_minimal_target_modules(new_target_modules, other_module_names)
    model.peft_config[adapter_name].target_modules = new_target_modules

    # set rank pattern obtained from EVA
    model.peft_config[adapter_name].rank_pattern = rank_pattern

    # when adjust_scaling_factors is True, lora scaling factors have been adjusted after the rank redistribution
    model.peft_config[adapter_name].alpha_pattern = alpha_pattern

    if missing_eva_inits:
        warnings.warn(
            "the following layers were initialized with init_lora_weights=True because they "
            f"were not found in the eva state_dict: {missing_eva_inits}\ncurrently the "
            f"following lora modules are not supported by EVA: {UNSUPPORTED_LORA_MODULES}"
        )
@torch.no_grad()
def get_eva_state_dict(
    model: torch.nn.Module,
    dataloader: Iterable,
    peft_config: Optional[LoraConfig] = None,
    forward_fn: Optional[callable] = forward_fn_dict,
    prepare_model_inputs_fn: Optional[callable] = prepare_model_inputs_fn_language_modeling,
    prepare_layer_inputs_fn: Union[callable, dict[str, callable], None] = prepare_layer_inputs_fn_language_modeling,
    adapter_name: str = "default",
    gather_distributed_inputs: bool = True,
    show_progress_bar: bool = True,
) -> dict:
    """
    Run the EVA SVD computation on `model` and return the resulting state dict.

    The layer inputs observed while iterating over `dataloader` are decomposed with incremental PCA; the components
    are tracked until they converge (measured by cosine similarity) and ranks are distributed across layers according
    to the explained variance ratio. The heavy lifting is delegated to `_get_eva_state_dict`; this function only
    resolves the config, decides which modules count as targets and disables existing adapters during the forward
    passes.

    Args:
        model (torch.nn.Module):
            The model whose layers are analyzed. May be a plain model or a PeftModel.
        dataloader (Iterable):
            Yields the inputs for the forward passes.
        peft_config (Optional[LoraConfig]):
            LoRA configuration. Mandatory for non-PEFT models; for a PeftModel the config stored under `adapter_name`
            is used when this is None.
        forward_fn (callable):
            Called as `forward_fn(model, inputs)` to run one forward pass. Defaults to a function equivalent to
            `model(**inputs)`.
        prepare_model_inputs_fn (Optional[callable]):
            Called with `model_input` and `peft_config`; its output is handed on to `prepare_layer_inputs_fn`. It
            allows deriving auxiliary information from the model inputs, e.g. in language modeling the attention mask
            identifies padding tokens that must be excluded from the SVD. Defaults to
            `peft.tuners.lora.eva.prepare_model_inputs_fn_language_modeling`.
        prepare_layer_inputs_fn (Union[callable, Dict[str, callable], None]):
            Called with `layer_input`, `model_input` (possibly transformed by `prepare_model_inputs_fn`) and
            `layer_name`; must return the 2d tensor that is used for the SVD of that layer. The default,
            `peft.tuners.lora.eva.prepare_layer_inputs_fn_language_modeling`, is suited for language modeling, where
            `model_input` is the mask produced by `prepare_model_inputs_fn_language_modeling`.
        adapter_name (str):
            Adapter for which the SVD is computed.
        gather_distributed_inputs (bool):
            If True (default), layer inputs are gathered from all ranks in a distributed run before the SVD. Ignored
            without a distributed setup. Use False when a non-distributed dataloader is used in a distributed run.
        show_progress_bar (bool):
            Toggle the progress bar. Defaults to True.

    Returns:
        eva_state_dict (dict): Maps layer names to their SVD components.

    Raises:
        ValueError: If `model` has no `peft_config` attribute and no `peft_config` was passed.
    """

    def _is_adapter_layer(name, module, unsupported_lora_modules):
        "a module counts as adapter module if it wraps a base_layer and is of a supported type"
        return hasattr(module, "base_layer") and not isinstance(module, unsupported_lora_modules)

    def _is_targeted_linear(name, module, peft_config):
        "a module counts as target if it is a linear layer that matches target_modules"
        if peft_config.target_modules is None:
            matches_target = True
        else:
            matches_target = check_target_module_exists(peft_config, name)
        # Conv1D for GPT2 support
        return isinstance(module, (torch.nn.Linear, Conv1D)) and matches_target

    is_peft_model = hasattr(model, "peft_config")

    # resolve the config: an explicitly passed config always wins, otherwise fall back to the one stored on the model
    if peft_config is None:
        if not is_peft_model:
            raise ValueError("peft_config is required if model is not a PeftModel")
        peft_config = model.peft_config[adapter_name]

    # PEFT models are run with adapters disabled and identify targets via their adapter layers, plain models are run
    # as they are and identify targets via the config
    if is_peft_model:
        ctx = model.disable_adapter()
        target_module_check_fn = partial(_is_adapter_layer, unsupported_lora_modules=UNSUPPORTED_LORA_MODULES)
    else:
        ctx = nullcontext()
        target_module_check_fn = partial(_is_targeted_linear, peft_config=peft_config)

    svd_kwargs = {
        "model": model,
        "dataloader": dataloader,
        "peft_config": peft_config,
        "target_module_check_fn": target_module_check_fn,
        "forward_fn": forward_fn,
        "prepare_model_inputs_fn": prepare_model_inputs_fn,
        "prepare_layer_inputs_fn": prepare_layer_inputs_fn,
        "gather_distributed_inputs": gather_distributed_inputs,
        "show_progress_bar": show_progress_bar,
    }
    with ctx:
        eva_state_dict = _get_eva_state_dict(**svd_kwargs)
    return eva_state_dict
@torch.no_grad()
def initialize_lora_eva_weights(
    model: torch.nn.Module,
    dataloader: Optional[Iterable] = None,
    eva_state_dict: Optional[dict] = None,
    forward_fn: Optional[callable] = forward_fn_dict,
    prepare_model_inputs_fn: Optional[callable] = prepare_model_inputs_fn_language_modeling,
    prepare_layer_inputs_fn: Union[callable, dict[str, callable], None] = prepare_layer_inputs_fn_language_modeling,
    adapter_name: str = "default",
    gather_distributed_inputs: bool = True,
    show_progress_bar: bool = True,
):
    """
    Initialize the weights of the LoRA layers using the EVA method.

    This function initializes the weights of the LoRA layers using the EVA method. It computes the SVD for each adapter
    layer (unless a precomputed `eva_state_dict` is passed) and updates the weights accordingly. The model is modified
    in-place and additionally returned for convenience.

    Args:
        model (PeftModel): The peft model to compute the SVD for.
        dataloader (Optional[Iterable]):
            The dataloader to use for the forward pass. If None, eva_state_dict needs to be provided.
        eva_state_dict (Optional[dict]):
            The state_dict to load into the model. If None, a dataloader needs to be provided and the state_dict will
            be computed using `get_eva_state_dict`.
        forward_fn (callable):
            The forward function to use for the forward pass. Takes two arguments: `model` and `inputs`. Default
            behavior is `return model(**inputs)`
        prepare_model_inputs_fn (Optional[callable]):
            This function receives the model inputs and the peft_config and passes the output to
            `prepare_layer_inputs_fn`. Can be used to modify the input to the SVD computation based on the original
            model inputs. For example for language modeling the attention mask is used to determine which indices are
            padding tokens and should not be used for SVD. Any function defined here expects two arguments:
            `model_input` and `peft_config`. `peft.tuners.lora.eva.prepare_model_inputs_fn_language_modeling` is used
            by default.
        prepare_layer_inputs_fn (Union[callable, Dict[str, callable], None]):
            This function receives the layer inputs, the model inputs (potentially modified by
            `prepare_model_inputs_fn`) and the name of the layer and returns the inputs that should be used for SVD for
            that particular layer. Any custom function defined here expects three arguments: `layer_input`,
            `model_input`, and `layer_name` and should return a 2d tensor. The default logic can be found in
            peft.tuners.lora.eva.prepare_layer_inputs_fn_language_modeling and works for language modeling. In this
            case model_inputs is the mask used to determine which indices should be used for SVD (created by
            `prepare_model_inputs_fn_language_modeling`).
        adapter_name (str): The name of the adapter to initialize the weights for.
        gather_distributed_inputs (bool):
            Whether to gather the layer inputs from all ranks. Default is True meaning in a distributed setting the
            layer inputs will be gathered from all ranks for the SVD computation. For non-distributed settings this
            argument is ignored. Set to False if you are using a non-distributed dataloader in a distributed setting.
        show_progress_bar (bool): Whether to show a progress bar. Default is True.

    Returns:
        model (torch.nn.Module): The model with the initialized LoRA weights (the same object that was passed in).

    Raises:
        ValueError:
            If `model` is not a PeftModel, if more than one adapter is active, if the adapter was not configured with
            `init_lora_weights='eva'`, or if neither `dataloader` nor `eva_state_dict` is provided.
    """
    if not hasattr(model, "peft_config"):
        raise ValueError("model must be a PeftModel")

    # eva currently only works with a single active adapter
    # Important: when removing this requirement, make sure eva init works correctly if the new rank is 0.
    if len(model.active_adapters) > 1:
        raise ValueError("`initialize_lora_eva_weights` currently only works with a single active adapter")

    # initialize_lora_eva_weights only works with `init_lora_weights='eva'`
    if model.peft_config[adapter_name].init_lora_weights != "eva":
        raise ValueError("`initialize_lora_eva_weights` can only be used with `init_lora_weights='eva'`")

    # compute svd
    if eva_state_dict is None:
        if dataloader is None:
            raise ValueError("dataloader is required if eva_state_dict is not provided")
        eva_state_dict = get_eva_state_dict(
            model=model,
            dataloader=dataloader,
            forward_fn=forward_fn,
            prepare_model_inputs_fn=prepare_model_inputs_fn,
            prepare_layer_inputs_fn=prepare_layer_inputs_fn,
            adapter_name=adapter_name,
            gather_distributed_inputs=gather_distributed_inputs,
            show_progress_bar=show_progress_bar,
        )

    _load_eva_state_dict(model, eva_state_dict, adapter_name)

    # The docstring has always promised the model as return value; actually return it so that
    # `model = initialize_lora_eva_weights(model, ...)` does not silently replace the model with None.
    return model
class GPTQLoraLinear(torch.nn.Module, LoraLayer):
    """
    LoRA layer that wraps a GPTQ quantized linear module.

    The quantized module computes the base output and the LoRA update is added on top in `forward`. No merge logic is
    defined in this class, hence the adapter output is always computed separately.
    """

    def __init__(
        self,
        base_layer,
        adapter_name: str,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        init_lora_weights: bool = True,
        use_rslora: bool = False,
        use_dora: bool = False,
        use_qalora: bool = False,
        lora_bias: bool = False,
        qalora_group_size: int = 32,
        **kwargs,
    ):
        super().__init__()
        LoraLayer.__init__(self, base_layer)

        if use_dora:
            raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False")

        # `quant_linear_module` is an alias of `base_layer`: the latter is what the rest of PEFT expects, the former
        # is kept for backwards compatibility
        self.quant_linear_module = base_layer
        self._active_adapter = adapter_name

        layer_options = {
            "lora_alpha": lora_alpha,
            "lora_dropout": lora_dropout,
            "init_lora_weights": init_lora_weights,
            "use_rslora": use_rslora,
            "use_dora": use_dora,
            "use_qalora": use_qalora,
            "lora_bias": lora_bias,
            "qalora_group_size": qalora_group_size,
        }
        self.update_layer(adapter_name, r, **layer_options)

    def resolve_lora_variant(self, *, use_dora: bool, use_qalora: bool, **kwargs) -> Optional[LoraVariant]:
        """Pick the LoRA variant matching the given flags, or None for vanilla LoRA."""
        if use_dora and use_qalora:
            raise NotImplementedError(
                f"Dora and QA_lora at the same time is not supported for {self.__class__.__name__} (yet)."
            )

        if use_dora:
            from .variants import DoraLinearVariant

            return DoraLinearVariant()

        if use_qalora:
            from .variants import QALoraLinearVariant

            return QALoraLinearVariant()

        return None

    def forward(self, x: torch.Tensor):
        """Return the output of the quantized module plus the contribution of every active adapter."""
        # note: logic differs from default Linear because merging is not supported
        output = self.quant_linear_module(x)
        if self.disable_adapters:
            return output

        known_adapters = self.lora_A.keys()
        for adapter in self.active_adapters:
            if adapter not in known_adapters:
                continue

            # remember the dtype so that the result can be cast back after the adapter was applied
            output_dtype = output.dtype
            down_proj = self.lora_A[adapter]
            up_proj = self.lora_B[adapter]
            drop = self.lora_dropout[adapter]
            scale = self.scaling[adapter]

            x = self._cast_input_dtype(x, down_proj.weight.dtype)

            if adapter in self.lora_variant:
                output = self.lora_variant[adapter].forward(
                    self,
                    active_adapter=adapter,
                    x=x,
                    result=output,
                )
            else:  # vanilla LoRA
                output = output + up_proj(down_proj(drop(x))) * scale

            output = output.to(output_dtype)
        return output

    def __repr__(self) -> str:
        return "lora." + super().__repr__()

    # TODO: Check if it is better as suggested by users https://github.com/PanQiWei/AutoGPTQ/pull/102
    # def reset_lora_parameters(self, adapter_name):
    #     if adapter_name in self.lora_A.keys():
    #         torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight)
    #         torch.nn.init.zeros_(self.lora_B[adapter_name].weight)
+ rep + + # TODO: Check if it is better as suggested by users https://github.com/PanQiWei/AutoGPTQ/pull/102 + # def reset_lora_parameters(self, adapter_name): + # if adapter_name in self.lora_A.keys(): + # torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight) + # torch.nn.init.zeros_(self.lora_B[adapter_name].weight) + + +def dispatch_gptq( + target: torch.nn.Module, + adapter_name: str, + **kwargs: Any, +) -> Optional[torch.nn.Module]: + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + cfg = kwargs.get("gptq_quantization_config", None) + + if is_gptqmodel_available(): + from gptqmodel.nn_modules.qlinear import BaseQuantLinear + + if isinstance(target_base_layer, BaseQuantLinear): + new_module = GPTQLoraLinear(target, adapter_name, **kwargs) + target.qweight = target_base_layer.qweight + else: + quant_linear = get_auto_gptq_quant_linear(cfg) + + if quant_linear is not None and isinstance(target_base_layer, quant_linear): + new_module = GPTQLoraLinear(target, adapter_name, **kwargs) + target.qweight = target_base_layer.qweight + + return new_module diff --git a/peft/src/peft/tuners/lora/hqq.py b/peft/src/peft/tuners/lora/hqq.py new file mode 100644 index 0000000000000000000000000000000000000000..924acb2d4d254e61421de2a1e5efa391458c1ae9 --- /dev/null +++ b/peft/src/peft/tuners/lora/hqq.py @@ -0,0 +1,251 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
if is_hqq_available():
    from hqq.core.quantize import HQQLinear

    class HqqLoraLinear(torch.nn.Module, LoraLayer):
        """
        LoRA layer for base layers that are quantized with HQQ (`HQQLinear`).

        Merging and unmerging are implemented by dequantizing the base weight, adding/subtracting the LoRA delta and
        quantizing the result into a fresh `HQQLinear` that replaces `self.base_layer`.
        """

        # Lora implemented in a dense layer
        def __init__(
            self,
            base_layer: torch.nn.Module,
            adapter_name: str,
            r: int = 0,
            lora_alpha: int = 1,
            lora_dropout: float = 0.0,
            init_lora_weights: bool = True,
            use_rslora: bool = False,
            use_dora: bool = False,
            lora_bias: bool = False,
            **kwargs,
        ) -> None:
            # fail early, before any module state is created
            if lora_bias:
                raise ValueError(f"{self.__class__.__name__} does not support lora_bias yet, set it to False")

            super().__init__()
            LoraLayer.__init__(self, base_layer)
            self.fan_in_fan_out = False

            self._active_adapter = adapter_name
            self.update_layer(
                adapter_name,
                r,
                lora_alpha=lora_alpha,
                lora_dropout=lora_dropout,
                init_lora_weights=init_lora_weights,
                use_rslora=use_rslora,
                use_dora=use_dora,
                lora_bias=lora_bias,
            )

        def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]:
            """Return the DoRA variant if `use_dora` is set, otherwise None (vanilla LoRA)."""
            if not use_dora:
                return None

            from .variants import DoraLinearVariant

            return DoraLinearVariant()

        def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
            """
            Merge the active adapter weights into the base weights

            Args:
                safe_merge (`bool`, *optional*):
                    If True, the merged weights are checked for non-finite values (NaN/inf) before the base layer is
                    replaced and a `ValueError` is raised if any are found. Defaults to `False`.
                adapter_names (`list[str]`, *optional*):
                    The list of adapter names that should be merged. If None, all active adapters will be merged.
                    Defaults to `None`.
            """
            adapter_names = check_adapters_to_merge(self, adapter_names)
            if not adapter_names:
                # no adapter to merge
                return

            for active_adapter in adapter_names:
                if active_adapter not in self.lora_A.keys():
                    continue

                layer = self.get_base_layer()
                # copy the quantization settings of the current base layer; `offload_meta` is added because it is
                # read from the layer itself rather than from its `quant_config`
                quant_config = {**copy.deepcopy(layer.quant_config), "offload_meta": layer.offload_meta}

                output = layer.dequantize()
                if active_adapter not in self.lora_variant:  # vanilla LoRA
                    lora_data = self.get_delta_weight(active_adapter)
                    w_data = output + lora_data
                else:
                    # variants always go through `merge_safe`, independent of the `safe_merge` argument
                    w_data = self.lora_variant[active_adapter].merge_safe(self, active_adapter, output)

                if safe_merge and not torch.isfinite(w_data).all():
                    raise ValueError(
                        f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
                    )

                # quantize the merged weight into a new HQQ layer; `offload_meta` is passed via the config to the
                # constructor but removed again before the remaining entries are forwarded to `quantize`
                new_hqq_layer = HQQLinear(None, quant_config, compute_dtype=layer.compute_dtype, device=layer.device)
                quant_config.pop("offload_meta", None)
                new_hqq_layer.quantize(w_data, **quant_config)
                self.base_layer = new_hqq_layer
                self.merged_adapters.append(active_adapter)

        def unmerge(self) -> None:
            """
            This method unmerges all merged adapter layers from the base weights.
            """
            if not self.merged:
                warnings.warn("Already unmerged. Nothing to do.")
                return

            # adapters are unmerged in reverse order of merging
            while len(self.merged_adapters) > 0:
                active_adapter = self.merged_adapters.pop()
                if active_adapter not in self.lora_A.keys():
                    continue

                layer = self.get_base_layer()
                quant_config = {**copy.deepcopy(layer.quant_config), "offload_meta": layer.offload_meta}
                output = layer.dequantize()

                if active_adapter not in self.lora_variant:  # vanilla LoRA
                    lora_data = self.get_delta_weight(active_adapter)
                    # align dtype and device of the dequantized weight with the LoRA delta before subtracting
                    w_data = output.to(lora_data.dtype).to(lora_data.device) - lora_data
                else:
                    w_data = self.lora_variant[active_adapter].unmerge(self, active_adapter, output)

                # same re-quantization procedure as in `merge`
                new_hqq_layer = HQQLinear(None, quant_config, compute_dtype=layer.compute_dtype, device=layer.device)
                quant_config.pop("offload_meta", None)
                new_hqq_layer.quantize(w_data, **quant_config)
                self.base_layer = new_hqq_layer

        def get_delta_weight(self, adapter):
            """Return the weight update of `adapter`, i.e. `lora_B @ lora_A` multiplied by the adapter's scaling."""
            return (
                transpose(
                    self.lora_B[adapter].weight @ self.lora_A[adapter].weight,
                    False,
                )
                * self.scaling[adapter]
            )

        def _mixed_batch_forward(
            self, x: torch.Tensor, *args: Any, adapter_names: list[str], **kwargs: Any
        ) -> torch.Tensor:
            """
            Forward pass in which each sample of the batch is routed through the adapter named in `adapter_names`.

            `adapter_names` is indexed like the first dimension of `x`. Samples assigned to `"__base__"` or to an
            adapter that does not exist on this layer only receive the base layer output.
            """
            # This is a special method that handles the case when users pass the argument `adapter_names`. This is an
            # extra argument that allows mixing different adapters in the same batch at inference time.
            result = self.base_layer(x, *args, **kwargs)

            unique_adapters = set(adapter_names)
            # for each unique adapter, collect the batch indices of the samples that use it; the list is built in
            # the iteration order of `unique_adapters`, which is iterated again (unchanged) in the loop below
            sub_batch_indices_list = []
            for adapter in unique_adapters:
                sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter])

            for i, active_adapter in enumerate(unique_adapters):
                if active_adapter == "__base__":
                    continue
                if active_adapter not in self.lora_A.keys():
                    continue

                lora_A = self.lora_A[active_adapter]
                lora_B = self.lora_B[active_adapter]
                dropout = self.lora_dropout[active_adapter]
                scaling = self.scaling[active_adapter]

                # outside of autocast, the input is cast to the adapter dtype and the output cast back manually
                requires_conversion = not torch.is_autocast_enabled()
                if requires_conversion:
                    expected_dtype = result.dtype
                    x = self._cast_input_dtype(x, lora_A.weight.dtype)

                # getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the linear
                # layer output
                sub_batch = x[sub_batch_indices_list[i]]
                output = lora_B(lora_A(dropout(sub_batch))) * scaling
                if requires_conversion:
                    output = output.to(expected_dtype)
                result[sub_batch_indices_list[i]] += output

            return result

        def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
            """
            Compute the base layer output and, unless adapters are disabled or merged, add the active adapters.

            If `adapter_names` is passed as keyword argument, the call is delegated to `_mixed_batch_forward`.
            """
            self._check_forward_args(x, *args, **kwargs)
            adapter_names = kwargs.pop("adapter_names", None)

            if self.disable_adapters:
                # adapters must not contribute, so merged weights have to be removed from the base layer first
                if self.merged:
                    self.unmerge()
                result = self.base_layer(x, *args, **kwargs)
            elif adapter_names is not None:
                result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)
            elif self.merged:
                # the adapter weights are already part of the base layer
                result = self.base_layer(x, *args, **kwargs)
            else:
                result = self.base_layer(x, *args, **kwargs)

                for active_adapter in self.active_adapters:
                    if active_adapter not in self.lora_A.keys():
                        continue
                    lora_A = self.lora_A[active_adapter]
                    lora_B = self.lora_B[active_adapter]
                    dropout = self.lora_dropout[active_adapter]
                    scaling = self.scaling[active_adapter]

                    # outside of autocast, the input is cast to the adapter dtype and the result cast back manually
                    requires_conversion = not torch.is_autocast_enabled()
                    if requires_conversion:
                        expected_dtype = result.dtype
                        x = self._cast_input_dtype(x, lora_A.weight.dtype)

                    if active_adapter not in self.lora_variant:  # vanilla LoRA
                        result = result + lora_B(lora_A(dropout(x))) * scaling
                    else:
                        result = self.lora_variant[active_adapter].forward(
                            self,
                            active_adapter=active_adapter,
                            x=x,
                            result=result,
                        )

                    if requires_conversion:
                        result = result.to(expected_dtype)

            return result

        def __repr__(self) -> str:
            # prefix the default module representation to make LoRA layers easy to spot
            rep = super().__repr__()
            return "lora." + rep
def dispatch_hqq(target: torch.nn.Module, adapter_name: str, **kwargs):
    """
    Create an `HqqLoraLinear` if the base layer of `target` is an `HQQLinear`.

    Returns:
        The new LoRA module, which wraps the base layer of `target`, or None if hqq is not available or the base
        layer is not an HQQ layer.
    """
    base = target.get_base_layer() if isinstance(target, BaseTunerLayer) else target

    # `HQQLinear` only exists if hqq is installed, therefore the availability check has to come first
    if not is_hqq_available() or not isinstance(base, HQQLinear):
        return None

    return HqqLoraLinear(base, adapter_name, **kwargs)
if is_inc_available():

    class IncLoraLinear(Linear):
        """LoRA linear layer for INC (Intel Neural Compressor) patched layers; merging is not supported."""

        def __init__(
            self,
            base_layer: torch.nn.Module,
            adapter_name: str,
            **kwargs,
        ):
            super().__init__(base_layer, adapter_name, **kwargs)

        def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
            """
            Not supported for INC layers, always raises.

            Args:
                safe_merge (`bool`, *optional*):
                    Unused, only present for compatibility with the signature of other LoRA layers.
                adapter_names (`list[str]`, *optional*):
                    Unused, only present for compatibility with the signature of other LoRA layers.

            Raises:
                NotImplementedError: Always.
            """
            raise NotImplementedError("Merging LoRA with INC layers is not yet implemented")

        def unmerge(self) -> None:
            """
            Not supported for INC layers, always raises.

            Raises:
                NotImplementedError: Always.
            """
            raise NotImplementedError("Unmerging LoRA from INC layers is not yet implemented")
def dispatch_inc(target: torch.nn.Module, adapter_name: str, **kwargs):
    """
    Wrap `target` in an `IncLoraLinear` if its base layer is an INC `PatchedLinear`.

    Returns:
        The new LoRA module, or None if INC is not available or the base layer is not a `PatchedLinear`.
    """
    base = target.get_base_layer() if isinstance(target, BaseTunerLayer) else target

    if not is_inc_available():
        return None

    # imported lazily because neural_compressor is an optional dependency
    from neural_compressor.torch.algorithms.fp8_quant._quant_common.helper_modules import (
        PatchedLinear,
    )

    if not isinstance(base, PatchedLinear):
        return None

    return IncLoraLinear(target, adapter_name, **kwargs)
class LoraVariant:
    """
    Base class for LoRA variants, e.g. DoRA.

    This class should be subclassed and the methods below should be implemented accordingly. The methods should be
    implemented as static methods, this makes it easier to combine variants.

    Every method of this base class raises `NotImplementedError`, so a variant that forgets to implement one of them
    fails loudly instead of silently doing nothing.

    Note for developers: These methods are prone to change and should thus be considered to be "private". Use at your
    own discretion.
    """

    @staticmethod
    def init(module: LoraLayer, adapter_name: str, **kwargs) -> None:
        """
        Initialization code for the LoRA variant, it's called within `update_layer`

        `update_layer` forwards all of its arguments as keyword arguments, therefore implementations have to accept
        `**kwargs` in addition to `adapter_name`.
        """
        raise NotImplementedError

    @staticmethod
    def merge_safe(module: LoraLayer, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor:
        """Safe merging of the weights from `merge(..., safe_merge=True)`, should return a new tensor"""
        raise NotImplementedError

    @staticmethod
    def merge_unsafe(module: LoraLayer, active_adapter: str, orig_weight: torch.Tensor) -> None:
        """Unsafe merging of the weights from `merge(..., safe_merge=False)`, should modify the weight in-place"""
        # without this, a missing override would be a silent no-op while the adapter is still recorded as merged
        raise NotImplementedError

    @staticmethod
    def unmerge(module: LoraLayer, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor:
        """Remove the adapter weights from the original weights, then return them"""
        # without this, a missing override would hand `None` to callers that expect the unmerged weight tensor
        raise NotImplementedError

    @staticmethod
    def forward(
        module: LoraLayer,
        active_adapter: str,
        x: torch.Tensor,
        result: torch.Tensor,
        **kwargs,
    ) -> torch.Tensor:
        """
        The forward pass of the LoRA variant, should return the overall result (not just the diff)

        Args:
            module (LoraLayer): The module on which the forward pass is called
            active_adapter (str): The name of the active adapter
            x (torch.Tensor): The input to the forward call
            result (torch.Tensor): The result from the base model
            **kwargs: Additional arguments passed from [`LoraLayer.forward`].
        """
        raise NotImplementedError
= ("r", "lora_alpha", "scaling", "lora_dropout") + + def __init__(self, base_layer: nn.Module, ephemeral_gpu_offload: bool = False, **kwargs) -> None: + self.base_layer = base_layer + self.r = {} + self.lora_alpha = {} + self.scaling = {} + self.lora_dropout = nn.ModuleDict({}) + self.lora_A = nn.ModuleDict({}) + self.lora_B = nn.ModuleDict({}) + # For Embedding layer + self.lora_embedding_A = nn.ParameterDict({}) + self.lora_embedding_B = nn.ParameterDict({}) + # Mark the weight as unmerged + self._disable_adapters = False + self.merged_adapters = [] + self.use_dora: dict[str, bool] = {} # not actively used anymore after #2443, keep it for BC + self.use_rslora: dict[str, bool] = {} + self.lora_bias: dict[str, bool] = {} + self.lora_magnitude_vector = torch.nn.ModuleDict() # for DoRA + self._caches: dict[str, Any] = {} + self.ephemeral_gpu_offload: bool = ephemeral_gpu_offload + # flag to enable/disable casting of input to weight dtype during forward call + self.cast_input_dtype_enabled: bool = True + self.lora_variant: dict[str, LoraVariant] = {} + self.kwargs = kwargs + + base_layer = self.get_base_layer() + in_features, out_features = _get_in_out_features(base_layer) + self.in_features = in_features + self.out_features = out_features + + def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]: + """Return a matching LoRA variant for this layer type. + + Given the init arguments of this layer, return the correct LoRA variant, if any. E.g., if `use_dora=True`, this + method should return the DoRA variant for the given layer. If `use_alora=True`, same for aLoRA. + + If there is no fitting variant, return None. + + Note: If this layer type does not support the LoRA variant at all, please raise an error during __init__ as is + convention, and not here. 
+ + """ + return None + + def update_layer( + self, + adapter_name, + r, + lora_alpha, + lora_dropout, + init_lora_weights, + use_rslora, + use_dora: bool = False, + use_alora: bool = False, + use_qalora: bool = False, + lora_bias: bool = False, + arrow_config: ArrowConfig = None, + qalora_group_size: int = 32, + inference_mode: bool = False, + **kwargs, + ): + # collect the kwargs + kwargs = locals().copy() + del kwargs["self"] + + # This code works for linear layers, override for other layer types + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + if lora_bias and (getattr(self.get_base_layer(), "bias", None) is None): + warnings.warn( + f"`lora_bias=True` was passed but the targeted layer of type {type(self.get_base_layer()).__name__} " + "has no bias. This means that merging LoRA weights won't be possible.", + PeftWarning, + ) + + lora_variant = self.resolve_lora_variant( + use_dora=use_dora, + use_alora=use_alora, + use_qalora=use_qalora, + qalora_group_size=qalora_group_size, + arrow_config=arrow_config, + ) + if lora_variant is not None: + self.lora_variant[adapter_name] = lora_variant + + self.r[adapter_name] = r + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + lora_dropout_layer = nn.Dropout(p=lora_dropout) + else: + lora_dropout_layer = nn.Identity() + + self.lora_dropout.update(nn.ModuleDict({adapter_name: lora_dropout_layer})) + + # Actual trainable parameters + self.lora_A[adapter_name] = nn.Linear(self.in_features, r, bias=False) + self.lora_B[adapter_name] = nn.Linear(r, self.out_features, bias=lora_bias) + self.lora_bias[adapter_name] = lora_bias + + if use_rslora: + self.scaling[adapter_name] = lora_alpha / math.sqrt(r) + else: + self.scaling[adapter_name] = lora_alpha / r + + self.use_rslora[adapter_name] = use_rslora + + self.use_dora[adapter_name] = use_dora + + # for inits that require access to the base weight, use gather_param_ctx so that the weight is gathered 
when using DeepSpeed + if isinstance(init_lora_weights, str) and init_lora_weights.startswith("pissa"): + with gather_params_ctx(self.get_base_layer().weight): + self.pissa_init(adapter_name, init_lora_weights) + elif isinstance(init_lora_weights, str) and init_lora_weights.startswith("corda"): + with gather_params_ctx(self.get_base_layer().weight): + self.corda_init(adapter_name, init_lora_weights) + elif isinstance(init_lora_weights, str) and init_lora_weights.lower() == "olora": + with gather_params_ctx(self.get_base_layer().weight): + self.olora_init(adapter_name) + elif init_lora_weights == "loftq": + with gather_params_ctx(self.get_base_layer().weight): + self.loftq_init(adapter_name) + elif init_lora_weights == "eva": + nn.init.zeros_(self.lora_B[adapter_name].weight) + elif init_lora_weights == "orthogonal": + with gather_params_ctx(self.get_base_layer().weight): + self.orthogonal_init(adapter_name) + elif init_lora_weights: + self.reset_lora_parameters(adapter_name, init_lora_weights) + # call this before init of the lora variants + self._move_adapter_to_device_of_base_layer(adapter_name) + + if adapter_name in self.lora_variant: + self.lora_variant[adapter_name].init(self, **kwargs) + + self.set_adapter(self.active_adapters, inference_mode=inference_mode) + + # Check for adapters that were added or removed from the arrow_model. + # The arrow model may be modified after creation by adding new experts + # (pre-trained or trainable) or by removing existing ones. Whenever such + # a change occurs, on_adapter_change() is called to update the set of + # active task-specific experts and, if needed, to handle recomputing prototypes + # and doing general knowledge subtraction (GKS) again. 
+ if hasattr(self, "lora_arrow"): + for adapter in self.lora_variant: + if adapter in self.lora_arrow: + self.lora_arrow[adapter].on_adapter_change(self.lora_A, self.lora_B) + + def reset_lora_parameters(self, adapter_name, init_lora_weights): + if init_lora_weights is False: + return + + if adapter_name in self.lora_A.keys(): + if init_lora_weights is True: + # initialize A the same way as the default for nn.Linear and B to zero + # https://github.com/microsoft/LoRA/blob/a0a92e0f26c067cf94747bdbf1ce73793fa44d19/loralib/layers.py#L124 + nn.init.kaiming_uniform_(self.lora_A[adapter_name].weight, a=math.sqrt(5)) + elif init_lora_weights.lower() == "gaussian": + nn.init.normal_(self.lora_A[adapter_name].weight, std=1 / self.r[adapter_name]) + else: + raise ValueError(f"Unknown initialization {init_lora_weights=}") + nn.init.zeros_(self.lora_B[adapter_name].weight) + if self.lora_bias[adapter_name]: + nn.init.zeros_(self.lora_B[adapter_name].bias) + if adapter_name in self.lora_embedding_A.keys(): + # Initialize A to zeros and B the same way as the default for nn.Embedding, see: + # https://github.com/microsoft/LoRA/blob/4c0333854cb905966f8cc4e9a74068c1e507c7b7/loralib/layers.py#L59-L60 + nn.init.zeros_(self.lora_embedding_A[adapter_name]) + nn.init.normal_(self.lora_embedding_B[adapter_name]) + if self.lora_bias[adapter_name]: + # embeddings are not supported at the moment, but still adding this for consistency + nn.init.zeros_(self.lora_embedding_B[adapter_name].bias) + + def olora_init(self, adapter_name): + base_layer = self.get_base_layer() + orig_weight = base_layer.weight + bnb_param_type = get_bnb_param_type(orig_weight) + dtype = orig_weight.dtype + + if bnb_param_type: + # check without importing bitsandbytes and robust to bnb_4bit_quant_storage=float* + weight_tensor = dequantize_module_weight(base_layer) + elif dtype in [torch.float32, torch.float16, torch.bfloat16]: + weight_tensor = orig_weight + else: + raise TypeError(f"Unsupported data type for the 
base layer. Got {dtype}.") + + scale_factor = self.scaling[adapter_name] + r = self.r[adapter_name] + weight_tensor = weight_tensor.to(torch.float32) + Q, R = torch.linalg.qr(weight_tensor.data) + + Qr, Rr = Q[:, :r], R[:r] + + self.lora_A[adapter_name].weight.data = Rr.contiguous() + self.lora_B[adapter_name].weight.data = Qr.contiguous() + + weight_tensor.data -= scale_factor * self.lora_B[adapter_name].weight @ self.lora_A[adapter_name].weight + if bnb_param_type == "4bit": + weight_tensor = orig_weight.__class__( + weight_tensor, + quant_type=orig_weight.quant_type, + quant_storage=orig_weight.quant_storage, + compress_statistics=orig_weight.compress_statistics, + module=orig_weight.module, + ).to(orig_weight.device) + base_layer.weight = weight_tensor + elif bnb_param_type == "8bit": + weight_tensor = orig_weight.__class__( + weight_tensor, + requires_grad=orig_weight.requires_grad, + has_fp16_weights=orig_weight.has_fp16_weights, + ).to(orig_weight.device) + base_layer.weight = weight_tensor + else: + weight_tensor = weight_tensor.to(dtype) + base_layer.weight.data = weight_tensor + + def pissa_init(self, adapter_name, init_lora_weights): + weight = self.get_base_layer().weight + dtype = weight.dtype + if dtype not in [torch.float32, torch.float16, torch.bfloat16]: + raise TypeError( + "Please initialize PiSSA under float32, float16, or bfloat16. " + "Subsequently, re-quantize the residual model to help minimize quantization errors." 
+ ) + weight = transpose(weight.to(torch.float32), self.fan_in_fan_out) + if init_lora_weights == "pissa": + # USV^T = W <-> VSU^T = W^T, where W^T = weight.data in R^{out_channel, in_channel}, + V, S, Uh = torch.linalg.svd(weight.data, full_matrices=False) + Vr = V[:, : self.r[adapter_name]] + Sr = S[: self.r[adapter_name]] + Sr /= self.scaling[adapter_name] + Uhr = Uh[: self.r[adapter_name]] + elif len(init_lora_weights.split("_niter_")) == 2: + Vr, Sr, Ur = svd_lowrank( + weight.data, self.r[adapter_name], niter=int(init_lora_weights.split("_niter_")[-1]) + ) + Sr /= self.scaling[adapter_name] + Uhr = Ur.t() + else: + raise ValueError( + f"init_lora_weights should be 'pissa' or 'pissa_niter_[number of iters]', got {init_lora_weights} instead." + ) + + lora_A = torch.diag(torch.sqrt(Sr)) @ Uhr + lora_B = Vr @ torch.diag(torch.sqrt(Sr)) + self.lora_A[adapter_name].weight.data = lora_A + self.lora_B[adapter_name].weight.data = lora_B + weight = weight.data - self.scaling[adapter_name] * lora_B @ lora_A + weight = transpose(weight.to(dtype), self.fan_in_fan_out) + self.get_base_layer().weight.data = weight + + def corda_init(self, adapter_name, init_lora_weights): + linear = self.get_base_layer() + weight = linear.weight + dtype = weight.dtype + if dtype not in [torch.float32, torch.float16, torch.bfloat16]: + raise TypeError( + "Please initialize CorDA under float32, float16, or bfloat16. " + "Subsequently, re-quantize the residual model to help minimize quantization errors." + ) + weight = weight.to(torch.float32) + out_dim = weight.data.size(0) + in_dim = weight.data.size(1) + + # Calculate WC from covariance matrix + if not hasattr(linear, "eigens"): + raise ValueError( + "`eigens` attribute not found for layer, please run `preprocess_corda` first. " + "More information can be found at examples/corda_finetuning/README.md." 
+ ) + eigens = linear.eigens + U = eigens.U_WC + S = eigens.S_WC + V = eigens.V_WC + r = self.r[adapter_name] + + # nan or inf check + if torch.isnan(S).any() or torch.isinf(S).any(): + raise ValueError( + "Invalid value found in matrix S. Please file an issue at https://github.com/huggingface/peft/issues." + ) + if torch.isnan(U).any() or torch.isinf(U).any(): + raise ValueError( + "Invalid value found in matrix U. Please file an issue at https://github.com/huggingface/peft/issues." + ) + if torch.isnan(V).any() or torch.isinf(V).any(): + raise ValueError( + "Invalid value found in matrix V. Please file an issue at https://github.com/huggingface/peft/issues." + ) + + # Sanity check + if U.size(0) != out_dim or U.size(1) != r: + raise ValueError( + f"Matrix U size mismatch: {U.size()} vs. ({out_dim}, {r}). Please make sure the `lora_config` and " + "`model` argument of `preprocess_corda` is consistent with `get_peft_model`. If you're using cache " + "in `preprocess_corda`, please make sure the cache is built with the same model and LoRA rank." + ) + if S.size(0) != r: + raise ValueError( + f"Matrix S size mismatch: {S.size()} vs. ({r},). Please make sure the `lora_config` and `model` argument " + "of `preprocess_corda` is consistent with `get_peft_model`. If you're using cache in `preprocess_corda`, " + "please make sure the cache is built with the same model and LoRA rank." + ) + if V.size(0) != in_dim or V.size(1) != r: + raise ValueError( + f"Matrix V size mismatch: {V.size()} vs. ({in_dim}, {r}). Please make sure the `lora_config` and " + "`model` argument of `preprocess_corda` is consistent with `get_peft_model`. If you're using cache " + "in `preprocess_corda`, please make sure the cache is built with the same model and LoRA rank." 
+ ) + + # Apply alpha + S /= self.scaling[adapter_name] + + # Init lora_A and lora_B weights + lora_A = V.t().mul(S.sqrt().view(-1, 1)).contiguous() + lora_B = U.mul(S.sqrt()).contiguous() + self.lora_A[adapter_name].weight.data = lora_A + self.lora_B[adapter_name].weight.data = lora_B + weight = weight.data - self.scaling[adapter_name] * lora_B @ lora_A + weight = weight.to(dtype) + self.get_base_layer().weight.data = weight + + # Remove redundant fields + del linear.eigens + + def loftq_init(self, adapter_name): + from peft.utils.loftq_utils import loftq_init + + weight = self.get_base_layer().weight + kwargs = { + "num_bits": self.kwargs.get("loftq_bits", 4), + "reduced_rank": self.r[adapter_name], + "num_iter": self.kwargs.get("loftq_iter", 1), + } + + qweight, lora_A, lora_B = loftq_init(weight, **kwargs) + if adapter_name in self.lora_A.keys(): + # initialize A the same way as the default for nn.Linear and B to zero + self.lora_A[adapter_name].weight.data = lora_A + self.lora_B[adapter_name].weight.data = lora_B + if adapter_name in self.lora_embedding_A.keys(): + # initialize a the same way as the default for nn.linear and b to zero + self.lora_embedding_A[adapter_name].weight.data = lora_A + self.lora_embedding_B[adapter_name].weight.data = lora_B + self.get_base_layer().weight.data = qweight + + @torch.no_grad() + def orthogonal_init(self, adapter_name): + # https://datta0.github.io/posts/rethink-lora-init/#orthogonal-initialisation + rank = self.r[adapter_name] + if rank % 2 != 0: + raise ValueError(f"Orthogonal initialization requires the LoRA rank to be even, got {rank} instead.") + + X = torch.randn(rank, rank) + Q, _ = torch.linalg.qr(X) + q_odd = Q[0::2, :] # Odd rows + q_even = Q[1::2, :] # Even rows + dtype = self.get_base_layer().weight.dtype + lora_A = torch.randn(self.in_features, rank // 2).mm(q_odd).T / 10.0 + lora_B = torch.randn(rank // 2, self.out_features).T.mm(q_even) / 10.0 + self.lora_A[adapter_name].weight = 
nn.Parameter(lora_A.contiguous().to(dtype)) + self.lora_B[adapter_name].weight = nn.Parameter(lora_B.contiguous().to(dtype)) + + def _cache_store(self, key: str, value: Any) -> None: + self._caches[key] = value + + def _cache_pop(self, key: str) -> Any: + value = self._caches.pop(key) + return value + + def set_scale(self, adapter: str, scale: float | int) -> None: + """Set the scale of the given adapter to the initial scale multiplied by the provided factor + + The initial scale is determined by the configured `r` (rank) and `lora_alpha`. + """ + if adapter not in self.scaling: + # Ignore the case where the adapter is not in the layer + return + if self.use_rslora.get(adapter, False): + self.scaling[adapter] = scale * self.lora_alpha[adapter] / math.sqrt(self.r[adapter]) + else: + self.scaling[adapter] = scale * self.lora_alpha[adapter] / self.r[adapter] + + def scale_layer(self, scale: float | int) -> None: + """Multiply the current scale of all active adapters by the provided factor""" + if scale == 1: + return + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + + self.scaling[active_adapter] *= scale + + def unscale_layer(self, scale: Optional[float | int] = None) -> None: + """Divide the current scale of all active adapters by the provided factor. If `scale=None` is passed, reset to + initial scale + + The initial scale is determined by the configured `r` (rank) and `lora_alpha`. 
+ + """ + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + + if scale is None: + if self.use_rslora.get(active_adapter, False): + self.scaling[active_adapter] = self.lora_alpha[active_adapter] / math.sqrt(self.r[active_adapter]) + else: + self.scaling[active_adapter] = self.lora_alpha[active_adapter] / self.r[active_adapter] + else: + self.scaling[active_adapter] = self.scaling[active_adapter] / scale + + def _check_forward_args(self, x, *args, **kwargs): + """Check if the arguments are compatible with the configs and state of the model""" + adapter_names = kwargs.get("adapter_names", None) + if adapter_names is None: + return + + if len(x) != len(adapter_names): + msg = ( + "Length of `adapter_names` should be the same as the number of inputs, but got " + f"{len(adapter_names)} and {len(x)} respectively." + ) + raise ValueError(msg) + + if self.merged: + # It is unclear what would be the right thing to do if users pass adapter_names and there are merged + # adapters. Therefore, it is better to raise an error in this case. + msg = "Cannot pass `adapter_names` when there are merged adapters, please call `unmerge_adapter` first." + raise ValueError(msg) + + # DoRA is not supported (yet), check that it's not being used. Don't check "__base__", as this is the + # placeholder for the base model. + unique_adapters = {name for name in adapter_names if name != "__base__"} + for adapter_name in unique_adapters: + if self.use_dora.get(adapter_name, False): + msg = "Cannot pass `adapter_names` when DoRA is enabled." + raise ValueError(msg) + + def _mixed_batch_forward( + self, x: torch.Tensor, *args: Any, adapter_names: list[str], **kwargs: Any + ) -> torch.Tensor: + # This is a special method that handles the case when users pass the argument `adapter_names`. This is an + # extra argument that allows mixing different adapters in the same batch at inference time. 
+ variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer + result = self.base_layer(x, *args, **kwargs) + torch_result_dtype = result.dtype + + unique_adapters = set(adapter_names) + sub_batch_indices_list = [] + for adapter in unique_adapters: + sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) + alora_offsets = variant_kwargs.get("alora_offsets", None) + for i, active_adapter in enumerate(unique_adapters): + if active_adapter == "__base__": + continue + if active_adapter not in self.lora_A.keys(): + continue + + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + # getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the linear + # layer output + sub_batch = x[sub_batch_indices_list[i]].to(lora_A.weight.dtype) + if active_adapter not in self.lora_variant: # vanilla LoRA + lora_output = lora_B(lora_A(dropout(sub_batch))) * scaling + result[sub_batch_indices_list[i]] += lora_output.to(torch_result_dtype) + else: + if alora_offsets is not None: + variant_kwargs["alora_offsets"] = [alora_offsets[j] for j in sub_batch_indices_list[i]] + lora_output = self.lora_variant[active_adapter].forward( + self, + active_adapter=active_adapter, + x=sub_batch, + result=result[sub_batch_indices_list[i]], + **variant_kwargs, + **kwargs, + ) + result[sub_batch_indices_list[i]] = lora_output.to(torch_result_dtype) + + return result + + +# Below code is based on https://github.com/microsoft/LoRA/blob/main/loralib/layers.py +# and modified to work with PyTorch FSDP + + +# ------------------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 
+# ------------------------------------------------------------------------------------------ + + +class Linear(nn.Module, LoraLayer): + # Lora implemented in a dense layer + def __init__( + self, + base_layer, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + is_target_conv_1d_layer: bool = False, + init_lora_weights: Union[bool, str] = True, + use_rslora: bool = False, + use_dora: bool = False, + use_alora: bool = False, + arrow_config: ArrowConfig = None, + lora_bias: bool = False, + **kwargs, + ) -> None: + super().__init__() + LoraLayer.__init__(self, base_layer, **kwargs) + self.fan_in_fan_out = fan_in_fan_out + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + use_alora=use_alora, + lora_bias=lora_bias, + arrow_config=arrow_config, + ) + self.is_target_conv_1d_layer = is_target_conv_1d_layer + + def resolve_lora_variant( + self, *, arrow_config: ArrowConfig, use_dora: bool, use_alora: bool, **kwargs + ) -> Optional[LoraVariant]: + if arrow_config is not None: + from .variants import ArrowLinearVariant + + return ArrowLinearVariant() + + if not use_dora and not use_alora: + return None + + from .variants import ALoraLinearVariant, DoraLinearVariant + + if use_alora: + return ALoraLinearVariant() + else: + return DoraLinearVariant() + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. 
Defaults to `False`. + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.lora_A.keys(): + base_layer = self.get_base_layer() + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weight = base_layer.weight.data.clone() + orig_dtype = orig_weight.dtype + if active_adapter not in self.lora_variant: # vanilla LoRA + delta_weight = self.get_delta_weight(active_adapter) + orig_weight += delta_weight.to(orig_dtype) + else: + orig_weight = self.lora_variant[active_adapter].merge_safe(self, active_adapter, orig_weight) + + if not torch.isfinite(orig_weight).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weight + + if self.lora_bias[active_adapter]: + if getattr(base_layer, "bias", None) is None: + raise RuntimeError( + "Impossible to merge LoRA with `lora_bias=True` because the base layer has no bias." + ) + new_bias = base_layer.bias + self.lora_B[active_adapter].bias * self.scaling[active_adapter] + if not torch.isfinite(new_bias).all(): + raise ValueError( + f"NaNs detected in the merged weights. 
The adapter {active_adapter} seems to be broken" + ) + base_layer.bias.data = new_bias.to(orig_dtype) + + else: + if active_adapter not in self.lora_variant: # vanilla LoRA + delta_weight = self.get_delta_weight(active_adapter) + base_layer.weight.data += delta_weight + else: + self.lora_variant[active_adapter].merge_unsafe(self, active_adapter, base_layer.weight) + + if self.lora_bias[active_adapter]: + if getattr(base_layer, "bias", None) is None: + raise RuntimeError( + "Impossible to merge LoRA with `lora_bias=True` because the base layer has no bias." + ) + base_layer.bias.data += self.lora_B[active_adapter].bias * self.scaling[active_adapter] + + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.lora_A.keys(): + weight = self.get_base_layer().weight + if active_adapter not in self.lora_variant: # vanilla LoRA + orig_dtype = weight.dtype + delta_weight = self.get_delta_weight(active_adapter) + weight.data -= delta_weight.to(orig_dtype) + else: + unmerged = self.lora_variant[active_adapter].unmerge(self, active_adapter, weight) + weight.data = unmerged + + if self.lora_bias[active_adapter]: + self.get_base_layer().bias.data -= self.lora_B[active_adapter].bias * self.scaling[active_adapter] + + def get_delta_weight(self, adapter) -> torch.Tensor: + """ + Compute the delta weight for the given adapter. + + Args: + adapter (str): + The name of the adapter for which the delta weight should be computed. 
+ """ + device = self.lora_B[adapter].weight.device + dtype = self.lora_B[adapter].weight.dtype + + # In case users wants to merge the adapter weights that are in + # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to + # (b)float16 because some CPUs have slow bf16/fp16 matmuls. + cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16) + + weight_A = self.lora_A[adapter].weight + weight_B = self.lora_B[adapter].weight + + if cast_to_fp32: + weight_A = weight_A.float() + weight_B = weight_B.float() + + output_tensor = transpose(weight_B @ weight_A, self.fan_in_fan_out) * self.scaling[adapter] + + if cast_to_fp32: + output_tensor = output_tensor.to(dtype=dtype) + + # cast back the weights + self.lora_A[adapter].weight.data = weight_A.to(dtype) + self.lora_B[adapter].weight.data = weight_B.to(dtype) + + return output_tensor + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + self._check_forward_args(x, *args, **kwargs) + adapter_names = kwargs.pop("adapter_names", None) + variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif adapter_names is not None: + result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **variant_kwargs, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + torch_result_dtype = result.dtype + + lora_A_keys = self.lora_A.keys() + for active_adapter in self.active_adapters: + if active_adapter not in lora_A_keys: + continue + + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + x = self._cast_input_dtype(x, lora_A.weight.dtype) + if active_adapter 
not in self.lora_variant: # vanilla LoRA + result = result + lora_B(lora_A(dropout(x))) * scaling + else: + result = self.lora_variant[active_adapter].forward( + self, + active_adapter=active_adapter, + x=x, + result=result, + **variant_kwargs, + **kwargs, + ) + + result = result.to(torch_result_dtype) + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep + + +class Embedding(nn.Module, LoraLayer): + # LoRA implemented in a Embedding layer + def __init__( + self, + base_layer: nn.Module, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + init_lora_weights: Union[bool, str] = True, + use_rslora: bool = False, + use_dora: bool = False, + arrow_config: ArrowConfig = None, + lora_bias: bool = False, + **kwargs, + ) -> None: + if lora_bias: + # lora_bias=True is not supported (yet) for embedding layers, as they use nn.Parameter + raise ValueError(f"lora_bias={lora_bias} is not supported for {self.__class__.__name__}.") + + super().__init__() + LoraLayer.__init__(self, base_layer) + self.fan_in_fan_out = fan_in_fan_out + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + lora_bias=lora_bias, + arrow_config=arrow_config, + ) + + def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]: + if not use_dora: + return None + + from .variants import DoraEmbeddingVariant + + return DoraEmbeddingVariant() + + def update_layer( + self, + adapter_name, + r, + lora_alpha, + lora_dropout, + init_lora_weights, + use_rslora, + use_dora, + lora_bias, + arrow_config: ArrowConfig = None, + inference_mode: bool = False, + **kwargs, + ): + # collect the kwargs + kwargs = locals().copy() + del 
kwargs["self"] + + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + lora_variant = self.resolve_lora_variant(use_dora=use_dora, arrow_config=arrow_config) + if lora_variant is not None: + self.lora_variant[adapter_name] = lora_variant + + self.r[adapter_name] = r + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + lora_dropout_layer = nn.Dropout(p=lora_dropout) + else: + lora_dropout_layer = nn.Identity() + + self.lora_dropout[adapter_name] = lora_dropout_layer + # Actual trainable parameters + weight_A = torch.randn((r, self.in_features)) + weight_B = torch.randn((self.out_features, r)) + self.lora_embedding_A[adapter_name] = nn.Parameter(weight_A) + self.lora_embedding_B[adapter_name] = nn.Parameter(weight_B) + self.lora_bias[adapter_name] = lora_bias + + if use_rslora: + self.scaling[adapter_name] = lora_alpha / math.sqrt(r) + else: + self.scaling[adapter_name] = lora_alpha / r + + self.use_rslora[adapter_name] = use_rslora + + self.use_dora[adapter_name] = use_dora + + if init_lora_weights == "loftq": + self.loftq_init(adapter_name) + elif init_lora_weights: + self.reset_lora_parameters(adapter_name, init_lora_weights) + + # call this before init of the lora variants + self._move_adapter_to_device_of_base_layer(adapter_name) + + if adapter_name in self.lora_variant: + self.lora_variant[adapter_name].init(self, **kwargs) + + self.set_adapter(self.active_adapters, inference_mode=inference_mode) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. 
+ adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.lora_embedding_A.keys(): + base_layer = self.get_base_layer() + orig_dtype = base_layer.weight.dtype + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weight = base_layer.weight.data.clone() + if active_adapter not in self.lora_variant: # vanilla LoRA + orig_weight += self.get_delta_weight(active_adapter).to(orig_dtype) + else: + orig_weight = self.lora_variant[active_adapter].merge_safe(self, active_adapter, orig_weight) + + if not torch.isfinite(orig_weight).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weight + else: + if active_adapter not in self.lora_variant: # vanilla LoRA + base_layer.weight.data += self.get_delta_weight(active_adapter).to(orig_dtype) + else: + self.lora_variant[active_adapter].merge_unsafe(self, active_adapter, base_layer.weight) + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. 
def _mixed_batch_forward(
    self, x: torch.Tensor, *args: Any, adapter_names: list[str], **kwargs: Any
) -> torch.Tensor:
    """
    Forward pass for a batch in which each sample may use a different adapter.

    This is a special method that handles the case when users pass the argument `adapter_names`. This is an extra
    argument that allows mixing different adapters in the same batch at inference time.

    Args:
        x (`torch.Tensor`):
            Input passed to the base layer and, per sub-batch, to `self._embed`.
        adapter_names (`list[str]`):
            One adapter name per sample, i.e. `adapter_names[i]` belongs to `x[i]`. The name `"__base__"` and
            names without an entry in `self.lora_embedding_A` leave the base output untouched.

    Returns:
        `torch.Tensor`: The base layer output with the LoRA contribution added in place for each sub-batch.
    """
    result = self.base_layer(x, *args, **kwargs)

    # Group the sample indices by adapter name in a single pass over `adapter_names` instead of rescanning the
    # whole list once per unique adapter.
    indices_by_adapter: dict[str, list[int]] = {}
    for index, name in enumerate(adapter_names):
        indices_by_adapter.setdefault(name, []).append(index)

    for active_adapter, sub_batch_indices in indices_by_adapter.items():
        if active_adapter == "__base__":
            continue
        if active_adapter not in self.lora_embedding_A.keys():
            continue

        embedding_A = self.lora_embedding_A[active_adapter].T
        embedding_B = self.lora_embedding_B[active_adapter].T
        scaling = self.scaling[active_adapter]

        # getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the base
        # layer output
        sub_batch = x[sub_batch_indices]
        after_A = self._embed(sub_batch, embedding_A)
        result[sub_batch_indices] += (after_A @ embedding_B) * scaling

    return result
def __init__(
    self,
    base_layer: nn.Module,
    adapter_name: str,
    r: int = 0,
    lora_alpha: int = 1,
    lora_dropout: float = 0.0,
    init_lora_weights: Union[bool, str] = True,
    use_rslora: bool = False,
    use_dora: bool = False,
    arrow_config: ArrowConfig = None,
    lora_bias: bool = False,
    **kwargs,
) -> None:
    """
    Wrap a convolution layer with a LoRA adapter.

    Raises:
        ValueError: If `use_alora` is requested through `kwargs`, or if `r` is not divisible by the number of
            groups of `base_layer`.
    """
    super().__init__()
    LoraLayer.__init__(self, base_layer)

    if kwargs.get("use_alora", False):
        raise ValueError("aLoRA does not support adapting conv layers.")

    num_groups = base_layer.groups
    if num_groups > 1:
        # Only a warning: the adapter can still be added, but merge() refuses grouped convolutions.
        warnings.warn("LoRA adapter added to ConvNd layer with groups > 1. Merging is not supported.")

    if r % num_groups != 0:
        raise ValueError(
            f"Targeting a {base_layer.__class__.__name__} with groups={base_layer.groups} and rank {r}. "
            "Currently, support is limited to conv layers where the rank is divisible by groups. "
            "Either choose a different rank or do not target this specific layer."
        )

    self._active_adapter = adapter_name
    # Number of dimensions of the kernel tensor; subclasses check it against the conv type they expect.
    self._kernel_dim = base_layer.weight.dim()

    adapter_options = {
        "lora_alpha": lora_alpha,
        "lora_dropout": lora_dropout,
        "init_lora_weights": init_lora_weights,
        "use_rslora": use_rslora,
        "use_dora": use_dora,
        "lora_bias": lora_bias,
        "arrow_config": arrow_config,
    }
    self.update_layer(adapter_name, r, **adapter_options)
def _get_dora_factor_view(self):
    """Return the shape tuple `(-1, 1, ..., 1)` with one trailing `1` per kernel dimension after the first.

    NOTE(review): presumably used to reshape a per-output-channel DoRA factor so that it broadcasts against the
    conv kernel; confirm against the DoRA variant code.
    """
    singleton_dims = [1] * (self._kernel_dim - 1)
    return (-1, *singleton_dims)
the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.lora_A.keys(): + base_layer = self.get_base_layer() + orig_dtype = base_layer.weight.dtype + + if base_layer.groups > 1: + # https://github.com/huggingface/peft/pull/2403 + raise NotImplementedError("Merging is not supported for _ConvNd layers with groups > 1!") + + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weight = base_layer.weight.data.clone() + if active_adapter not in self.lora_variant: # vanilla LoRA + delta_weight = self.get_delta_weight(active_adapter) + orig_weight += delta_weight.to(orig_dtype) + else: + orig_weight = self.lora_variant[active_adapter].merge_safe(self, active_adapter, orig_weight) + + if not torch.isfinite(orig_weight).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weight + + if self.lora_bias[active_adapter]: + if getattr(base_layer, "bias", None) is None: + raise RuntimeError( + "Impossible to merge LoRA with `lora_bias=True` because the base layer has no bias." + ) + new_bias = base_layer.bias + self.lora_B[active_adapter].bias * self.scaling[active_adapter] + if not torch.isfinite(new_bias).all(): + raise ValueError( + f"NaNs detected in the merged weights. 
def unmerge(self) -> None:
    """
    Undo all previous merges by subtracting every merged adapter from the base layer again.

    Adapters are popped from `self.merged_adapters` one by one, so the list is empty afterwards. If nothing is
    merged, a warning is emitted and nothing else happens.
    """
    if not self.merged:
        warnings.warn("Already unmerged. Nothing to do.")
        return

    while len(self.merged_adapters) > 0:
        adapter = self.merged_adapters.pop()
        if adapter not in self.lora_A.keys():
            continue

        base_weight = self.get_base_layer().weight
        if adapter in self.lora_variant:
            # LoRA variants know how to revert their own merge and return the restored weight
            base_weight.data = self.lora_variant[adapter].unmerge(self, adapter, base_weight)
        else:  # vanilla LoRA
            target_dtype = base_weight.dtype
            delta = self.get_delta_weight(adapter)
            base_weight.data -= delta.to(target_dtype)

        if self.lora_bias[adapter]:
            # the LoRA bias was added to the base bias on merge, so it has to be removed as well
            self.get_base_layer().bias.data -= self.lora_B[adapter].bias * self.scaling[adapter]
def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
    """
    Run the base convolution and add the contribution of every active LoRA adapter.

    Dispatch order:
      1. adapters disabled: unmerge if necessary and return the plain base layer output,
      2. `adapter_names` given: delegate to `_mixed_batch_forward`,
      3. adapters already merged: the base layer output already contains the LoRA update,
      4. otherwise: add each active adapter (vanilla LoRA or a LoRA variant) on top of the base output and cast
         the result back to the dtype of the base output.
    """
    self._check_forward_args(x, *args, **kwargs)
    adapter_names = kwargs.pop("adapter_names", None)
    # don't pass these to base_layer
    variant_kwargs = {key: kwargs.pop(key, None) for key in VARIANT_KWARG_KEYS}

    if self.disable_adapters:
        if self.merged:
            self.unmerge()
        return self.base_layer(x, *args, **kwargs)

    if adapter_names is not None:
        return self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs)

    if self.merged:
        return self.base_layer(x, *args, **kwargs)

    result = self.base_layer(x, *args, **kwargs)
    base_output_dtype = result.dtype

    for adapter in self.active_adapters:
        if adapter not in self.lora_A.keys():
            continue

        down_proj = self.lora_A[adapter]
        up_proj = self.lora_B[adapter]
        dropout = self.lora_dropout[adapter]
        scaling = self.scaling[adapter]
        x = self._cast_input_dtype(x, down_proj.weight.dtype)

        if adapter in self.lora_variant:
            result = self.lora_variant[adapter].forward(
                self,
                active_adapter=adapter,
                x=x,
                result=result,
                **variant_kwargs,
                **kwargs,
            )
        else:  # vanilla LoRA
            result = result + up_proj(down_proj(dropout(x))) * scaling

    return result.to(base_output_dtype)
def __init__(
    self,
    base_layer,
    adapter_name: str,
    r: int = 0,
    lora_alpha: int = 1,
    lora_dropout: float = 0.0,
    init_lora_weights: Union[bool, str] = True,
    use_rslora: bool = False,
    use_dora: bool = False,
    **kwargs,
) -> None:
    """
    Wrap a multihead attention layer with LoRA on both its in_proj weight and its out_proj layer.

    Raises:
        ValueError: If query/key/value do not share the same embedding dimension, if DoRA or aLoRA is
            requested, or if `base_layer.out_proj` is not an `nn.Linear`.
    """
    # TODO work with separate weights
    # default for this value appears to be True:
    # https://github.com/pytorch/pytorch/blob/701ba5203fe68d55d655bd4d6c008be94cf34ea5/torch/nn/modules/activation.py#L1128-L1130
    has_shared_embed_dim = getattr(base_layer, "_qkv_same_embed_dim", True)
    if not has_shared_embed_dim:
        raise ValueError(
            f"Only same embed for query/key/value is supported as of now for {self.__class__.__name__}."
        )
    if use_dora:
        # TODO: probably not so hard to implement
        raise ValueError(f"{self.__class__.__name__} does not support DoRA (yet), please set use_dora to False")
    if kwargs.get("use_alora", False):
        raise ValueError(f"{self.__class__.__name__} does not support aLoRA (yet), please set use_alora to False")

    super().__init__()
    LoraLayer.__init__(self, base_layer, **kwargs)

    # Note: LoRA is applied to both in_proj and out_proj. There is currently no way to only specify one of them.
    original_out_proj = base_layer.out_proj
    if not isinstance(original_out_proj, nn.Linear):
        raise ValueError(f"out_proj must be an instance of nn.Linear for {self.__class__.__name__}.")

    # Replace out_proj by its LoRA-wrapped counterpart, configured like this layer.
    self.base_layer.out_proj = Linear(
        original_out_proj,
        adapter_name,
        r=r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        init_lora_weights=init_lora_weights,
        use_rslora=use_rslora,
        use_dora=use_dora,
        **kwargs,
    )

    self._active_adapter = adapter_name
    self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora)
def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
    """
    Merge the active adapter weights into the base weights

    Both the in_proj weight of the attention layer and the weight of its out_proj layer are merged. Instead of
    mutating the existing parameters in place, each weight attribute is deleted and re-assigned with the merged
    tensor.

    Args:
        safe_merge (`bool`, *optional*):
            If True, the merge operation will be performed in a copy of the original weights and check for NaNs
            before merging the weights. This is useful if you want to check if the merge operation will produce
            NaNs. Defaults to `False`.
        adapter_names (`List[str]`, *optional*):
            The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
            to `None`.

    Raises:
        ValueError: With `safe_merge=True`, if the merged in_proj or out_proj weight contains non-finite values.
    """
    adapter_names = check_adapters_to_merge(self, adapter_names)
    if not adapter_names:
        # no adapter to merge
        return

    # Implementation follows this:
    # https://github.com/Baijiong-Lin/LoRA-Torch/blob/4bfed6820b64fcf47064c30f30606a190a4f0d2e/loratorch/layers.py#L73-L79
    # Notably, instead of mutating the weight, we delete the original weight and replace it by the merged weight
    # TODO: work with separate weights
    for active_adapter in adapter_names:
        if active_adapter in self.lora_A.keys():
            base_layer = self.get_base_layer()
            # the dtype of the out_proj weight is used as the target dtype for both delta weights
            orig_dtype = base_layer.out_proj.weight.dtype
            if safe_merge:
                # TODO: work with separate weights
                # merging in_proj (nn.Parameter)
                orig_weight_in = base_layer.in_proj_weight.data.detach().clone()
                orig_weight_in += self.get_delta_weight(active_adapter).to(orig_dtype)
                # the check rejects any non-finite value (NaN as well as +/-inf)
                if not torch.isfinite(orig_weight_in).all():
                    raise ValueError(
                        f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
                    )

                # merging out_proj (subclass of nn.Linear)
                orig_weight_out = base_layer.out_proj.weight.data.detach().clone()
                orig_weight_out += base_layer.out_proj.get_delta_weight(active_adapter).to(orig_dtype)
                if not torch.isfinite(orig_weight_out).all():
                    raise ValueError(
                        f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
                    )

                # unregister parameter implicitly and overwrite using merged weights; gradients are computed after
                # forward and, thus, after unmerging (see forward()), therefore this is safe to do.
                # Both weights are only replaced after both checks passed, so a failed check leaves the layer
                # unmodified.
                del base_layer.in_proj_weight
                base_layer.in_proj_weight = orig_weight_in

                del base_layer.out_proj.get_base_layer().weight
                base_layer.out_proj.get_base_layer().weight = orig_weight_out
                base_layer.out_proj.merge(adapter_names=[active_adapter])
            else:
                # merging in_proj (nn.Parameter)
                # TODO: work with separate weights
                delta_weight = self.get_delta_weight(active_adapter).to(orig_dtype)
                weight_merged = base_layer.in_proj_weight.data.detach() + delta_weight

                # unregister parameter implicitly and overwrite using merged weights; gradients are computed after
                # forward and, thus, after unmerging (see forward()), therefore this is safe to do.
                del base_layer.in_proj_weight
                base_layer.in_proj_weight = weight_merged

                # merging out_proj (subclass of nn.Linear)
                delta_weight = base_layer.out_proj.get_delta_weight(active_adapter).to(orig_dtype)
                weight_merged = base_layer.out_proj.weight.data.detach() + delta_weight
                del base_layer.out_proj.get_base_layer().weight
                base_layer.out_proj.get_base_layer().weight = weight_merged
                base_layer.out_proj.merge(adapter_names=[active_adapter])
            # record the adapter so that `unmerge` knows what to revert
            self.merged_adapters.append(active_adapter)
def get_delta_weight(self, adapter) -> torch.Tensor:
    """
    Compute the delta weight for the given adapter.

    The delta weight is `(lora_B.weight @ lora_A.weight) * scaling` for the in_proj weight of the attention
    layer.

    Args:
        adapter (str):
            The name of the adapter for which the delta weight should be computed.

    Returns:
        `torch.Tensor`: The delta weight in the dtype of the `lora_B` weight of this adapter.
    """
    device = self.lora_B[adapter].weight.device
    dtype = self.lora_B[adapter].weight.dtype

    # In case users wants to merge the adapter weights that are in
    # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
    # (b)float16 because some CPUs have slow bf16/fp16 matmuls. This mirrors the handling in the other
    # `get_delta_weight` implementations of this module.
    cast_to_fp32 = device.type == "cpu" and dtype in (torch.float16, torch.bfloat16)

    weight_A = self.lora_A[adapter].weight
    weight_B = self.lora_B[adapter].weight

    if cast_to_fp32:
        weight_A = weight_A.float()
        weight_B = weight_B.float()

    output_tensor = (weight_B @ weight_A) * self.scaling[adapter]

    if cast_to_fp32:
        output_tensor = output_tensor.to(dtype=dtype)

        # cast back the weights
        self.lora_A[adapter].weight.data = weight_A.to(dtype)
        self.lora_B[adapter].weight.data = weight_B.to(dtype)

    return output_tensor
# The decorator is needed in case low_cpu_mem_usage=True is used, as we don't want the base layer weights to be
# moved to meta device. This requires the use of PEFT's implementation of init_empty_weight instead of using the one
# from accelerate.
@skip_init_on_device
def _restore_weights(self):
    """
    Re-register the in_proj and out_proj weights as parameters on their modules.

    Merging and unmerging replace the weights by plain attribute assignment (which is necessary for forward to
    work correctly), so the modules "forget" that these attributes are parameters. `register_parameter` is
    therefore called explicitly here. It cannot be used during merging/unmerging itself because that would cut
    the weights off from the autograd graph. Note that this is hacky, since every method that relies on the
    registered parameters has to call `_restore_weights` first.
    """

    def reregister(module, attr_name):
        # drop the attribute, then register the same data as a proper parameter, keeping `requires_grad`
        current = getattr(module, attr_name)
        delattr(module, attr_name)
        module.register_parameter(attr_name, nn.Parameter(current.data, requires_grad=current.requires_grad))

    # TODO work with separate weights
    attention = self.get_base_layer()
    reregister(attention, "in_proj_weight")
    reregister(attention.out_proj.get_base_layer(), "weight")
# copied from:
# https://github.com/pytorch/pytorch/blob/5e386eec9426f174eea130c0c012d9f65ebe65fb/torch/nn/utils/parametrize.py#L75-L79
def _register_parameter_or_buffer(module, name, X):
    """Register `X` on `module` under `name`: as a parameter if it is an `nn.Parameter`, otherwise as a buffer."""
    register = module.register_parameter if isinstance(X, nn.Parameter) else module.register_buffer
    register(name, X)
def __init__(
    self,
    base_layer,
    adapter_name: str,
    parameter_name: str,
    r: int = 0,
    lora_alpha: int = 1,
    lora_dropout: float = 0.0,
    fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
    is_target_conv_1d_layer: bool = False,
    init_lora_weights: Union[bool, str] = True,
    use_rslora: bool = False,
    use_dora: bool = False,
    lora_bias: bool = False,
    **kwargs,
) -> None:
    """
    Wrap the parameter called `parameter_name` of `base_layer` with a LoRA adapter.

    Raises:
        ValueError: If the targeted parameter is neither 2d nor 3d, or if any of the unsupported options
            (`lora_dropout`, `fan_in_fan_out`, `lora_bias`, `use_dora`, `is_target_conv_1d_layer`) is set.
    """
    super().__init__()
    LoraLayer.__init__(self, base_layer, **kwargs)
    self.parameter_name = parameter_name
    param = self.get_param()

    # Validate the number of dimensions before touching `param.shape`, otherwise a 0d or 1d parameter fails with
    # an unhelpful IndexError instead of the intended error message.
    if param.ndim not in (2, 3):
        raise ValueError(
            f"lora.{self.__class__.__name__} was initialized with {param.ndim} dimensional Parameter, but only 2d "
            "and 3d are supported."
        )

    if param.ndim == 3:
        # 3d parameter: the first dimension is treated as the number of experts
        self.num_experts, self.in_features, self.out_features = param.shape
    else:
        # 2d parameter: a single "expert"
        self.num_experts, self.in_features, self.out_features = 1, param.shape[1], param.shape[0]

    if lora_dropout:
        # It's not possible to factor out x from lora_B(lora_A(dropout(x))), so dropout can't be correctly
        # implemented
        raise ValueError(f"lora.{self.__class__.__name__} does not work with lora_dropout != 0.")
    if fan_in_fan_out:
        raise ValueError(f"lora.{self.__class__.__name__} does not work with fan_in_fan_out.")
    if lora_bias:
        raise ValueError(f"lora.{self.__class__.__name__} does not work with lora_bias=True.")
    if use_dora:
        raise ValueError(f"lora.{self.__class__.__name__} does not work with use_dora=True.")
    if is_target_conv_1d_layer:
        raise ValueError(f"lora.{self.__class__.__name__} does not work with is_target_conv_1d_layer=True.")

    self.fan_in_fan_out = fan_in_fan_out
    self._active_adapter = adapter_name
    self.update_layer(
        adapter_name,
        r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        init_lora_weights=init_lora_weights,
        use_rslora=use_rslora,
        use_dora=use_dora,
        lora_bias=lora_bias,
    )
lora_alpha, + lora_dropout, + init_lora_weights, + use_rslora, + use_dora: bool = False, + use_qalora: bool = False, + lora_bias: bool = False, + qalora_group_size: int = 32, + inference_mode: bool = False, + **kwargs, + ): + # same method as in lora.Linear but taking into account that there can be multiple experts (3d parameter) + # collect the kwargs + kwargs = locals().copy() + del kwargs["self"] + + # This code works for linear layers, override for other layer types + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + lora_variant = self.resolve_lora_variant( + use_dora=use_dora, use_qalora=use_qalora, qalora_group_size=qalora_group_size + ) + if lora_variant is not None: + raise ValueError(f"lora.{self.__class__.__name__} does not work with LoRA variants like DoRA.") + + self.r[adapter_name] = r + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + # It's not possible to factor out x from lora_B(lora_A(dropout(x))), so dropout can't be correctly + # implemented + raise ValueError(f"lora.{self.__class__.__name__} does not work with lora_dropout != 0.") + else: + lora_dropout_layer = nn.Identity() + + self.lora_dropout.update(nn.ModuleDict({adapter_name: lora_dropout_layer})) + # Actual trainable parameters + # Difference to normal update_layer: consider experts. LoRA layers still use nn.Linear for consistency with + # lora.Linear. 
+ self.lora_A[adapter_name] = nn.Linear(self.in_features, r * self.num_experts, bias=False) + self.lora_B[adapter_name] = nn.Linear(r * self.num_experts, self.out_features, bias=lora_bias) + self.lora_bias[adapter_name] = lora_bias + + if use_rslora: + self.scaling[adapter_name] = lora_alpha / math.sqrt(r) + else: + self.scaling[adapter_name] = lora_alpha / r + + self.use_rslora[adapter_name] = use_rslora + + self.use_dora[adapter_name] = use_dora + + # for inits that require access to the base weight, use gather_param_ctx so that the weight is gathered when using DeepSpeed + if isinstance(init_lora_weights, str) and init_lora_weights.startswith("pissa"): + with gather_params_ctx(self.get_base_layer().weight): + self.pissa_init(adapter_name, init_lora_weights) + elif isinstance(init_lora_weights, str) and init_lora_weights.startswith("corda"): + with gather_params_ctx(self.get_base_layer().weight): + self.corda_init(adapter_name, init_lora_weights) + elif isinstance(init_lora_weights, str) and init_lora_weights.lower() == "olora": + with gather_params_ctx(self.get_base_layer().weight): + self.olora_init(adapter_name) + elif init_lora_weights == "loftq": + with gather_params_ctx(self.get_base_layer().weight): + self.loftq_init(adapter_name) + elif init_lora_weights == "eva": + nn.init.zeros_(self.lora_B[adapter_name].weight) + elif init_lora_weights == "orthogonal": + with gather_params_ctx(self.get_base_layer().weight): + self.orthogonal_init(adapter_name) + elif init_lora_weights: + self.reset_lora_parameters(adapter_name, init_lora_weights) + # call this before init of the lora variants + self._move_adapter_to_device_of_base_layer(adapter_name) + + if adapter_name in self.lora_variant: + self.lora_variant[adapter_name].init(self, **kwargs) + + self.set_adapter(self.active_adapters, inference_mode=inference_mode) + + def _move_adapter_to_device_of_base_layer(self, adapter_name: str, device: Optional[torch.device] = None) -> None: + """ + Move the adapter of the 
def get_delta_weight(self, adapter_name, *args, **kwargs):
    """Return the LoRA weight update for ``adapter_name``, on the device and dtype of the wrapped parameter.

    With a single expert the computation is delegated to ``Linear.get_delta_weight``. With several experts
    the stacked LoRA matrices are split per expert and contracted over the rank dimension, giving a tensor
    laid out as (experts, in_features, out_features).

    Args:
        adapter_name: Key of the adapter in ``self.lora_A`` / ``self.lora_B`` / ``self.scaling``.
        *args, **kwargs: Forwarded unchanged to ``Linear.get_delta_weight`` in the single-expert case.
    """
    if self.num_experts == 1:
        delta_weight = Linear.get_delta_weight(self, adapter_name, *args, **kwargs)
    else:
        weight_A = self.lora_A[adapter_name].weight
        weight_B = self.lora_B[adapter_name].weight
        # shape: experts x rank x in_features
        weight_A = weight_A.reshape(self.num_experts, -1, weight_A.shape[-1])
        # shape: out_features x rank x experts
        weight_B = weight_B.reshape(weight_B.shape[0], -1, self.num_experts)
        # fan_in_fan_out must be False, so no transpose call here
        delta_weight = torch.einsum("o r e, e r i -> e i o", weight_B, weight_A) * self.scaling[adapter_name]

    # The delta is added onto the wrapped parameter, so it has to live on the same device with the same dtype.
    param = self.get_param()
    return delta_weight.to(param.device, param.dtype)
def _remove_parametrizations(self):
    """Undo the LoRA parametrization registered on the wrapped parameter.

    If the LoRA proxy is the only parametrization on the parameter, the parametrization is removed through
    PyTorch (without leaving the parametrized value behind). Otherwise only the last ``_LoraParameterProxy``
    entry is deleted and the other parametrizations are left in place.

    Raises:
        ValueError: If the base layer has no parametrization registered for the parameter.
    """
    host = self.get_base_layer()
    name = self.parameter_name
    if name not in host.parametrizations:
        raise ValueError(
            "Something went wrong, please report this issue on PEFT: https://github.com/huggingface/peft/issues"
        )

    stack = host.parametrizations[name]
    if len(stack) == 1:
        # last parametrization, we can safely remove it completely
        nn.utils.parametrize.remove_parametrizations(host, name, leave_parametrized=False)
        return

    # PyTorch cannot remove a single parametrization out of several, so find the LoRA proxy ourselves,
    # scanning from the back of the list.
    proxy_index = next(
        (idx for idx in range(len(stack) - 1, -1, -1) if isinstance(stack[idx], _LoraParameterProxy)),
        None,
    )
    if proxy_index is None:
        # this should not happen, but raising an error is probably not necessary
        warnings.warn(
            f"Could not find any LoRA parametrization on {self}, please open an issue on "
            "https://github.com/huggingface/peft/issues and report this warning."
        )
        return
    del stack[proxy_index]
def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
    """Run the base layer, with the LoRA delta applied to the wrapped parameter when adapters are active.

    - Adapters disabled: any merged adapter is unmerged first, then the plain base layer is called.
    - ``adapter_names`` passed: rejected, mixed adapter batches are not supported.
    - Adapters merged: the base layer already contains the delta and is called directly.
    - Otherwise: the base layer is called while the LoRA parametrization is temporarily active.
    """
    self._check_forward_args(x, *args, **kwargs)
    adapter_names = kwargs.pop("adapter_names", None)

    if self.disable_adapters:
        if self.merged:
            self.unmerge()
        return self.base_layer(x, *args, **kwargs)

    if adapter_names is not None:
        raise ValueError(f"lora.{self.__class__.__name__} does not support mixed batch inference")

    if self.merged:
        return self.base_layer(x, *args, **kwargs)

    with self._activate_lora(self.active_adapters):
        output = self.base_layer(x, *args, **kwargs)
    return output
def dispatch_default(
    target: torch.nn.Module,
    adapter_name: str,
    lora_config: LoraConfig,
    parameter_name: Optional[str] = None,
    **kwargs,
) -> Optional[torch.nn.Module]:
    """Build the stock LoRA layer for ``target``, or return ``None`` if its type is not handled here.

    A non-``None`` ``parameter_name`` always yields a ``ParamWrapper``. Otherwise the type of the (unwrapped)
    target decides which LoRA layer class is used. For ``torch.nn.Linear`` and ``Conv1D`` targets a
    mismatching ``fan_in_fan_out`` is corrected, with a warning, in both ``kwargs`` and ``lora_config``.
    """
    # Look through an already-wrapped tuner layer so the type checks see the underlying module.
    inner = target.get_base_layer() if isinstance(target, BaseTunerLayer) else target

    if parameter_name is not None:
        return ParamWrapper(target, adapter_name, parameter_name=parameter_name, **kwargs)

    if isinstance(inner, torch.nn.Embedding):
        # Embeddings get their own kwargs copy without fan_in_fan_out.
        embedding_kwargs = kwargs.copy()
        embedding_kwargs.pop("fan_in_fan_out", None)
        embedding_kwargs.update(lora_config.loftq_config)
        return Embedding(target, adapter_name, **embedding_kwargs)

    # Layer types that only need the LoftQ settings merged in; the order of the checks is preserved.
    plain_layers = (
        (torch.nn.Conv2d, Conv2d),
        (torch.nn.Conv3d, Conv3d),
        (nn.Conv1d, Conv1d),
        (torch.nn.MultiheadAttention, MultiheadAttention),
    )
    for torch_cls, lora_cls in plain_layers:
        if isinstance(inner, torch_cls):
            kwargs.update(lora_config.loftq_config)
            return lora_cls(target, adapter_name, **kwargs)

    if isinstance(inner, torch.nn.Linear):
        if kwargs["fan_in_fan_out"]:
            warnings.warn(
                "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
                "Setting fan_in_fan_out to False."
            )
            kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False
        kwargs.update(lora_config.loftq_config)
        return Linear(target, adapter_name, **kwargs)

    if isinstance(inner, Conv1D):
        if not kwargs["fan_in_fan_out"]:
            warnings.warn(
                "fan_in_fan_out is set to False but the target module is `Conv1D`. Setting fan_in_fan_out to True."
            )
            kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True
        kwargs.update(lora_config.loftq_config)
        return Linear(target, adapter_name, is_target_conv_1d_layer=True, **kwargs)

    return None
def _adapter_names_pre_forward_hook(target, args, kwargs, adapter_names):
    """Forward pre-hook that injects ``adapter_names`` into the call's keyword arguments.

    Used for mixed adapter batches inference. ``kwargs`` is updated in place and handed back together with
    the untouched positional ``args``.
    """
    kwargs.update(adapter_names=adapter_names)
    return args, kwargs


def _alora_offsets_pre_forward_hook(target, args, kwargs, alora_offsets):
    """Forward pre-hook that injects ``alora_offsets`` into the call's keyword arguments (in place)."""
    kwargs.update(alora_offsets=alora_offsets)
    return args, kwargs
+ low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the loading process. + + Returns: + `torch.nn.Module`: The Lora model. + + Example: + + ```py + >>> from transformers import AutoModelForSeq2SeqLM + >>> from peft import LoraModel, LoraConfig + + >>> config = LoraConfig( + ... task_type="SEQ_2_SEQ_LM", + ... r=8, + ... lora_alpha=32, + ... target_modules=["q", "v"], + ... lora_dropout=0.01, + ... ) + + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + >>> lora_model = LoraModel(model, config, "default") + ``` + + ```py + >>> import torch + >>> import transformers + >>> from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training + + >>> rank = ... + >>> target_modules = ["q_proj", "k_proj", "v_proj", "out_proj", "fc_in", "fc_out", "wte"] + >>> config = LoraConfig( + ... r=4, lora_alpha=16, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM" + ... ) + >>> quantization_config = transformers.BitsAndBytesConfig(load_in_8bit=True) + + >>> tokenizer = transformers.AutoTokenizer.from_pretrained( + ... "kakaobrain/kogpt", + ... revision="KoGPT6B-ryan1.5b-float16", # or float32 version: revision=KoGPT6B-ryan1.5b + ... bos_token="[BOS]", + ... eos_token="[EOS]", + ... unk_token="[UNK]", + ... pad_token="[PAD]", + ... mask_token="[MASK]", + ... ) + >>> model = transformers.GPTJForCausalLM.from_pretrained( + ... "kakaobrain/kogpt", + ... revision="KoGPT6B-ryan1.5b-float16", # or float32 version: revision=KoGPT6B-ryan1.5b + ... pad_token_id=tokenizer.eos_token_id, + ... use_cache=False, + ... device_map={"": rank}, + ... torch_dtype=torch.float16, + ... quantization_config=quantization_config, + ... ) + >>> model = prepare_model_for_kbit_training(model) + >>> lora_model = get_peft_model(model, config) + ``` + + **Attributes**: + - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted. 
def _prepare_model(self, peft_config: LoraConfig, model: nn.Module):
    r"""
    Adjust the structure of the model before the adapter is applied.

    The only adjustment made here is layer replication, and only when the config asks for it.

    Args:
        peft_config (`PeftConfig`):
            The prepared adapter config.
        model (`nn.Module`):
            The model that is going to be adapted.
    """
    layer_map = peft_config.layer_replication
    if not layer_map:
        return
    replicate_layers(model, layer_map)
+ ) + + # Regexp matching - Find key which matches current target_name in patterns provided + r_key = get_pattern_key(lora_config.rank_pattern.keys(), current_key) + alpha_key = get_pattern_key(lora_config.alpha_pattern.keys(), current_key) + r = lora_config.rank_pattern.get(r_key, lora_config.r) + alpha = lora_config.alpha_pattern.get(alpha_key, lora_config.lora_alpha) + + kwargs = { + "r": r, + "lora_alpha": alpha, + "lora_dropout": lora_config.lora_dropout, + "fan_in_fan_out": lora_config.fan_in_fan_out, + "init_lora_weights": lora_config.init_lora_weights, + "use_rslora": lora_config.use_rslora, + "use_dora": lora_config.use_dora, + "use_alora": lora_config.alora_invocation_tokens is not None, + "use_qalora": lora_config.use_qalora, + "qalora_group_size": lora_config.qalora_group_size, + "ephemeral_gpu_offload": lora_config.runtime_config.ephemeral_gpu_offload, + "lora_bias": lora_config.lora_bias, + "arrow_config": lora_config.arrow_config, + "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False), + "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False), + "parameter_name": parameter_name, + } + + # for torchao merging, we need the get_apply_tensor_subclass from the quantization config + try: + kwargs["get_apply_tensor_subclass"] = operator.attrgetter( + "hf_quantizer.quantization_config.get_apply_tensor_subclass" + )(self.model) + except AttributeError: + pass + + quant_methods = ["gptq", "aqlm", "awq"] + for quant_method in quant_methods: + quantization_config = get_quantization_config(self.model, method=quant_method) + if quantization_config is not None: + kwargs[f"{quant_method}_quantization_config"] = quantization_config + + # note: AdaLoraLayer is a subclass of LoraLayer, we need to exclude it + from peft.tuners.adalora import AdaLoraLayer + + # if the target is a ParamWrapper, we nest it to allow targeting multiple nn.Parameter on the same module + wrap_target_param = isinstance(target, ParamWrapper) and (adapter_name in 
def _replace_module(self, parent, child_name, new_module, child):
    """Install ``new_module`` on ``parent`` and move its adapter sub-modules to the device of the old layer.

    Overridden in LoraModel so that quantized weights are handled properly. Setting ``requires_grad`` is
    not needed here, as that is handled by ``_mark_only_adapters_as_trainable``.
    """
    setattr(parent, child_name, new_module)

    # child layer wraps the original module, unpack it
    wrapped = child.base_layer if hasattr(child, "base_layer") else child

    def reference_weight():
        # First tensor found, in this order, decides the target device.
        for attr in ("qweight", "W_q", "weight"):
            if hasattr(wrapped, attr):
                return getattr(wrapped, attr)
        if getattr(wrapped, "in_proj_weight", None) is not None:  # MHA
            return wrapped.in_proj_weight
        return next(wrapped.parameters())

    meta = torch.device("meta")
    # dispatch to correct device
    for name, module in new_module.named_modules():
        if self.prefix not in name and "ranknum" not in name:
            continue
        weight = reference_weight()
        # modules holding meta-device parameters are left where they are
        if all(p.device != meta for p in module.parameters()):
            module.to(weight.device)
@contextmanager
def _enable_peft_forward_hooks(self, *args, **kwargs):
    """Temporarily register forward pre-hooks that inject ``adapter_names`` / ``alora_offsets``.

    Both keys are popped from ``kwargs``. If neither is given, nothing is registered. Otherwise hooks are
    installed on the relevant layers for the duration of the ``with`` block and always removed afterwards,
    also when validation fails half-way through or the body of the ``with`` block raises.

    Raises:
        ValueError: If beam search is combined with ``alora_offsets``, if ``adapter_names`` is passed in
            training mode, or if ``adapter_names`` contains names no LoRA layer knows.
        TypeError: If beam search is used and ``adapter_names`` is neither a list nor a tuple.
    """
    # If adapter_names is passed as an argument, we inject it into the forward arguments.
    adapter_names = kwargs.pop("adapter_names", None)
    alora_offsets = kwargs.pop("alora_offsets", None)
    if adapter_names is None and alora_offsets is None:
        # nothing to do
        yield
        return

    hook_handles = []
    # Everything that can register a hook, and the yield itself, sits inside the try so that the finally
    # clause removes whatever was registered, no matter where an exception is raised.
    try:
        if alora_offsets is not None:
            for layer in self.modules():
                if isinstance(layer, LoraLayer):
                    pre_forward = partial(_alora_offsets_pre_forward_hook, alora_offsets=alora_offsets)
                    handle = layer.register_forward_pre_hook(pre_forward, with_kwargs=True)
                    hook_handles.append(handle)

        num_beams = kwargs.get("num_beams", None)
        uses_beam_search = isinstance(num_beams, int) and (num_beams > 1)
        if uses_beam_search and alora_offsets is not None:
            raise ValueError("Beam search not yet supported for aLoRA.")

        if adapter_names is not None:
            if self.training:
                raise ValueError("Cannot pass `adapter_names` when the model is in training mode.")

            # Check that users only passed actually existing adapters.
            # Note: We cannot do this on the layer level, as each individual layer may not have each adapter.
            # Still, we want to check that there is at least one layer with the given name, or else something
            # like typos can easily slip.
            expected_adapters = set()
            for layer in self.modules():
                if isinstance(layer, LoraLayer):
                    expected_adapters |= layer.lora_A.keys()
                    expected_adapters |= layer.lora_embedding_A.keys()
            unique_adapters = {name for name in adapter_names if name != "__base__"}
            unexpected_adapters = unique_adapters - expected_adapters
            if unexpected_adapters:
                raise ValueError(
                    f"Trying to infer with non-existing adapter(s): {', '.join(sorted(unexpected_adapters))}"
                )

            # deal with beam search
            original_adapter_names = adapter_names[:]
            if uses_beam_search:
                if not isinstance(adapter_names, (list, tuple)):
                    raise TypeError(f"Got adapter names of type {type(adapter_names)}, expected a list of str.")
                # When there is beam search, the inputs are repeated n times, thus we repeat each adapter name
                # n times and then flatten the nested list. For encoder-decoder models, this extended list
                # should not be applied to the encoder part. Further below, the original argument is thus
                # restored for the encoder.
                adapter_names = sum(([n] * kwargs["num_beams"] for n in adapter_names), [])

            for module in self.modules():
                if isinstance(module, (LoraLayer, AuxiliaryTrainingWrapper)):
                    pre_forward = partial(_adapter_names_pre_forward_hook, adapter_names=adapter_names)
                    handle = module.register_forward_pre_hook(pre_forward, with_kwargs=True)
                    hook_handles.append(handle)

            if uses_beam_search and hasattr(self.model, "get_encoder"):
                # For encoder-decoder models, even when applying beam search, the encoder part of the model
                # should not use the extended adapter_names. This is because the encoder still uses the
                # original, non-extended samples.
                for module in self.model.get_encoder().modules():
                    if isinstance(module, (LoraLayer, AuxiliaryTrainingWrapper)):
                        # Add another hook to overwrite the kwargs with the original adapter names -- this is
                        # easier than trying to exclude the encoder.
                        pre_forward = partial(_adapter_names_pre_forward_hook, adapter_names=original_adapter_names)
                        handle = module.register_forward_pre_hook(pre_forward, with_kwargs=True)
                        hook_handles.append(handle)

        yield
    finally:
        for handle in hook_handles:
            handle.remove()


def _check_merge_allowed(self):
    """Verify that the configuration supports merging.

    Currently gptq quantization and replicated layers do not support merging.

    Raises:
        ValueError: If the model is gptq quantized or any adapter config uses ``layer_replication``.
    """
    super()._check_merge_allowed()
    if getattr(self.model, "quantization_method", None) == "gptq":
        raise ValueError("Cannot merge LORA layers when the model is gptq quantized")
    # ``peft_config`` maps adapter names to configs, so the setting has to be looked up on each config
    # rather than as a key of the mapping itself.
    if any(getattr(config, "layer_replication", None) for config in self.peft_config.values()):
        raise ValueError("Cannot merge LORA layers when base model layers are replicated")


def _prepare_adapter_config(self, peft_config, model_config):
    """Fill in default ``target_modules`` from the model type when the config does not name any.

    Returns the (possibly updated) ``peft_config``.

    Raises:
        ValueError: If ``target_modules`` is unset, the model type has no default mapping and
            ``target_parameters`` is empty as well.
    """
    if peft_config.target_modules is None:
        if model_config["model_type"] in self.target_module_mapping:
            peft_config.target_modules = set(self.target_module_mapping[model_config["model_type"]])
        elif not peft_config.target_parameters:
            raise ValueError("Please specify `target_modules` or `target_parameters`in `peft_config`")
    return peft_config
First, find the ModulesToSaveWrapper instances in the model, then check if they + # have modules for the adapters to be merged. + modules_to_save_wrappers = [module for module in self.modules() if isinstance(module, ModulesToSaveWrapper)] + problematic_wrappers = [ + wrapper + for wrapper in modules_to_save_wrappers + if sum(adapter in wrapper.modules_to_save for adapter in adapters) > 1 + ] + if problematic_wrappers: + raise ValueError( + "Cannot add weighted adapters if they target the same module with modules_to_save, but found " + f"{len(problematic_wrappers)} such instance(s)." + ) + + # if there is only one adapter, we can only use linear merging + combination_type = "linear" if len(adapters) == 1 else combination_type + + adapters_ranks: list[int] = [ + # When allocating tensors for the new adapter, we need the maximum possible rank to not overflow + config.r if not config.rank_pattern else max(config.r, *config.rank_pattern.values()) + for config in (self.peft_config[adapter] for adapter in adapters) + ] + + if combination_type in ("linear", "ties", "dare_ties", "dare_linear", "magnitude_prune"): + # all adapters ranks should be same, new rank is just this value + if len(set(adapters_ranks)) != 1: + raise ValueError( + "All adapters must have the same r value when using combination_type linear, ties, dare_ties or " + "dare_linear." 
+ ) + new_rank = adapters_ranks[0] + elif combination_type == "cat": + # adapters ranks may be different, new rank is sum of all ranks + # be careful, because output adapter rank may be really big if mixing a lot of adapters + new_rank = sum(adapters_ranks) + elif combination_type.endswith("svd"): + # new rank is the max of all ranks of the adapters if not provided + new_rank = svd_rank or max(adapters_ranks) + else: + raise ValueError(f"Invalid combination_type: {combination_type}") + + target_module_types = [type(self.peft_config[adapter].target_modules) for adapter in adapters] + if not target_module_types: + raise ValueError(f"Found no adapter matching the names in {adapters}") + if len(set(target_module_types)) > 1: + raise ValueError( + "all adapter configs should follow the same target modules type. " + "Combining adapters with `target_modules` type being a mix of list/set and string is not supported." + ) + + if target_module_types[0] is str: + new_target_modules = "|".join(f"({self.peft_config[adapter].target_modules})" for adapter in adapters) + elif target_module_types[0] is set: + new_target_modules = reduce( + operator.or_, (self.peft_config[adapter].target_modules for adapter in adapters) + ) + else: + raise TypeError(f"Invalid type {target_module_types[0]} found in target_modules") + + return combination_type, new_rank, new_target_modules + + def add_weighted_adapter( + self, + adapters: list[str], + weights: list[float], + adapter_name: str, + combination_type: str = "svd", + svd_rank: int | None = None, + svd_clamp: int | None = None, + svd_full_matrices: bool = True, + svd_driver: str | None = None, + density: float | None = None, + majority_sign_method: Literal["total", "frequency"] = "total", + ) -> None: + """ + This method adds a new adapter by merging the given adapters with the given weights. + + When using the `cat` combination_type you should be aware that rank of the resulting adapter will be equal to + the sum of all adapters ranks. 
So it's possible that the mixed adapter may become too big and result in OOM + errors. + + Args: + adapters (`list`): + List of adapter names to be merged. + weights (`list`): + List of weights for each adapter. Weights can be positive or negative, allowing for both addition and + subtraction of adapter effects. + adapter_name (`str`): + Name of the new adapter. + combination_type (`str`): + The merging type can be one of [`svd`, `linear`, `cat`, `ties`, `ties_svd`, `dare_ties`, `dare_linear`, + `dare_ties_svd`, `dare_linear_svd`, `magnitude_prune`, `magnitude_prune_svd`]. When using the `cat` + combination_type, the rank of the resulting adapter is equal to the sum of all adapters ranks (the + mixed adapter may be too big and result in OOM errors). + svd_rank (`int`, *optional*): + Rank of output adapter for svd. If None provided, will use max rank of merging adapters. + svd_clamp (`float`, *optional*): + A quantile threshold for clamping SVD decomposition output. If None is provided, do not perform + clamping. Defaults to None. + svd_full_matrices (`bool`, *optional*): + Controls whether to compute the full or reduced SVD, and consequently, the shape of the returned + tensors U and Vh. Defaults to True. + svd_driver (`str`, *optional*): + Name of the cuSOLVER method to be used. This keyword argument only works when merging on CUDA. Can be + one of [None, `gesvd`, `gesvdj`, `gesvda`]. For more info please refer to `torch.linalg.svd` + documentation. Defaults to None. + density (`float`, *optional*): + Value between 0 and 1. 0 means all values are pruned and 1 means no values are pruned. Should be used + with [`ties`, `ties_svd`, `dare_ties`, `dare_linear`, `dare_ties_svd`, `dare_linear_svd`, + `magnintude_prune`, `magnitude_prune_svd`] + majority_sign_method (`str`): + The method, should be one of ["total", "frequency"], to use to get the magnitude of the sign values. 
+ Should be used with [`ties`, `ties_svd`, `dare_ties`, `dare_ties_svd`] + """ + + if adapter_name in list(self.peft_config.keys()): + return + + combination_type, new_rank, new_target_modules = self._check_add_weighted_adapter( + adapters=adapters, + combination_type=combination_type, + svd_rank=svd_rank, + ) + + self.peft_config[adapter_name] = replace( + self.peft_config[adapters[0]], + r=new_rank, + lora_alpha=new_rank, + target_modules=new_target_modules, + alpha_pattern={}, + rank_pattern={}, + ) + self.inject_adapter(self.model, adapter_name) + + # Do we really need that? + _freeze_adapter(self.model, adapter_name) + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, LoraLayer): + if adapter_name in target.lora_A: + target_lora_A = target.lora_A[adapter_name].weight + target_lora_B = target.lora_B[adapter_name].weight + elif adapter_name in target.lora_embedding_A: + target_lora_A = target.lora_embedding_A[adapter_name] + target_lora_B = target.lora_embedding_B[adapter_name] + else: + continue + + target_lora_A.data = target_lora_A.data * 0.0 + target_lora_B.data = target_lora_B.data * 0.0 + if combination_type == "cat": + loras_A, loras_B = [], [] + for adapter, weight in zip(adapters, weights): + if adapter in target.lora_A: + current_adapter_lora_A = target.lora_A[adapter].weight + current_adapter_lora_B = target.lora_B[adapter].weight + elif adapter in target.lora_embedding_A: + current_adapter_lora_A = target.lora_embedding_A[adapter] + current_adapter_lora_B = target.lora_embedding_B[adapter] + else: + continue + loras_A.append(current_adapter_lora_A.data * weight * target.scaling[adapter]) + loras_B.append(current_adapter_lora_B.data) + + if len(loras_A) == 0: + raise ValueError("No matching LoRAs found. 
Please raise an issue on GitHub.") + loras_A = torch.cat(loras_A, dim=0) + loras_B = torch.cat(loras_B, dim=1) + target_lora_A.data[: loras_A.shape[0], :] = loras_A + target_lora_B.data[:, : loras_B.shape[1]] = loras_B + elif combination_type in [ + "svd", + "ties_svd", + "dare_linear_svd", + "dare_ties_svd", + "magnitude_prune_svd", + ]: + target_lora_A.data, target_lora_B.data = self._svd_generalized_task_arithmetic_weighted_adapter( + combination_type, + adapters, + weights, + new_rank, + target, + target_lora_A, + target_lora_B, + density, + majority_sign_method, + svd_clamp, + full_matrices=svd_full_matrices, + driver=svd_driver, + ) + elif combination_type in ["linear", "ties", "dare_linear", "dare_ties", "magnitude_prune"]: + target_lora_A.data, target_lora_B.data = self._generalized_task_arithmetic_weighted_adapter( + combination_type, adapters, weights, target, density, majority_sign_method + ) + + def _svd_generalized_task_arithmetic_weighted_adapter( + self, + combination_type, + adapters, + weights, + new_rank, + target, + target_lora_A, + target_lora_B, + density, + majority_sign_method, + clamp=None, + full_matrices=True, + driver=None, + ): + valid_adapters = [] + valid_weights = [] + is_embedding = any(adapter in target.lora_embedding_A for adapter in adapters) + for adapter, weight in zip(adapters, weights): + if adapter in target.lora_A or adapter in target.lora_embedding_A: + valid_adapters.append(adapter) + valid_weights.append(weight * target.scaling[adapter]) + + # if no valid adapter, nothing to do + if len(valid_adapters) == 0: + raise ValueError("No matching LoRAs found. 
Please raise an issue on Github.") + delta_weight = [target.get_delta_weight(adapter) for adapter in valid_adapters] + valid_weights = torch.tensor(valid_weights).to(delta_weight[0].device) + if combination_type == "svd": + delta_weight = task_arithmetic(delta_weight, valid_weights) + elif combination_type == "ties_svd": + delta_weight = ties(delta_weight, valid_weights, density, majority_sign_method) + elif combination_type == "dare_linear_svd": + delta_weight = dare_linear(delta_weight, valid_weights, density) + elif combination_type == "dare_ties_svd": + delta_weight = dare_ties(delta_weight, valid_weights, density, majority_sign_method) + elif combination_type == "magnitude_prune_svd": + delta_weight = magnitude_prune(delta_weight, valid_weights, density) + else: + raise ValueError(f"Invalid value passed to combination type: {combination_type}") + + conv2d = isinstance(target, Conv2d) + if conv2d: + conv2d_1x1 = target.weight.size()[2:4] == (1, 1) + if not conv2d_1x1: + delta_weight = delta_weight.flatten(start_dim=1) + else: + delta_weight = delta_weight.squeeze() + if (hasattr(target, "fan_in_fan_out") and target.fan_in_fan_out) or is_embedding: + delta_weight = delta_weight.T + + # based on https://github.com/kohya-ss/sd-scripts/blob/main/networks/svd_merge_lora.py#L114-L131 + U, S, Vh = torch.linalg.svd(delta_weight, full_matrices=full_matrices, driver=driver) + U = U[:, :new_rank] + S = S[:new_rank] + U = U @ torch.diag(S) + Vh = Vh[:new_rank, :] + if clamp is not None: + dist = torch.cat([U.flatten(), Vh.flatten()]) + hi_val = torch.quantile(dist, clamp) + low_val = -hi_val + U = U.clamp(low_val, hi_val) + Vh = Vh.clamp(low_val, hi_val) + if conv2d: + U = U.reshape(target_lora_B.data.shape) + Vh = Vh.reshape(target_lora_A.data.shape) + return Vh, U + + def _generalized_task_arithmetic_weighted_adapter( + self, + combination_type, + adapters, + weights, + target, + density, + majority_sign_method, + ): + # account weights for LoRA A and B layers. 
+ valid_weights = [] + lora_A_deltas = [] + lora_B_deltas = [] + for adapter, weight in zip(adapters, weights): + if adapter in target.lora_A: + current_adapter_lora_A = target.lora_A[adapter].weight + current_adapter_lora_B = target.lora_B[adapter].weight + elif adapter in target.lora_embedding_A: + current_adapter_lora_A = target.lora_embedding_A[adapter] + current_adapter_lora_B = target.lora_embedding_B[adapter] + else: + continue + # Support negative weights: take absolute value for sqrt, then apply sign + weight_with_scaling = weight * target.scaling[adapter] + sign = 1 if weight_with_scaling >= 0 else -1 + valid_weights.append(sign * math.sqrt(abs(weight_with_scaling))) + lora_A_deltas.append(current_adapter_lora_A.data) + lora_B_deltas.append(current_adapter_lora_B.data) + valid_weights = torch.tensor(valid_weights).to(lora_A_deltas[0].device) + lora_deltas = [lora_A_deltas, lora_B_deltas] + dtype = lora_A_deltas[0].dtype + for i, task_tensors in enumerate(lora_deltas): + if combination_type == "linear": + lora_deltas[i] = task_arithmetic(task_tensors, valid_weights) + elif combination_type == "ties": + lora_deltas[i] = ties(task_tensors, valid_weights, density, majority_sign_method) + elif combination_type == "dare_linear": + lora_deltas[i] = dare_linear(task_tensors, valid_weights, density) + elif combination_type == "dare_ties": + lora_deltas[i] = dare_ties(task_tensors, valid_weights, density, majority_sign_method) + elif combination_type == "magnitude_prune": + lora_deltas[i] = magnitude_prune(task_tensors, valid_weights, density) + else: + raise ValueError("Invalid combination type") + lora_deltas = [delta.to(dtype) for delta in lora_deltas] + return lora_deltas + + def subtract_mutated_init(self, output_state_dict: dict[str, torch.Tensor], adapter_name: str, kwargs=None): + """ + This function can calculate the updates of the PiSSA/CorDA/OLoRA by comparing the parameters of the + PiSSA/CorDA/OLoRA adapter in `output_state_dict` with the initial 
values of PiSSA/CorDA/OLoRA in + `adapter_name`, thus converting PiSSA/CorDA/OLoRA to LoRA. + """ + for name, param in self.model.named_parameters(): + if ( + param.data.dtype != torch.float32 + and param.data.dtype != torch.float16 + and param.data.dtype != torch.bfloat16 + ) and adapter_name.startswith("pissa"): + warnings.warn( + r"Note that Quant(W_res) + AB != Quant(W) + \Delta(AB); " + "the converted LoRA, when combined with W or Quant(W), may introduce a certain gap in the fine-tuned model. " + "Therefore, we recommend directly using the Quant(W_res) in conjunction with the PiSSA adapter. " + ) + mutated_init_state_dict = get_peft_model_state_dict( + self, + state_dict=kwargs.get("state_dict", None), + adapter_name=adapter_name, + ) + tensors_lora = {} + for name in output_state_dict.keys(): + ## W = W^{res} + A_0 \times B_0, + ## W + \Delta W = W^{res} + A \times B, + ## \Delta W = A \times B - A_0 \times B_0 = [A | A_0] \times [B | -B_0]^T = A'B'. + if "lora_A" in name: + tensors_lora[name] = torch.cat( + [output_state_dict[name], mutated_init_state_dict[".".join(name.split(".")[1:])]], dim=0 + ) + elif "lora_B" in name: + tensors_lora[name] = torch.cat( + [output_state_dict[name], -mutated_init_state_dict[".".join(name.split(".")[1:])]], dim=1 + ) + + return tensors_lora diff --git a/peft/src/peft/tuners/lora/torchao.py b/peft/src/peft/tuners/lora/torchao.py new file mode 100644 index 0000000000000000000000000000000000000000..5e7240a053502bd2be450fd87187c756eeb0c17c --- /dev/null +++ b/peft/src/peft/tuners/lora/torchao.py @@ -0,0 +1,156 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import warnings +from typing import Any, Optional + +import torch + +# from torch import nn +from peft.import_utils import is_torchao_available +from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge + +from .config import LoraConfig +from .layer import Linear + + +class TorchaoLoraLinear(Linear): + """LoRA layer implementation for Linear layers using torchao data""" + + def __init__(self, *args, get_apply_tensor_subclass, **kwargs): + # this is not strictly necessary, as kwargs are stored either way, but we want to error early if + # get_apply_tensor_subclass is missing. 
+ if kwargs.get("lora_bias", False): + raise ValueError(f"{self.__class__.__name__} does not support lora_bias yet, set it to False") + + super().__init__(*args, **kwargs) + self.get_apply_tensor_subclass = get_apply_tensor_subclass + self._check_dtype_supported() + + def _check_dtype_supported(self): + # TODO: Not required once int4_weight_only is properly supported by torchao + base_layer = self.get_base_layer() + weight = base_layer.weight + # pytest tests/test_gpu_examples.py::PeftTorchaoGPUTests::test_causal_lm_training_single_gpu_torchao_0_int8_weight_only + if ( + # torchao 0.7.0+ + (hasattr(weight, "tensor_impl") and (weight.tensor_impl.data.dtype != torch.int8)) + or + # torchao < 0.7.0 + (hasattr(weight, "layout_tensor") and (weight.layout_tensor.data.dtype != torch.int8)) + ): + raise ValueError(f"{type(self).__name__} only supports int8 weights for now.") + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + from torchao import quantize_ + + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + self._check_dtype_supported() + + base_layer = self.get_base_layer() + weight = base_layer.weight + + for active_adapter in adapter_names: + try: + weight = weight.dequantize() + except NotImplementedError as exc: + msg = ( + f"Weights of type {type(weight).__name__} do not support dequantization (yet), which is needed to " + "support merging." + ) + raise NotImplementedError(msg) from exc + + if safe_merge and not torch.isfinite(weight).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + weight += self.get_delta_weight(active_adapter) + # TODO: once (if) torchao supports directly mutating the data, use that instead. 
+ del base_layer.weight + base_layer.weight = weight + quantize_(base_layer, self.get_apply_tensor_subclass()) + del weight + + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + from torchao import quantize_ + + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter not in self.lora_A.keys(): + continue + + base_layer = self.get_base_layer() + weight = base_layer.weight + try: + weight = weight.dequantize() + except NotImplementedError as exc: + msg = ( + f"Weights of type {type(weight).__name__} do not support dequantization (yet), which is needed to " + "support unmerging." + ) + raise NotImplementedError(msg) from exc + + weight -= self.get_delta_weight(active_adapter) + # We go through a dummy module because overriding the weight.data does not work, the tensor retains the old + # data. Therefore, we need to go through quantize_, which takes a module as input, and we need to delete and + # re-assign the weight. + # TODO: once (if) torchao supports directly mutating the data, use that instead. 
+ del base_layer.weight + base_layer.weight = weight + quantize_(base_layer, self.get_apply_tensor_subclass()) + del weight + + def __repr__(self) -> str: + rep = super().__repr__() + return rep.replace("lora.Linear", f"lora.{self.__class__.__name__}") + + +def dispatch_torchao( + target: torch.nn.Module, + adapter_name: str, + lora_config: LoraConfig, + **kwargs: Any, +) -> Optional[torch.nn.Module]: + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if not hasattr(target_base_layer, "weight"): + return new_module + + if not is_torchao_available(): + return new_module + + from torchao.dtypes import AffineQuantizedTensor + from torchao.quantization import LinearActivationQuantizedTensor + + if isinstance(target_base_layer.weight, (AffineQuantizedTensor, LinearActivationQuantizedTensor)): + new_module = TorchaoLoraLinear(target, adapter_name, **kwargs) + + return new_module diff --git a/peft/src/peft/tuners/lora/tp_layer.py b/peft/src/peft/tuners/lora/tp_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..7edd4d3a6ff1f4b7dda6ea62a46bb3c743c3ee12 --- /dev/null +++ b/peft/src/peft/tuners/lora/tp_layer.py @@ -0,0 +1,350 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import importlib +import math +import warnings +from typing import Any, Optional, Union + +import torch +import torch.nn as nn +import torch.nn.init as init + +from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft.utils import transpose +from peft.utils.integrations import gather_params_ctx + +from .layer import LoraLayer + + +class LoraParallelLinear(nn.Module, LoraLayer): + """ + When the target layer parallel_linear is RowParallelLinear, in order to keep the input and output shapes + consistent, we need to split the lora matrix A into rows, and the lora_B at this time should be a complete linear + layer; In the same way, when the target layer is ColumnParallelLinear, we perform column segmentation on lora_B, + while lora_A is still a complete linear layer. + """ + + def __init__( + self, + base_layer, + adapter_name: str, + backend, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + fan_in_fan_out: bool = False, + is_target_conv_1d_layer: bool = False, + init_lora_weights: Union[bool, str] = True, + use_rslora: bool = False, + use_dora: bool = False, + lora_bias: bool = False, + **kwargs, + ): + if lora_bias: + raise ValueError(f"{self.__class__.__name__} does not support lora_bias yet, set it to False") + + super().__init__() + LoraLayer.__init__(self, base_layer=base_layer, **kwargs) + + if use_dora: + raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False") + + self.backend = backend + self.is_parallel_a = isinstance(base_layer, backend.RowParallelLinear) + self.fan_in_fan_out = fan_in_fan_out + self._active_adapter = adapter_name + + megatron_config = kwargs["megatron_config"] + parallel_linear_kwargs = {"megatron_config": megatron_config} + init_method = init.xavier_normal_ + if hasattr(megatron_config, "init_method"): + init_method = megatron_config.init_method + input_is_parallel = True + gather_output = False + if 
self.is_parallel_a: + input_is_parallel = base_layer.input_is_parallel + else: + gather_output = base_layer.gather_output + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + init_method=init_method, + input_is_parallel=input_is_parallel, + gather_output=gather_output, + **parallel_linear_kwargs, + ) + + if is_target_conv_1d_layer: + raise ValueError( + f"{self.__class__.__name__} does not support target_conv_1d_layer yet, please set it to False" + ) + self.is_target_conv_1d_layer = False + + def update_layer( + self, + adapter_name, + r, + lora_alpha, + lora_dropout, + init_lora_weights, + use_rslora, + use_dora=False, + init_method=init.xavier_normal_, + input_is_parallel=True, + gather_output=False, + inference_mode: bool = False, + **parallel_linear_kwargs, + ): + # collect the kwargs + kwargs = locals().copy() + del kwargs["self"] + + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + self.r[adapter_name] = r + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + lora_dropout_layer = nn.Dropout(p=lora_dropout) + else: + lora_dropout_layer = nn.Identity() + + self.lora_dropout[adapter_name] = lora_dropout_layer + + megatron_config = parallel_linear_kwargs["megatron_config"] + # lora needs to be forced to upgrade to 32-bit precision, otherwise it will overflow + megatron_config.params_dtype = torch.float32 + if self.is_parallel_a: + lora_a = self.backend.RowParallelLinear( + input_size=self.in_features, + output_size=r, + bias=False, + input_is_parallel=input_is_parallel, + skip_bias_add=True, + init_method=init_method, + config=megatron_config, + ) + lora_b = nn.Linear(in_features=r, out_features=self.out_features, bias=False, dtype=torch.float32) + else: + lora_a = nn.Linear(in_features=self.in_features, out_features=r, bias=False, dtype=torch.float32) + lora_b = 
self.backend.ColumnParallelLinear( + input_size=r, + output_size=self.out_features, + bias=False, + gather_output=gather_output, + init_method=init_method, + config=megatron_config, + ) + self.lora_A[adapter_name] = lora_a + self.lora_B[adapter_name] = lora_b + if use_rslora: + self.scaling[adapter_name] = lora_alpha / math.sqrt(r) + else: + self.scaling[adapter_name] = lora_alpha / r + + self.use_dora[adapter_name] = use_dora + + # for inits that require access to the base weight, use gather_param_ctx so that the weight is gathered when using DeepSpeed + if isinstance(init_lora_weights, str) and init_lora_weights.startswith("pissa"): + with gather_params_ctx(self.get_base_layer().weight): + self.pissa_init(adapter_name, init_lora_weights) + elif isinstance(init_lora_weights, str) and init_lora_weights.startswith("corda"): + with gather_params_ctx(self.get_base_layer().weight): + self.corda_init(adapter_name, init_lora_weights) + elif isinstance(init_lora_weights, str) and init_lora_weights.lower() == "olora": + with gather_params_ctx(self.get_base_layer().weight): + self.olora_init(adapter_name) + elif init_lora_weights == "loftq": + with gather_params_ctx(self.get_base_layer().weight): + self.loftq_init(adapter_name) + elif init_lora_weights: + self.reset_lora_parameters(adapter_name, init_lora_weights) + + # call this before dora_init + self._move_adapter_to_device_of_base_layer(adapter_name) + + if adapter_name in self.lora_variant: + self.lora_variant[adapter_name].init(self, **kwargs) + + self.set_adapter(self.active_adapters, inference_mode=inference_mode) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any): + self._check_forward_args(x, *args, **kwargs) + adapter_names = kwargs.pop("adapter_names", None) + # If weight is used for matrix multiplication here, the final aggregation operation of the original + # parallel_linear layer will be missing, so we need to directly call its forward function to obtain the + # output of the original 
parallel_linear layer. + if self.disable_adapters: + if self.merged: + self.unmerge() + result, bias = self.base_layer(x, *args, **kwargs) + elif adapter_names is not None: + raise ValueError(f"{self.__class__.__name__} does not support mixed_batch_forward yet.") + elif self.merged: + result, bias = self.base_layer(x, *args, **kwargs) + else: + result, bias = self.base_layer(x, *args, **kwargs) + torch_result_dtype = result.dtype + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + x = self._cast_input_dtype(x, lora_A.weight.dtype) + result = result + lora_B(lora_A(dropout(x))) * scaling + + result = result.to(torch_result_dtype) + return result, bias + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.lora_A.keys(): + base_layer = self.get_base_layer() + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. 
+ orig_weights = base_layer.weight.data.clone() + delta_weight = self.get_delta_weight(active_adapter) + orig_weights = orig_weights + delta_weight + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weights + else: + delta_weight = self.get_delta_weight(active_adapter) + base_layer.weight.data = base_layer.weight.data + delta_weight + + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.lora_A.keys(): + weight = self.get_base_layer().weight + delta_weight = self.get_delta_weight(active_adapter) + weight.data -= delta_weight + + def get_delta_weight(self, adapter) -> torch.Tensor: + """ + Compute the delta weight for the given adapter. + + Args: + adapter (str): + The name of the adapter for which the delta weight should be computed. + """ + device = self.lora_B[adapter].weight.device + dtype = self.lora_B[adapter].weight.dtype + + # In case users wants to merge the adapter weights that are in + # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to + # (b)float16 because some CPUs have slow bf16/fp16 matmuls. 
+ cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16) + + weight_A = self.lora_A[adapter].weight + weight_B = self.lora_B[adapter].weight + + if cast_to_fp32: + weight_A = weight_A.float() + weight_B = weight_B.float() + + output_tensor = transpose(weight_B @ weight_A, self.fan_in_fan_out) * self.scaling[adapter] + + if cast_to_fp32: + output_tensor = output_tensor.to(dtype=dtype) + + # cast back the weights + self.lora_A[adapter].weight.data = weight_A.to(dtype) + self.lora_B[adapter].weight.data = weight_B.to(dtype) + + return output_tensor + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep + + +def dispatch_megatron( + target: torch.nn.Module, + adapter_name: str, + lora_config, + **kwargs: Any, +) -> Optional[torch.nn.Module]: + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if lora_config.megatron_config: + megatron_core = importlib.import_module(lora_config.megatron_core) + else: + megatron_core = None + + if megatron_core and isinstance( + target_base_layer, + (megatron_core.tensor_parallel.ColumnParallelLinear, megatron_core.tensor_parallel.RowParallelLinear), + ): + megatron_kwargs = kwargs.copy() + megatron_config = lora_config.megatron_config + if isinstance(megatron_config, dict): + transformer_config_class = megatron_core.transformer.transformer_config.TransformerConfig + megatron_config = transformer_config_class(**lora_config.megatron_config) + megatron_kwargs["megatron_config"] = megatron_config + if megatron_kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `ColumnParallelLinear` " + "or `RowParallelLinear`. " + "Setting fan_in_fan_out to False." 
+ ) + megatron_kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False + new_module = LoraParallelLinear( + base_layer=target, adapter_name=adapter_name, backend=megatron_core.tensor_parallel, **megatron_kwargs + ) + + return new_module diff --git a/peft/src/peft/tuners/lora/variants.py b/peft/src/peft/tuners/lora/variants.py new file mode 100644 index 0000000000000000000000000000000000000000..54cdfae30c586a92b365d522a3182dc1a10bb137 --- /dev/null +++ b/peft/src/peft/tuners/lora/variants.py @@ -0,0 +1,765 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import collections +import warnings +from typing import Any, Optional + +import torch +from accelerate.utils.imports import is_xpu_available +from torch import nn + +from peft.utils.other import transpose + +from .arrow import ArrowLoraLinearLayer +from .config import PeftConfig +from .dora import DoraConv1dLayer, DoraConv2dLayer, DoraConv3dLayer, DoraEmbeddingLayer, DoraLinearLayer +from .layer import Conv1d, Conv2d, Conv3d, Embedding, Linear, LoraVariant, _ConvNd + + +class ArrowLinearVariant(LoraVariant): + @staticmethod + def init(module: Linear, adapter_name: str, **kwargs): + """ + Initialise the ArrowLoraLinearLayer() inside lora_arrow. lora_arrow is nn.ModuleDict(), serving as a container + for ArrowLoraLinearLayer(). 
A layer of the base model with LoRA adapter loaded on it will be like: + ---------------------------------------------------- + (qkv_proj): lora.Linear4bit or lora.Linear( + (base_layer): Linear4bit or Linear (lora_dropout): ModuleDict( ... ) (lora_A): ModuleDict( ... ) + (lora_B): ModuleDict( ... ) (lora_embedding_A): ParameterDict( ... ) (lora_embedding_B): ParameterDict( + ... ) (lora_magnitude_vector): ModuleDict( ... ) (lora_arrow): ModuleDict( + (arrow_router): ArrowLoraLinearLayer() ) + ) + ---------------------------------------------------- + + Args: + module (Linear): LoRA Layer of the model, containing base_layer, lora_A, lora_B, etc. + adapter_name (str): name of the adapter that will be put in lora_arrow. + The adapter_name is "arrow_router" by default, set in create_arrow_model() in ./arrow.py + """ + # Checking for arrow necessary config + arrow_config = kwargs.get("arrow_config") + if arrow_config is None: + raise ValueError("ArrowLinearVariant.init() did not receive an arrow_config") + + # 1-a) build the ArrowLoRALayer + arrow_layer = ArrowLoraLinearLayer( + in_features=module.in_features, + arrow_config=arrow_config, + ).to(module.weight.device) + + # 1-b) register a container if it doesn’t exist yet + if not hasattr(module, "lora_arrow"): + module.lora_arrow = nn.ModuleDict() + + module.lora_arrow[adapter_name] = arrow_layer + + @staticmethod + def forward( + module: Linear, + *, + active_adapter: str, + x: torch.Tensor, + result: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + """ + Parameters mirror those in PEFT’s `LoraVariant.forward`. Called every time the host Linear does a fwd pass. + + build_prototypes() and gen_know_sub() should run only once before routing. Both are implemented in + ArrowLoraLinearLayer (see ./arrow.py). They are lazily invoked in the forward pass below. Attributes of + ArrowLoraLinearLayer() class ensure they execute only a single time. 
+ + Args: + module (Linear): LoRA Layer of the model + active_adapter (str): name of the arrow route, which should be active to perform arrow. + x (torch.Tensor): input to the layer + result (torch.Tensor): output of the base layer. + + Return value: + output of the base model + delta weight computed by arrow layer. + """ + arrow = module.lora_arrow[active_adapter] # ArrowLoraLinearLayer + # Apply GenKnowSub the 1st time if applicable. By calling arrow/on_adapter_change(), + # gen_know_sub() is redone for newly added adapters after arrow.create_arrow_model(). + arrow.gen_know_sub(module.lora_A, module.lora_B) + # lazily build prototypes the 1st time after GenKnowSub. By calling arrow/on_adapter_change(), + # build_prototypes() is redone for newly added adapters after arrow.create_arrow_model(). + arrow.build_prototypes(module.lora_A, module.lora_B) + + # A forward path of ArrowLoraLinearLayer is called so routing performs. + # Accept and ignore extra variant kwargs (e.g., 'alora_offsets') for compatibility + delta = arrow( + x, + lora_A=module.lora_A, + lora_B=module.lora_B, + dropout=module.lora_dropout[active_adapter], + scaling=module.scaling, + ) + return result + delta + + """ + Since Arrow is a Mixture-of-Experts (MoE) approach, merging adapters is not meaningful or even possible: for each + token, the top-k LoRA experts are dynamically selected and routed. Because of this per-token routing, there is no + single set of weights that can represent a merged adapter. + """ + + @staticmethod + def merge_safe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + raise RuntimeError("Cannot merge an active Arrow router adapter. Remove it first.") + + @staticmethod + def merge_unsafe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> None: + raise RuntimeError("Cannot merge an active Arrow router adapter. 
Remove it first.") + + @staticmethod + def unmerge(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + raise RuntimeError("Cannot unmerge an active Arrow router adapter. Remove it first.") + + +class DoraLinearVariant(LoraVariant): + @staticmethod + def init(module: Linear, adapter_name: str, **kwargs: Any) -> None: + if not module.lora_magnitude_vector: + # first dora layer being added, add lora_magnitude_vector to the list of learnable parameters + module.adapter_layer_names = module.adapter_layer_names[:] + ("lora_magnitude_vector",) + + dora_layer = DoraLinearLayer(fan_in_fan_out=getattr(module, "fan_in_fan_out", False)) + lora_A = module.lora_A[adapter_name].weight + lora_B = module.lora_B[adapter_name].weight + place_on_cpu = module.ephemeral_gpu_offload and (lora_A.device.type == "cpu" or lora_B.device.type == "cpu") + if module.ephemeral_gpu_offload: + if lora_A.device.type in ["cuda", "xpu"]: + lora_B = lora_B.to(lora_A.device) + else: + if lora_B.device.type not in ["cuda", "xpu"]: + if is_xpu_available(): + lora_B = lora_B.to("xpu") + else: + lora_B = lora_B.to("cuda") + lora_A = lora_A.to(lora_B.device) + scaling = module.scaling[adapter_name] + dora_layer.update_layer( + base_layer=module.get_base_layer(), + lora_A=lora_A, + lora_B=lora_B, + scaling=scaling, + place_on_cpu=place_on_cpu, + ) + module.lora_magnitude_vector[adapter_name] = dora_layer + + @staticmethod + def merge_safe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + orig_dtype = orig_weight.dtype + delta_weight = module.get_delta_weight(active_adapter) + + # since delta_weight already includes scaling, set it to 1 here + weight_norm = ( + module.lora_magnitude_vector[active_adapter] + .get_weight_norm(orig_weight, transpose(delta_weight, module.fan_in_fan_out), scaling=1) + .detach() + ) + # We need to cache weight_norm because it has to be based on the original weights. 
We + # cannot calculate it on the fly based on the merged weights when unmerging because its a + # different value + module._cache_store(f"{active_adapter}-weight_norm", weight_norm) + dora_factor = module.lora_magnitude_vector[active_adapter].weight / weight_norm + dora_factor = transpose(dora_factor.view(-1, 1), module.fan_in_fan_out) + new_weight = dora_factor * (orig_weight + delta_weight) + new_weight = new_weight.to(orig_dtype) + return new_weight + + @staticmethod + def merge_unsafe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> None: + orig_dtype = orig_weight.dtype + delta_weight = module.get_delta_weight(active_adapter) + weight_norm = ( + module.lora_magnitude_vector[active_adapter] + .get_weight_norm(orig_weight, transpose(delta_weight, module.fan_in_fan_out), scaling=1) + .detach() + ) + # We need to cache weight_norm because it has to be based on the original weights. We + # cannot calculate it on the fly based on the merged weights when unmerging because its a + # different value + module._cache_store(f"{active_adapter}-weight_norm", weight_norm) + dora_factor = module.lora_magnitude_vector[active_adapter].weight / weight_norm + dora_factor = transpose(dora_factor.view(-1, 1), module.fan_in_fan_out) + new_weight = dora_factor * (orig_weight.data + delta_weight) + new_weight = new_weight.to(orig_dtype) + orig_weight.data = new_weight + + @staticmethod + def unmerge(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + orig_dtype = orig_weight.dtype + delta_weight = module.get_delta_weight(active_adapter) + weight_norm = module._cache_pop(f"{active_adapter}-weight_norm") + dora_factor = module.lora_magnitude_vector[active_adapter].weight / weight_norm + new_weight = orig_weight.data / dora_factor.view(-1, 1) - delta_weight + new_weight = new_weight.to(orig_dtype) + return new_weight + + @staticmethod + def forward( + module: Linear, + active_adapter: str, + x: torch.Tensor, + result: torch.Tensor, + 
**kwargs, + ) -> torch.Tensor: + lora_A = module.lora_A[active_adapter] + lora_B = module.lora_B[active_adapter] + dropout = module.lora_dropout[active_adapter] + scaling = module.scaling[active_adapter] + + if isinstance(dropout, nn.Identity) or not module.training: + base_result = result + else: + x = dropout(x) + base_result = None + + result = result + module.lora_magnitude_vector[active_adapter]( + x, + lora_A=lora_A, + lora_B=lora_B, + scaling=scaling, + base_layer=module.get_base_layer(), + base_result=base_result, + ) + return result + + +class DoraEmbeddingVariant(DoraLinearVariant): + @staticmethod + def init(module: Embedding, adapter_name: str, **kwargs: Any) -> None: + if module.lora_magnitude_vector is None: + # first dora layer being added, add lora_magnitude_vector to the list of learnable parameters + module.adapter_layer_names = module.adapter_layer_names[:] + ("lora_magnitude_vector",) + + dora_layer = DoraEmbeddingLayer(fan_in_fan_out=True) + lora_embedding_A = module.lora_embedding_A[adapter_name] + lora_embedding_B = module.lora_embedding_B[adapter_name] + scaling = module.scaling[adapter_name] + dora_layer.update_layer( + base_layer=module.get_base_layer(), lora_A=lora_embedding_A, lora_B=lora_embedding_B, scaling=scaling + ) + module.lora_magnitude_vector[adapter_name] = dora_layer + + @staticmethod + def merge_safe(module: Embedding, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + orig_dtype = orig_weight.dtype + delta_weight = module.get_delta_weight(active_adapter) + + # since delta_weight already includes scaling, set it to 1 here + weight_norm = ( + module.lora_magnitude_vector[active_adapter] + .get_weight_norm(orig_weight, delta_weight.T, scaling=1) + .detach() + ) + # We need to cache weight_norm because it has to be based on the original weights. 
We + # cannot calculate it on the fly based on the merged weights when unmerging because its a + # different value + module._cache_store(f"{active_adapter}-weight_norm", weight_norm) + dora_factor = module.lora_magnitude_vector[active_adapter].weight / weight_norm + dora_factor = dora_factor.view(1, -1) + new_weight = dora_factor * (orig_weight + delta_weight) + new_weight = new_weight.to(orig_dtype) + return new_weight + + @staticmethod + def merge_unsafe(module: Embedding, active_adapter: str, orig_weight: torch.Tensor) -> None: + orig_dtype = orig_weight.dtype + delta_weight = module.get_delta_weight(active_adapter) + weight_norm = ( + module.lora_magnitude_vector[active_adapter] + .get_weight_norm(orig_weight, delta_weight.T, scaling=1) + .detach() + ) + # We need to cache weight_norm because it has to be based on the original weights. We + # cannot calculate it on the fly based on the merged weights when unmerging because its a + # different value + module._cache_store(f"{active_adapter}-weight_norm", weight_norm) + dora_factor = module.lora_magnitude_vector[active_adapter].weight / weight_norm + dora_factor = dora_factor.view(1, -1) + new_weight = dora_factor * (orig_weight.data + delta_weight) + new_weight = new_weight.to(orig_dtype) + orig_weight.data = new_weight + + @staticmethod + def unmerge(module: Embedding, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + orig_dtype = orig_weight.dtype + delta_weight = module.get_delta_weight(active_adapter) + weight_norm = module._cache_pop(f"{active_adapter}-weight_norm") + dora_factor = module.lora_magnitude_vector[active_adapter].weight / weight_norm + new_weight = orig_weight.data / dora_factor.view(1, -1) - delta_weight + new_weight = new_weight.to(orig_dtype) + return new_weight + + @staticmethod + def forward( + module: Embedding, + active_adapter: str, + x: torch.Tensor, + result: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + embedding_A = module.lora_embedding_A[active_adapter].T + 
embedding_B = module.lora_embedding_B[active_adapter].T + scaling = module.scaling[active_adapter] + + mag_norm_scale, dora_result = module.lora_magnitude_vector[active_adapter]( + x, + lora_A=embedding_A, + lora_B=embedding_B, + scaling=scaling, + base_layer=module.get_base_layer(), + embed_fn=module._embed, + ) + result = mag_norm_scale * result + dora_result + return result + + +class _DoraConvNdVariant(LoraVariant): + @staticmethod + def init_convd_variant(module: _ConvNd, adapter_name: str, dora_layer: nn.Module) -> None: + if module.lora_magnitude_vector is None: + # first dora layer being added, add lora_magnitude_vector to the list of learnable parameters + module.adapter_layer_names = module.adapter_layer_names[:] + ("lora_magnitude_vector",) + + lora_A = module.lora_A[adapter_name].weight + lora_B = module.lora_B[adapter_name].weight + scaling = module.scaling[adapter_name] + dora_layer.update_layer(base_layer=module.get_base_layer(), lora_A=lora_A, lora_B=lora_B, scaling=scaling) + module.lora_magnitude_vector[adapter_name] = dora_layer + + @staticmethod + def merge_safe(module: _ConvNd, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + orig_dtype = orig_weight.dtype + delta_weight = module.get_delta_weight(active_adapter) + + # since delta_weight already includes scaling, set it to 1 here + weight_norm = ( + module.lora_magnitude_vector[active_adapter].get_weight_norm(orig_weight, delta_weight, scaling=1).detach() + ) + # We need to cache weight_norm because it has to be based on the original weights. 
We + # cannot calculate it on the fly based on the merged weights when unmerging because its a + # different value + module._cache_store(f"{active_adapter}-weight_norm", weight_norm) + dora_factor = module.lora_magnitude_vector[active_adapter].weight / weight_norm + new_weight = dora_factor.view(*module._get_dora_factor_view()) * (orig_weight + delta_weight) + new_weight = new_weight.to(orig_dtype) + return new_weight + + @staticmethod + def merge_unsafe(module: _ConvNd, active_adapter: str, orig_weight: torch.Tensor) -> None: + orig_dtype = orig_weight.dtype + delta_weight = module.get_delta_weight(active_adapter) + # since delta_weight already includes scaling, set it to 1 here + weight_norm = ( + module.lora_magnitude_vector[active_adapter].get_weight_norm(orig_weight, delta_weight, scaling=1).detach() + ) + # We need to cache weight_norm because it has to be based on the original weights. We + # cannot calculate it on the fly based on the merged weights when unmerging because its a + # different value + module._cache_store(f"{active_adapter}-weight_norm", weight_norm) + dora_factor = module.lora_magnitude_vector[active_adapter].weight / weight_norm + new_weight = dora_factor.view(*module._get_dora_factor_view()) * (orig_weight.data + delta_weight) + new_weight = new_weight.to(orig_dtype) + orig_weight.data = new_weight + + @staticmethod + def unmerge(module: _ConvNd, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + orig_dtype = orig_weight.dtype + delta_weight = module.get_delta_weight(active_adapter) + weight_norm = module._cache_pop(f"{active_adapter}-weight_norm") + dora_factor = module.lora_magnitude_vector[active_adapter].weight / weight_norm + new_weight = orig_weight.data / dora_factor.view(*module._get_dora_factor_view()) - delta_weight + new_weight = new_weight.to(orig_dtype) + return new_weight + + @staticmethod + def forward( + module: _ConvNd, + active_adapter: str, + x: torch.Tensor, + result: torch.Tensor, + **kwargs, + ) -> 
torch.Tensor: + lora_A = module.lora_A[active_adapter] + lora_B = module.lora_B[active_adapter] + dropout = module.lora_dropout[active_adapter] + scaling = module.scaling[active_adapter] + + if isinstance(dropout, nn.Identity) or not module.training: + base_result = result + else: + x = dropout(x) + base_result = None + + result = result + module.lora_magnitude_vector[active_adapter]( + x, + lora_A=lora_A, + lora_B=lora_B, + scaling=scaling, + base_layer=module.get_base_layer(), + base_result=base_result, + ) + return result + + +class DoraConv1dVariant(_DoraConvNdVariant): + @staticmethod + def init(module: Conv1d, adapter_name: str, **kwargs: Any) -> None: + dora_layer = DoraConv1dLayer(fan_in_fan_out=False) + _DoraConvNdVariant.init_convd_variant(module, adapter_name, dora_layer=dora_layer) + + +class DoraConv2dVariant(_DoraConvNdVariant): + @staticmethod + def init(module: Conv2d, adapter_name: str, **kwargs: Any) -> None: + dora_layer = DoraConv2dLayer(fan_in_fan_out=False) + _DoraConvNdVariant.init_convd_variant(module, adapter_name, dora_layer=dora_layer) + + +class DoraConv3dVariant(_DoraConvNdVariant): + @staticmethod + def init(module: Conv3d, adapter_name: str, **kwargs: Any) -> None: + dora_layer = DoraConv3dLayer(fan_in_fan_out=False) + _DoraConvNdVariant.init_convd_variant(module, adapter_name, dora_layer=dora_layer) + + +class QALoraLinearVariant(LoraVariant): + @staticmethod + def init(module: Linear, adapter_name: str, **kwargs: Any) -> None: + """ + Initializes QALoRA specific parameters for a given adapter. + + Args: + module (Linear): The linear module to be adapted. + adapter_name (str): The name of the adapter. + **kwargs: Additional keyword arguments. + qalora_group_size (int): The size of groups for pooling. This is expected to be passed. + """ + if "qalora_group_size" not in kwargs: + raise ValueError( + "`use_qalora=True` requires 'qalora_group_size' to be provided in kwargs." + " Please ensure it is passed from the LoraConfig." 
+ ) + + if module.in_features is not None and module.in_features % kwargs["qalora_group_size"] != 0: + raise ValueError( + f"`use_qalora=True` requires `module.in_features` ({module.in_features}) to be " + f"divisible by 'qalora_group_size' ({kwargs['qalora_group_size']})" + ) + qalora_group_size = kwargs["qalora_group_size"] + + if "qalora_group_size" not in module.other_param_names: + module.other_param_names = module.other_param_names + ("qalora_group_size",) + + if not hasattr(module, "qalora_group_size"): + module.qalora_group_size = {} + module.qalora_group_size[adapter_name] = qalora_group_size + + old_lora_A_layer = module.lora_A[adapter_name] + r = old_lora_A_layer.out_features + device = old_lora_A_layer.weight.device + dtype = old_lora_A_layer.weight.dtype + + new_lora_A_layer = nn.Linear( + old_lora_A_layer.in_features // module.qalora_group_size[adapter_name], + r, + bias=False, + device=device, + dtype=dtype, + ) + module.lora_A[adapter_name] = new_lora_A_layer + + @staticmethod + def get_delta_weight(module: Linear, active_adapter: str) -> torch.Tensor: + raise NotImplementedError("QALoRA for GPTQ layers does not support 'get_delta_weight'.") + + @staticmethod + def merge_safe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + raise NotImplementedError("QALoRA for GPTQ layers does not support 'safe_merge'.") + + @staticmethod + def merge_unsafe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> None: + raise NotImplementedError("QALoRA for GPTQ layers does not support 'merge_unsafe'.") + + @staticmethod + def unmerge(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + raise NotImplementedError("QALoRA for GPTQ layers does not support 'unmerge'.") + + @staticmethod + def forward( + module: Linear, + active_adapter: str, + x: torch.Tensor, + result: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + lora_A_weight = module.lora_A[active_adapter].weight + lora_B_weight = 
module.lora_B[active_adapter].weight + dropout = module.lora_dropout[active_adapter] + scaling = module.scaling[active_adapter] + group_size = module.qalora_group_size[active_adapter] + + x_dropped = dropout(x) if module.training and not isinstance(dropout, nn.Identity) else x + orig_shape = x_dropped.shape + + # Reshape to 2D + if len(orig_shape) > 2: + x_flat = x_dropped.view(-1, module.in_features) + else: + x_flat = x_dropped + + batch_size, in_features = x_flat.shape + pooled_features = in_features // group_size + + x_pooled = x_flat.view(batch_size, pooled_features, group_size).mean(dim=2) + + x_pooled_scaled = x_pooled * pooled_features + + # LoRA computation + delta = x_pooled_scaled @ lora_A_weight.t() @ lora_B_weight.t() * scaling + + # Reshape back + if len(orig_shape) > 2: + delta = delta.view(orig_shape[:-1] + (delta.size(-1),)) + + return result + delta + + +class ALoraLinearVariant(LoraVariant): + @staticmethod + def init(module: Linear, adapter_name: str, **kwargs: Any) -> None: + pass + + @staticmethod + def merge_safe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + raise NotImplementedError("aLoRA does not support safe merging.") + + @staticmethod + def merge_unsafe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> None: + raise NotImplementedError("aLoRA does not support merging.") + + @staticmethod + def unmerge(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + raise NotImplementedError("aLoRA does not support unmerging.") + + @staticmethod + def forward( + module: Linear, + active_adapter: str, + x: torch.Tensor, + result: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + alora_offsets = kwargs.get("alora_offsets", None) + lora_A = module.lora_A[active_adapter] + lora_B = module.lora_B[active_adapter] + dropout = module.lora_dropout[active_adapter] + scaling = module.scaling[active_adapter] + x = x.to(lora_A.weight.dtype) + result_shape = result.shape + B = 
result_shape[0] # batch + if len(result_shape) == 3: + T = result_shape[1] # tokens + else: + T = 1 + D = result_shape[-1] # dimensions + Dx = x.shape[-1] + device = result.device + if alora_offsets is None: # use base model only, but ensure 0 gradient + mask = torch.zeros((B, T), dtype=torch.bool) + else: + # If alora_offsets[i] is None, this means that the invocation sequence was not found in the + # input. As a result, the weights should not be activated anywhere (equivalent to base model). + # Convert None -> 0 and clip to T + offsets = torch.tensor( + [0 if o is None else min(int(o), T) for o in alora_offsets], + device=device, + dtype=torch.long, + ) + # Mask True on the last `offsets[i]` positions for each row i + pos = torch.arange(T, device=device).unsqueeze(0) # [1, T] + mask = pos >= (T - offsets).unsqueeze(1) + + # Flatten for vectorization + x_flat = x.view(-1, Dx) + res_flat = result.view(-1, D) + mask_flat = mask.view(-1) + + # Compute adapter on the selected tokens only + res_flat[mask_flat] += lora_B(lora_A(dropout(x_flat[mask_flat]))) * scaling + return result + + +def calculate_alora_offsets( + peft_config: PeftConfig, active_adapter: str, input_ids: torch.Tensor, adapter_names: Optional[list[str]] = None +) -> list[int]: + """ + This is a helper function for Activated LoRA (aLoRA) that searches each input token sequence for the last occurrence + of the appropriate "alora_invocation_tokens" invocation sequence. The calculated alora_offset is the location of + the *start* of the invocation tokens, counting backward from the end (will therefore always be >= + len(alora_invocation_tokens)). If adapter_names is passed, then each input uses the appropriate invocation sequence + for the specified adapter for that row. Logic is provided to handle mixed collections of adapters for which not all + are aLoRAs (e.g. some base model, some LoRA). 
+ """ + if input_ids is None: + return [] + + batch_size = input_ids.shape[0] + alora_offsets = [None] * batch_size + + cached_invocation_tensors = {} + adapters_to_process_indices = collections.defaultdict(list) + + for i in range(batch_size): + current_adapter_name = adapter_names[i] if adapter_names and i < len(adapter_names) else active_adapter + + if current_adapter_name == "__base__": + alora_offsets[i] = None + continue + + if current_adapter_name not in peft_config: + warnings.warn(f"Adapter '{current_adapter_name}' not found in peft_config. Using base model for row {i}.") + alora_offsets[i] = None + continue + + current_peft_config = peft_config[current_adapter_name] + + invocation_tokens = getattr(current_peft_config, "alora_invocation_tokens", None) + if invocation_tokens is None: + alora_offsets[i] = None # Not an aLoRA adapter or wrong type + continue + + if current_adapter_name not in cached_invocation_tensors: + cached_invocation_tensors[current_adapter_name] = torch.tensor( + invocation_tokens, dtype=torch.long, device=input_ids.device + ) + + adapters_to_process_indices[current_adapter_name].append(i) + + for adapter_name_to_process, indices in adapters_to_process_indices.items(): + current_invocation_ids_tensor = cached_invocation_tensors[adapter_name_to_process] + invocation_len = len(current_invocation_ids_tensor) + + for i in indices: + sequence = input_ids[i] + seq_len = len(sequence) + best_match_start_idx = -1 + + possible_starts = (sequence == current_invocation_ids_tensor[0]).nonzero(as_tuple=True)[0] + + for start_idx_tensor in possible_starts: + idx = start_idx_tensor.item() + if idx + invocation_len <= seq_len: + if torch.equal(sequence[idx : idx + invocation_len], current_invocation_ids_tensor): + if idx > best_match_start_idx: + best_match_start_idx = idx + + if best_match_start_idx != -1: + offset_val = seq_len - best_match_start_idx + alora_offsets[i] = offset_val if offset_val > 0 else None + else: # Invocation sequence not found 
in input + alora_offsets[i] = None + return alora_offsets + + +def is_alora_relevant_in_batch(model: nn.Module, adapter_names: Optional[list[str]] = None): + """ + Helper function to determine if the current batch has any aLoRA adapters. + """ + is_alora_relevant = False + if getattr(model.active_peft_config, "alora_invocation_tokens", None): + is_alora_relevant = True + elif adapter_names: + for name in adapter_names: + if name == "__base__": + continue + config_ = model.peft_config.get(name) + if config_ and getattr(config_, "alora_invocation_tokens", None): + is_alora_relevant = True + break + + return is_alora_relevant + + +def get_alora_offsets_for_forward( + model: nn.Module, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, **kwargs +): + """ + Wrapper around calculate_alora_offsets, for the .forward of the model. It only calculates alora_offsets if the + batch contains aLoRA adapters. + """ + adapter_names_for_offset_calc = kwargs.get("adapter_names", None) + if not is_alora_relevant_in_batch(model, adapter_names_for_offset_calc): + # Nothing to compute + return kwargs + alora_offsets = kwargs.get("alora_offsets") + if alora_offsets is None: + if input_ids is None and inputs_embeds is not None: + warnings.warn( + "Cannot calculate aLoRA offsets when only inputs_embeds are provided. Disabling aLoRA for this forward pass." + ) + kwargs["alora_offsets"] = None + elif input_ids is not None: + kwargs["alora_offsets"] = calculate_alora_offsets( + model.peft_config, + model.active_adapter, + input_ids, + adapter_names=adapter_names_for_offset_calc, + ) + else: + kwargs["alora_offsets"] = None + return kwargs + + +def get_alora_offsets_for_generate(model: nn.Module, *args, **kwargs): + """ + Wrapper around calculate_alora_offsets, for the .generate of the model. It only calculates alora_offsets if the + batch contains aLoRA adapters. 
+ """ + adapter_names_for_offset_calc = kwargs.get("adapter_names") + if not is_alora_relevant_in_batch(model, adapter_names_for_offset_calc): + # Nothing to compute + return kwargs + alora_offsets_from_kwargs = kwargs.get("alora_offsets") + if alora_offsets_from_kwargs is None: + current_input_ids = kwargs.get("input_ids") + if current_input_ids is None: # args[0] is usually input_ids + if args and isinstance(args[0], torch.Tensor): + current_input_ids = args[0] + else: + current_input_ids = None + + if current_input_ids is not None: + if current_input_ids.ndim == 1: + current_input_ids = current_input_ids.unsqueeze(0) + calculated_offsets = calculate_alora_offsets( + model.peft_config, + model.active_adapter, + current_input_ids, + adapter_names=adapter_names_for_offset_calc, + ) + kwargs["alora_offsets"] = calculated_offsets + + else: + warnings.warn( + "Cannot calculate aLoRA offsets during generate as input_ids are not available. Disabling aLoRA." + ) + + kwargs["alora_offsets"] = None + return kwargs diff --git a/peft/src/peft/tuners/lycoris_utils.py b/peft/src/peft/tuners/lycoris_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7ea7a260569cb1f16ed868b55b529e02d4a0c11a --- /dev/null +++ b/peft/src/peft/tuners/lycoris_utils.py @@ -0,0 +1,263 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import warnings +from abc import abstractmethod +from dataclasses import dataclass, field +from typing import Any, Optional, Union + +import torch +import torch.nn as nn + +from peft.config import PeftConfig + +from .tuners_utils import ( + BaseTuner, + BaseTunerLayer, + _get_in_out_features, + check_adapters_to_merge, +) + + +@dataclass +class LycorisConfig(PeftConfig): + r""" + A base config for LyCORIS like adapters + """ + + rank_pattern: Optional[dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. " + "For example, `{'^model.decoder.layers.0.encoder_attn.k_proj': 16}`." + ) + }, + ) + alpha_pattern: Optional[dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `alpha`. " + "For example, `{'^model.decoder.layers.0.encoder_attn.k_proj': 16}`." + ) + }, + ) + + +class LycorisLayer(BaseTunerLayer): + r""" + A base layer for LyCORIS like adapters + """ + + # adapter_layer_names needs to be defined on the child class + other_param_names = ("r", "alpha", "scaling", "rank_dropout", "module_dropout") + + def __init__(self, base_layer: nn.Module) -> None: + self.base_layer = base_layer + self.r = {} + self.alpha = {} + self.scaling = {} + self.rank_dropout = {} + self.rank_dropout_scale = {} + self.module_dropout = {} + + # Tuner info + self._disable_adapters = False + self.merged_adapters = [] + # flag to enable/disable casting of input to weight dtype during forward call + self.cast_input_dtype_enabled = True + + in_features, out_features = _get_in_out_features(self.get_base_layer()) + self.in_features = in_features + self.out_features = out_features + + @property + @abstractmethod + def _available_adapters(self) -> set[str]: ... 
+ + def _init_empty_weights(self, cls, *args, **kwargs) -> None: + # A helper method that allows to initialize the layer of the given class without spending time to initialize the + # model weights. The implementation is inspired by + # https://pytorch.org/docs/stable/generated/torch.nn.utils.skip_init.html but this function cannot be used + # directly. + # Instead of this approach, it would be possible to bypass the __init__ of the class but that runs the risk of + # omitting important logic inside that __init__. + kwargs = kwargs.copy() + final_device = kwargs.pop("device", "cpu") + cls.__init__(self, *args, device="meta", **kwargs) + self.to_empty(device=final_device) + + @abstractmethod + def create_adapter_parameters(self, adapter_name: str, r: int, **kwargs): ... + + # TODO: refactor LoRA to use the same approach + @abstractmethod + def _get_delta_activations(self, adapter_name: str, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + """Activations added on top of the base layer output (i.e. after the base layer forward pass)""" + + @abstractmethod + def get_delta_weight(self, adapter_name: str) -> torch.Tensor: ... + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If `None`, all active adapters will be merged. + Defaults to `None`. 
+ """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self._available_adapters: + base_layer = self.get_base_layer() + if safe_merge: + orig_weights = base_layer.weight.data.clone() + orig_weights += self.get_delta_weight(active_adapter) + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weights + else: + base_layer.weight.data += self.get_delta_weight(active_adapter) + self.merged_adapters.append(active_adapter) + + @abstractmethod + def reset_adapter_parameters(self, adapter_name: str): ... + + def set_scale(self, adapter, scale): + if adapter not in self._available_adapters: + # Ignore the case where the adapter is not in the layer + return + self.scaling[adapter] = scale * self.alpha[adapter] / self.r[adapter] + + def scale_layer(self, scale: float) -> None: + if scale == 1: + return + + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + self.scaling[active_adapter] *= scale + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. 
Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self._available_adapters: + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + + def unscale_layer(self, scale=None) -> None: + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + if scale is None: + self.scaling[active_adapter] = self.alpha[active_adapter] / self.r[active_adapter] + else: + self.scaling[active_adapter] /= scale + + @abstractmethod + def update_layer(self, adapter_name: str, r: int, alpha: float, **kwargs): ... + + +class LycorisTuner(BaseTuner): + r""" + A base tuner for LyCORIS like adapters + + Args: + model ([`torch.nn.Module`]): The model to be adapted. + config ([`LycorisConfig`]): The configuration of the LyCORIS adapter. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the loading process. + + """ + + prefix: str + tuner_layer_cls = LycorisLayer + layers_mapping: dict[type[torch.nn.Module], type[LycorisLayer]] + + @abstractmethod + def _create_and_replace( + self, + config: LycorisConfig, + adapter_name: str, + target: Union[LycorisLayer, nn.Module], + target_name, + parent, + current_key, + ): ... 
+ + @classmethod + def _create_new_module(cls, config: LycorisConfig, adapter_name: str, target: nn.Module, **kwargs) -> LycorisLayer: + # Find corresponding subtype of provided target module + new_module_cls = None + for subtype, target_cls in cls.layers_mapping.items(): + if ( + hasattr(target, "base_layer") + and isinstance(target.get_base_layer(), subtype) + and isinstance(target, BaseTunerLayer) + ): + # nested tuner layers are allowed + new_module_cls = target_cls + break + elif isinstance(target, subtype): + new_module_cls = target_cls + break + + # We didn't find corresponding type, so adapter for this layer is not supported + if new_module_cls is None: + supported_modules = ", ".join(layer.__name__ for layer in cls.layers_mapping.keys()) + raise ValueError( + f"Target module of type {type(target)} not supported, " + f"currently only adapters for {supported_modules} are supported" + ) + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if isinstance(target_base_layer, (torch.nn.Conv2d, torch.nn.Conv1d)): + new_module = new_module_cls(target, adapter_name=adapter_name, **kwargs) + elif isinstance(target_base_layer, torch.nn.Linear): + new_module = new_module_cls(target, adapter_name=adapter_name, **kwargs) + else: + supported_modules = ", ".join(layer.__name__ for layer in cls.layers_mapping.keys()) + raise ValueError( + f"Target module of type {type(target)} not supported, " + f"currently only adapters for {supported_modules} are supported" + ) + + return new_module diff --git a/peft/src/peft/tuners/miss/__init__.py b/peft/src/peft/tuners/miss/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ca8aceeb0dc772c6db6157834de40ab2c43fe291 --- /dev/null +++ b/peft/src/peft/tuners/miss/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2025-present the HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from peft.utils import register_peft_method + +from .config import MissConfig +from .layer import MissLayer, MissLinear +from .model import MissModel + + +__all__ = ["MissConfig", "MissLayer", "MissLinear", "MissModel"] + +register_peft_method(name="miss", config_cls=MissConfig, model_cls=MissModel) diff --git a/peft/src/peft/tuners/miss/config.py b/peft/src/peft/tuners/miss/config.py new file mode 100644 index 0000000000000000000000000000000000000000..a0cef7ab34b3b35a4806c58a2e74acfbd3980b36 --- /dev/null +++ b/peft/src/peft/tuners/miss/config.py @@ -0,0 +1,140 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@dataclass
class MissConfig(PeftConfig):
    """
    This is the configuration class to store the configuration of a [`MissModel`].

    Args:
        r (`int`):
            The rank of MiSS across different layers. It is best to set 'r' to an even number; otherwise, the default
            initialization method will not work. The rank of MiSS corresponds to a low-rank decomposition along the
            in_features dimension.
        miss_dropout (`float`):
            The dropout probability for MiSS layers.
        mini_r (`int`):
            The rank of MiSS corresponds to a low-rank decomposition along the out_features dimension. When you set
            `init_weights=mini`, you need to set `mini_r`. Please make sure that `out_features` is divisible by
            `mini_r`.
        target_modules (`Optional[Union[List[str], str]]`):
            The names of the modules to apply the adapter to. If this is specified, only the modules with the specified
            names will be replaced. When passing a string, a regex match will be performed. When passing a list of
            strings, either an exact match will be performed or it is checked if the name of the module ends with any
            of the passed strings. If this is specified as 'all-linear', then all linear modules are chosen, excluding
            the output layer. If this is not specified, modules will be chosen according to the model architecture. If
            the architecture is not known, an error will be raised -- in this case, you should specify the target
            modules manually.
        exclude_modules (`Optional[Union[List[str], str]]`):
            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
            module ends with any of the passed strings.
        init_weights (bool | Literal["bat", "mini"]):
            Different initializations correspond to different MiSS variants. By default (balance), the most efficient
            and general method in MiSS will be used. 'bat': In this mode, you can enable nonlinear updates across
            different shards. 'mini': In this mode, you can set a smaller rank to use fewer trainable parameters, but
            it is recommended to keep `out_features % mini_r == 0`.
        layers_to_transform (`Union[List[int], int]`):
            The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices
            that are specified in this list. If a single integer is passed, it will apply the transformations on the
            layer at this index.
        layers_pattern (`str`):
            The layer pattern name, used only if `layers_to_transform` is different from `None`.
        bias (`str`):
            Bias type for MiSS. Can be 'none', 'all' or 'MiSS_only'.
        modules_to_save (`List[str]`):
            List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint.
    """

    r: int = field(
        default=64,
        metadata={
            "help": "The rank of MiSS corresponds to a low-rank decomposition along the in_features dimension.",
            "note": "It is best to set 'r' to an even number; otherwise, the default initialization method will not work.",
        },
    )
    miss_dropout: float = field(default=0.0, metadata={"help": "MiSS dropout"})
    mini_r: int = field(
        default=1,
        metadata={
            "help": "The rank of MiSS corresponds to a low-rank decomposition along the out_features dimension.",
            # The layer checks `out_features % mini_r`, i.e. out_features must be a multiple of mini_r.
            "note": "It is recommended that out_features be divisible by mini_r. When mini_r == out_features, the mini method is equivalent to the default efficient MiSS.",
        },
    )
    target_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": "List of module names or regex expression of the module names to replace with MiSS.",
            "example": "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' ",
        },
    )
    exclude_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={"help": "List of module names or regex expression of the module names to exclude from MiSS."},
    )
    init_weights: bool | Literal["bat", "mini"] = field(
        default=True,
        metadata={
            "help": (
                # Adjacent literals are concatenated verbatim, so each piece carries its own separator.
                "True -> MiSS balance; `bat` -> Bat; `mini` -> smaller rank and efficiency. "
                "Whether to initialize the weights of the MiSS layers with their default initialization. Don't change "
                "this setting, except if you know exactly what you're doing."
            ),
        },
    )
    layers_to_transform: Optional[Union[list[int], int]] = field(
        default=None,
        metadata={
            "help": "The layer indexes to transform, if this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index."
        },
    )
    layers_pattern: Optional[str] = field(
        default=None,
        metadata={
            "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern."
        },
    )
    bias: str = field(default="none", metadata={"help": "Bias type for MiSS. Can be 'none', 'all' or 'MiSS_only'"})
    modules_to_save: Optional[list[str]] = field(
        default=None,
        metadata={
            "help": "List of modules apart from MiSS layers to be set as trainable and saved in the final checkpoint. "
            "For example, in Sequence Classification or Token Classification tasks, "
            "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
        },
    )

    def __post_init__(self):
        """Set the PEFT type, normalise list-valued module selectors to sets and validate option combinations."""
        super().__post_init__()
        self.peft_type = PeftType.MISS
        self.target_modules = (
            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
        )
        self.exclude_modules = (
            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
        )
        # if target_modules is a regex expression, then layers_to_transform should be None
        if isinstance(self.target_modules, str) and self.layers_to_transform is not None:
            raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.")

        # if target_modules is a regex expression, then layers_pattern should be None
        if isinstance(self.target_modules, str) and self.layers_pattern is not None:
            raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.")
class MissLayer(BaseTunerLayer):
    """Mixin holding the per-adapter MiSS state (rank, mini rank, dropout and the trainable `miss_block`)."""

    # All names of layers that may contain (trainable) adapter weights
    adapter_layer_names = ("miss_block",)
    # All names of other parameters that may contain adapter-related parameters
    other_param_names = ("miss_r", "miss_dropout", "miss_mini_r")

    def __init__(self, base_layer: nn.Module, **kwargs) -> None:
        """
        Attach empty MiSS containers to `base_layer`.

        Raises:
            ValueError: If the (unwrapped) base layer is not an `nn.Linear`.
        """
        self.base_layer = base_layer
        self.miss_r = {}
        self.miss_dropout = nn.ModuleDict({})
        self.miss_mini_r = {}
        self.miss_block = nn.ParameterDict({})
        # Mark the weight as unmerged
        self._disable_adapters = False
        self.merged_adapters = []
        # flag to enable/disable casting of input to weight dtype during forward call
        self.cast_input_dtype_enabled = True
        self.kwargs = kwargs

        base_layer = self.get_base_layer()
        if isinstance(base_layer, nn.Linear):
            self.in_features, self.out_features = base_layer.in_features, base_layer.out_features
        else:
            raise ValueError(f"Unsupported layer type {type(base_layer)}")

    def update_layer(
        self,
        adapter_name: str,
        r: int,
        mini_r: int,
        miss_dropout,
        init_weights: bool | str,
        inference_mode: bool = False,
        **kwargs,
    ) -> None:
        """Internal function to create miss adapter

        Args:
            adapter_name (`str`): Name for the adapter to add.
            r (`int`): Rank for the added adapter.
            mini_r (`int`): Size of the second dimension of the block when `init_weights="mini"`.
            miss_dropout (`float`): Dropout probability; values `<= 0` install an `nn.Identity` instead.
            init_weights (`bool | str`):
                Selects the block layout. `"bat"` creates a zero block of shape `(out_features // r, r, r)`, `"mini"`
                a zero block of shape `(r, mini_r)`, any other truthy value a zero block of shape `(r, out_features)`,
                and a falsy value a Kaiming-uniform initialised block of shape `(r, out_features)`.
            inference_mode (`bool`): Forwarded to `set_adapter`.

        Raises:
            ValueError: If `r` is not positive, if the weight cannot be split into `[r, r]` blocks in `"bat"` mode, or
                if `mini_r` is not positive or does not divide `out_features` in `"mini"` mode.
            TypeError: If the base layer is not an `nn.Linear`.
        """
        if r <= 0:
            raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")

        self.miss_r[adapter_name] = r
        self.miss_mini_r[adapter_name] = mini_r
        if miss_dropout > 0.0:
            miss_dropout_layer = nn.Dropout(p=miss_dropout)
        else:
            miss_dropout_layer = nn.Identity()

        self.miss_dropout[adapter_name] = miss_dropout_layer

        # Determine shape of MiSS weights
        base_layer = self.get_base_layer()
        if isinstance(base_layer, nn.Linear):
            self.miss_block[adapter_name] = nn.Parameter(torch.zeros(r, self.out_features), requires_grad=True)
        else:
            raise TypeError(f"MiSS is not implemented for base layers of type {type(base_layer).__name__}")

        # Initialize weights
        if init_weights == "bat":
            if self.in_features % r != 0 or self.out_features % r != 0:
                raise ValueError("The weight matrix must be fully divisible into [r, r] blocks.")
            self.reset_bat_parameters(adapter_name, r)
        elif init_weights == "mini":
            if mini_r <= 0:
                # Guard the modulo below: a zero mini_r would otherwise surface as a bare ZeroDivisionError.
                raise ValueError(f"`mini_r` should be a positive integer value but the value passed is {mini_r}")
            if self.out_features % mini_r != 0:
                raise ValueError(
                    "mini_r is divided along the out_features dimension. For optimal performance and implementation "
                    "simplicity, it is recommended that out_features be divisible by mini_r. "
                    f"Error: {self.out_features} % {mini_r} != 0"
                )
            self.reset_mini_parameters(adapter_name, r, mini_r)
        elif init_weights:
            self.reset_miss_parameters(adapter_name, r)
        else:
            self.reset_miss_parameters_random(adapter_name)
        # Move new weights to device
        self._move_adapter_to_device_of_base_layer(adapter_name)
        self.set_adapter(self.active_adapters, inference_mode=inference_mode)

    def reset_miss_parameters(self, adapter_name: str, r):
        """Replace the adapter block with zeros of shape `(r, out_features)`."""
        self.miss_block[adapter_name] = nn.Parameter(torch.zeros(r, self.out_features), requires_grad=True)

    def reset_bat_parameters(self, adapter_name: str, r):
        """Replace the adapter block with zeros of shape `(out_features // r, r, r)`."""
        self.miss_block[adapter_name] = nn.Parameter(torch.zeros(self.out_features // r, r, r), requires_grad=True)

    def reset_mini_parameters(self, adapter_name: str, r, mini_r):
        """Replace the adapter block with zeros of shape `(r, mini_r)`."""
        self.miss_block[adapter_name] = nn.Parameter(torch.zeros(r, mini_r), requires_grad=True)

    def reset_miss_parameters_random(self, adapter_name: str):
        """Initialise the existing adapter block in place with Kaiming-uniform values."""
        nn.init.kaiming_uniform_(self.miss_block[adapter_name], a=math.sqrt(5))

    def scale_layer(self, scale: float) -> None:
        """Warn that scaling is unsupported for every active MiSS adapter; no weights are changed."""
        if scale == 1:
            return

        for active_adapter in self.active_adapters:
            if active_adapter not in self.miss_block.keys():
                continue

            warnings.warn("Scaling operation for MiSS not supported! Automatically set scale to 1.")

    def unscale_layer(self, scale=None) -> None:
        """Warn that unscaling is unsupported for every active MiSS adapter; no weights are changed."""
        for active_adapter in self.active_adapters:
            if active_adapter not in self.miss_block.keys():
                continue

            warnings.warn("Unscaling operation for MiSS not supported! Keeping scale at 1.")
class MissLinear(nn.Module, MissLayer):
    """
    MiSS implemented in a dense layer.
    """

    def __init__(
        self,
        base_layer,
        adapter_name: str,
        r: int = 0,
        mini_r: int = 0,
        miss_dropout: float = 0.0,
        init_weights: Union[bool, str] = True,
        **kwargs,
    ) -> None:
        """Wrap `base_layer` and register the first adapter; `init_weights` also selects the MiSS variant."""
        super().__init__()
        MissLayer.__init__(self, base_layer, **kwargs)
        self._active_adapter = adapter_name
        self.update_layer(adapter_name, r, mini_r, miss_dropout, init_weights, **kwargs)
        # Variant switch used by merge/unmerge/forward: "bat", "mini", or anything else for the default path.
        self.miss_fn = init_weights

    def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
        """
        Merge the active adapter weights into the base weights

        Args:
            safe_merge (`bool`, *optional*):
                If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs
                before merging the weights. This is useful if you want to check if the merge operation will produce
                NaNs. Defaults to `False`.
            adapter_names (`List[str]`, *optional*):
                The list of adapter names that should be merged. If `None`, all active adapters will be merged.
                Defaults to `None`.

        Raises:
            ValueError: With `safe_merge=True`, if the merged weights contain non-finite values.
        """
        adapter_names = check_adapters_to_merge(self, adapter_names)
        if not adapter_names:
            # no adapter to merge
            return

        for active_adapter in adapter_names:
            if active_adapter in self.miss_block.keys():
                base_layer = self.get_base_layer()
                orig_dtype = base_layer.weight.dtype
                if safe_merge:
                    # Note that safe_merge will be slower than the normal merge
                    # because of the copy operation.
                    orig_weight = base_layer.weight.data.clone()
                    if self.miss_fn == "bat":
                        delta_weight = self.get_delta_weight(active_adapter, orig_weight)
                        orig_weight += delta_weight
                    else:
                        # get_delta_weight_miss may write into its input when in_features % r != 0, so it must
                        # receive the clone: the live base weight stays untouched until the finite check passed.
                        orig_weight = self.get_delta_weight_miss(active_adapter, orig_weight)

                    if not torch.isfinite(orig_weight).all():
                        raise ValueError(
                            f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
                        )

                    base_layer.weight.data = orig_weight.to(orig_dtype)
                else:
                    if self.miss_fn == "bat":
                        delta_weight = self.get_delta_weight(active_adapter, self.base_layer.weight.data)
                        base_layer.weight.data += delta_weight.to(orig_dtype)
                    else:
                        # "mini" and the default variant share the same merge; the helper returns the merged weight.
                        merged_weight = self.get_delta_weight_miss(active_adapter, self.base_layer.weight.data)
                        base_layer.weight.data = merged_weight.to(orig_dtype)
                self.merged_adapters.append(active_adapter)

    def unmerge(self) -> None:
        """
        This method unmerges all merged adapter layers from the base weights.
        """
        if not self.merged:
            warnings.warn("Already unmerged. Nothing to do.")
            return

        while len(self.merged_adapters) > 0:
            active_adapter = self.merged_adapters.pop()
            base_layer = self.get_base_layer()
            orig_dtype = base_layer.weight.dtype
            if active_adapter in self.miss_block.keys():
                orig_weight = self.get_base_layer().weight.data.clone()
                # With re=True both helpers return the restored weight, not a delta.
                if self.miss_fn == "bat":
                    restored_weight = self.get_delta_weight(active_adapter, orig_weight, re=True)
                else:
                    restored_weight = self.get_delta_weight_miss(active_adapter, orig_weight, re=True)

                base_layer.weight.data = restored_weight.to(orig_dtype)

    def get_delta_weight(self, adapter, orig_weight, re: bool = False) -> torch.Tensor:
        """
        Compute the "bat" update for the given adapter.

        Args:
            adapter (str):
                The name of the adapter for which the delta weight should be computed.
            orig_weight (torch.Tensor):
                The weight the update is computed from; it is viewed as a grid of `[r, r]` blocks.
            re (bool):
                If `False`, return the delta `blocks @ B + B` that `merge` adds to the weight. If `True`, return the
                restored weight `(blocks - B) @ inverse(I + B)`, which inverts that merge.
        """
        device = self.miss_block[adapter].device
        dtype = self.miss_block[adapter].dtype
        # In case users wants to merge the adapter weights that are in
        # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
        # (b)float16 because some CPUs have slow bf16/fp16 matmuls.
        cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16)

        weight_miss = self.miss_block[adapter]

        if cast_to_fp32:
            weight_miss = weight_miss.float()
            orig_weight = orig_weight.to(weight_miss.dtype)

        r = weight_miss.size(-1)
        if re:
            o = orig_weight.reshape(orig_weight.size(0) // r, r, orig_weight.size(1) // r, r).permute(2, 0, 1, 3)
            one = torch.eye(weight_miss.size(-1)).to(weight_miss.device)
            # inverse must be in float32, after that the dtype can be adjusted if needed
            inv_I_plus_b = torch.inverse(one + weight_miss)
            inv_I_plus_b = inv_I_plus_b.to(weight_miss.dtype)
            w = (o - weight_miss) @ inv_I_plus_b
            output_tensor = w.permute(1, 2, 0, 3).reshape(*orig_weight.shape)
        else:
            w = (
                orig_weight.reshape(orig_weight.size(0) // r, r, orig_weight.size(1) // r, r).permute(2, 0, 1, 3)
                @ weight_miss
                + weight_miss
            )
            output_tensor = w.permute(1, 2, 0, 3).reshape(*orig_weight.shape)

        if cast_to_fp32:
            output_tensor = output_tensor.to(dtype=dtype)

            # cast back the weights
            self.miss_block[adapter].data = weight_miss.to(dtype)

        return output_tensor

    def get_delta_weight_miss(self, adapter, orig_weight, re: bool = False) -> torch.Tensor:
        """
        Apply (or remove) the default/"mini" MiSS shard to a weight.

        Despite the name, the return value is the full resulting weight, not a delta.

        Args:
            adapter (str):
                The name of the adapter whose block should be applied.
            orig_weight (torch.Tensor):
                The weight to update. When `in_features % r != 0` this tensor is modified in place and returned, so
                callers that need the input preserved must pass a copy.
            re (bool):
                If `False`, the block is added (merge); if `True`, it is subtracted (unmerge).
        """
        miss_param = self.miss_block[adapter]
        device = miss_param.device
        dtype = miss_param.dtype
        # In case users wants to merge the adapter weights that are in
        # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
        # (b)float16 because some CPUs have slow bf16/fp16 matmuls.
        cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16)

        # `weight_miss` is a working copy/view only; the stored parameter is never modified here.
        weight_miss = miss_param.float() if cast_to_fp32 else miss_param

        in_features = orig_weight.size(-1)
        out_features = orig_weight.size(0)
        r = weight_miss.size(0)
        if self.miss_fn == "mini":
            # Tile the (r, mini_r) block along out_features so it lines up with the weight.
            weight_miss = weight_miss.repeat(1, out_features // self.miss_mini_r[adapter])

        if in_features % r != 0:
            last_size = in_features % r
            n_block = in_features // r
            n_block_size = n_block * r

            if re:
                orig_weight[:, :n_block_size] = (
                    (orig_weight[:, :n_block_size].reshape(-1, n_block, r).permute(1, 2, 0) - weight_miss)
                    .permute(2, 0, 1)
                    .reshape(*orig_weight[:, :n_block_size].shape)
                )
                orig_weight[:, n_block_size:] = (
                    orig_weight[:, n_block_size:] - (weight_miss.transpose(0, 1))[:, :last_size]
                )
            else:
                orig_weight[:, :n_block_size] = (
                    (orig_weight[:, :n_block_size].reshape(-1, n_block, r).permute(1, 2, 0) + weight_miss)
                    .permute(2, 0, 1)
                    .reshape(*orig_weight[:, :n_block_size].shape)
                )
                orig_weight[:, n_block_size:] = (
                    orig_weight[:, n_block_size:] + (weight_miss.transpose(0, 1))[:, :last_size]
                )
            output_tensor = orig_weight

        else:
            if re:
                w = orig_weight.reshape(-1, orig_weight.size(1) // r, r).permute(1, 2, 0) - weight_miss
                output_tensor = w.permute(2, 0, 1).reshape(*orig_weight.shape)
            else:
                w = orig_weight.reshape(-1, orig_weight.size(1) // r, r).permute(1, 2, 0) + weight_miss
                output_tensor = w.permute(2, 0, 1).reshape(*orig_weight.shape)

        if cast_to_fp32:
            output_tensor = output_tensor.to(dtype=dtype)
            # No write-back into self.miss_block: the parameter itself was never cast, and in "mini" mode
            # `weight_miss` is the tiled (r, out_features) tensor, which would change the parameter's shape.

        return output_tensor

    def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
        """
        Run the base layer with the active MiSS adapters applied.

        With adapters disabled or already merged, only the base layer is called (unmerging first if adapters are
        disabled while merged). In "bat" mode the adapted weight is rebuilt and used in a single `F.linear` call.
        Otherwise the input's last dimension is zero-padded to a multiple of `r`, summed over its `r`-sized chunks
        and multiplied with the adapter block, and the result is added to the base output.
        """
        previous_dtype = x.dtype

        if self.disable_adapters:
            if self.merged:
                self.unmerge()
            result = self.base_layer(x, *args, **kwargs)
        elif self.merged:
            result = self.base_layer(x, *args, **kwargs)
        else:
            if self.miss_fn == "bat":
                orig_weight = self.base_layer.weight.data.clone()
                for active_adapter in self.active_adapters:
                    if active_adapter not in self.miss_block.keys():
                        continue
                    delta_weight = self.get_delta_weight(active_adapter, orig_weight)
                    orig_weight = orig_weight + delta_weight

                x = self._cast_input_dtype(x, orig_weight.dtype)
                bias = self._cast_input_dtype(self.base_layer.bias, orig_weight.dtype)
                result = F.linear(input=x, weight=orig_weight, bias=bias)
            else:
                result = self.base_layer(x, *args, **kwargs)
                for active_adapter in self.active_adapters:
                    if active_adapter not in self.miss_block.keys():
                        continue
                    miss = self.miss_block[active_adapter]
                    if self.miss_fn == "mini":
                        miss = miss.repeat(1, self.base_layer.out_features // self.miss_mini_r[active_adapter])

                    dropout = self.miss_dropout[active_adapter]
                    r = miss.size(0)
                    if x.size(-1) % r != 0:
                        # Appended zeros do not contribute to the chunk sum below.
                        padding_size = (r - x.size(-1) % r) % r
                        x = F.pad(x, (0, padding_size))
                    x = self._cast_input_dtype(x, miss.dtype)
                    result = result + torch.sum(dropout(x).reshape(*x.shape[:-1], x.size(-1) // r, r), dim=-2) @ miss

        result = result.to(previous_dtype)
        return result

    def __repr__(self) -> str:
        rep = super().__repr__()
        return "miss." + rep
class MissModel(BaseTuner):
    """
    Creates a MiSS model from a pretrained model by wrapping the targeted `torch.nn.Linear` layers in
    [`MissLinear`] adapter layers. The method is described in https://huggingface.co/papers/2409.15371

    Args:
        model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached.
        config ([`MissConfig`]): The configuration of the MiSS model.
        adapter_name (`str`): The name of the adapter, defaults to `"default"`.
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            Create empty adapter weights on meta device. Useful to speed up the loading process.

    Returns:
        `torch.nn.Module`: The MiSS model.

    Example:
        ```py
        >>> from diffusers import StableDiffusionPipeline
        >>> from peft import MissModel, MissConfig

        >>> config_te = MissConfig(
        ...     r=8,
        ...     target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"],
        ...     init_weights=True,
        ... )
        >>> config_unet = MissConfig(
        ...     r=8,
        ...     target_modules=[
        ...         "proj_in",
        ...         "proj_out",
        ...         "to_k",
        ...         "to_q",
        ...         "to_v",
        ...         "to_out.0",
        ...         "ff.net.0.proj",
        ...         "ff.net.2",
        ...     ],
        ...     init_weights=True,
        ... )

        >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        >>> model.text_encoder = MissModel(model.text_encoder, config_te, "default")
        >>> model.unet = MissModel(model.unet, config_unet, "default")
        ```

    **Attributes**:
        - **model** ([`~torch.nn.Module`]) -- The model to be adapted.
        - **peft_config** ([`MissConfig`]): The configuration of the MiSS model.
    """

    prefix: str = "miss_"
    tuner_layer_cls = MissLayer
    target_module_mapping = TRANSFORMERS_MODELS_TO_MISS_TARGET_MODULES_MAPPING

    def _create_and_replace(
        self,
        miss_config,
        adapter_name,
        target,
        target_name,
        parent,
        current_key,
        **optional_kwargs,
    ):
        """
        Add the adapter `adapter_name` to `target`.

        An existing `MissLayer` just receives an additional adapter; any other module is wrapped in a new MiSS
        module that replaces it on `parent`.

        Raises:
            ValueError: If `current_key` is `None`.
        """
        if current_key is None:
            raise ValueError("Current Key shouldn't be `None`")

        has_bias = hasattr(target, "bias") and target.bias is not None
        adapter_kwargs = {
            "r": miss_config.r,
            "mini_r": miss_config.mini_r,
            "miss_dropout": miss_config.miss_dropout,
            "init_weights": miss_config.init_weights,
        }

        if isinstance(target, MissLayer):
            # Already a MiSS layer: register one more adapter on it.
            target.update_layer(adapter_name, **adapter_kwargs)
            return

        wrapped = self._create_new_module(miss_config, adapter_name, target, bias=has_bias, **adapter_kwargs)
        if adapter_name not in self.active_adapters:
            # adding an additional adapter: it is not automatically trainable
            wrapped.requires_grad_(False)
        self._replace_module(parent, target_name, wrapped, target)

    @staticmethod
    def _create_new_module(miss_config, adapter_name, target, **kwargs):
        """
        Wrap `target` in a `MissLinear`.

        Raises:
            ValueError: If the (unwrapped) base layer is not a `torch.nn.Linear`.
        """
        unwrapped = target.get_base_layer() if isinstance(target, BaseTunerLayer) else target

        if not isinstance(unwrapped, torch.nn.Linear):
            raise ValueError(
                f"Target module {target} is not supported. Currently, only `torch.nn.Linear` is supported."
            )

        return MissLinear(target, adapter_name, **kwargs)
class MixedModel(BaseTuner):
    """
    A class that allows to mix different types of adapters in a single model.

    Note: This class should usually not be initialized directly. Instead, use `get_peft_model` with the argument
    `mixed=True`.

    Args:
        model (:obj:`nn.Module`):
            The model to be tuned.
        config (:obj:`PeftConfig`):
            The config of the model to be tuned. The adapter type must be compatible.
        adapter_name (:obj:`str`):
            The name of the first adapter.
    """

    def __init__(self, model: nn.Module, config: Configs, adapter_name: str) -> None:
        # Nothing is added on top of BaseTuner here; the override only narrows the config annotation.
        super().__init__(model, config, adapter_name)

    def _check_new_adapter_config(self, config: Configs) -> None:
        """
        A helper method to check the config when a new adapter is being added.

        Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters.

        """
        # `Configs` is a typing.Union alias; `__args__` is the tuple of its member classes, usable with isinstance.
        if not isinstance(config, Configs.__args__):
            raise ValueError(
                f"{self.__class__.__name__} only supports {COMPATIBLE_TUNER_TYPES} configs, but got {type(config)}."
            )

        super()._check_new_adapter_config(config)

    def _create_and_replace(
        self,
        config: Configs,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Forward to the `_create_and_replace` of the tuner model that matches the type of `config`.

        The tuner implementation is called unbound with this `MixedModel` instance as `self`.

        Raises:
            ValueError: If `config` is not an instance of one of the supported config classes.
        """
        # NOTE(review): AdaLoRA is tested before LoRA, presumably because AdaLoraConfig derives from LoraConfig and
        # would otherwise be caught by the LoRA branch -- confirm before reordering.
        if isinstance(config, adalora.AdaLoraConfig):
            adalora.AdaLoraModel._create_and_replace(self, config, *args, **kwargs)
        elif isinstance(config, lora.LoraConfig):
            lora.LoraModel._create_and_replace(self, config, *args, **kwargs)
        elif isinstance(config, loha.LoHaConfig):
            loha.LoHaModel._create_and_replace(self, config, *args, **kwargs)
        elif isinstance(config, lokr.LoKrConfig):
            lokr.LoKrModel._create_and_replace(self, config, *args, **kwargs)
        elif isinstance(config, oft.OFTConfig):
            oft.OFTModel._create_and_replace(self, config, *args, **kwargs)
        elif isinstance(config, shira.ShiraConfig):
            shira.ShiraModel._create_and_replace(self, config, *args, **kwargs)
        else:
            raise ValueError(f"Unsupported config type {type(config)}, should be one of {COMPATIBLE_TUNER_TYPES}.")

    def _replace_module(self, parent, child_name, new_module, child) -> None:
        """
        Install `new_module` as attribute `child_name` of `parent` and carry over weight, bias, state and device
        placement from `child` (or from the layer `child` wraps).
        """
        setattr(parent, child_name, new_module)
        # It's not necessary to set requires_grad here, as that is handled by
        # _mark_only_adapters_as_trainable

        # child layer wraps the original module, unpack it
        if hasattr(child, "base_layer"):
            child = child.get_base_layer()
        elif hasattr(child, "quant_linear_module"):
            # TODO maybe not necessary to have special treatment?
            child = child.quant_linear_module

        # A new module without `base_layer` does not wrap the old layer, so it needs the parameters directly.
        if not hasattr(new_module, "base_layer"):
            new_module.weight = child.weight
            if hasattr(child, "bias"):
                new_module.bias = child.bias

        # Propagate the optional `state` attribute of the old layer to wherever the new module keeps its base layer.
        if getattr(child, "state", None) is not None:
            if hasattr(new_module, "base_layer"):
                new_module.base_layer.state = child.state
            else:
                new_module.state = child.state
            new_module.to(child.weight.device)

        # dispatch to correct device
        for name, module in new_module.named_modules():
            if any(prefix in name for prefix in PREFIXES):
                module.to(child.weight.device)
            if "ranknum" in name:
                module.to(child.weight.device)

    def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None:
        """
        Freeze every parameter whose name contains none of `PREFIXES`, then re-enable bias parameters according to
        the `bias` setting of each active adapter's config.

        Raises:
            ValueError: If a config's `bias` is something other than "none", "all" or "lora_only".
        """
        for n, p in model.named_parameters():
            if not any(prefix in n for prefix in PREFIXES):
                p.requires_grad = False

        for active_adapter in self.active_adapters:
            # Configs without a `bias` attribute are treated like bias="none".
            bias = getattr(self.peft_config[active_adapter], "bias", "none")
            if bias == "none":
                continue

            if bias == "all":
                for n, p in model.named_parameters():
                    if "bias" in n:
                        p.requires_grad = True
            elif bias == "lora_only":
                # TODO: check if this is needed for other supported types
                for m in model.modules():
                    if isinstance(m, Layers) and hasattr(m, "bias") and m.bias is not None:
                        m.bias.requires_grad = True
            else:
                raise ValueError(f"Requested bias: {bias}, is not implemented.")

    @staticmethod
    def _create_new_module(config, adapter_name, target, **kwargs):
        """
        Build the adapter module for `target` by delegating to the tuner model matching the type of `config`.

        Raises:
            ValueError: If GPTQ or 8bit/4bit quantization is requested, or if the config type is not supported.
        """
        gptq_quantization_config = kwargs.get("gptq_quantization_config", None)
        AutoGPTQQuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config)
        if (gptq_quantization_config is not None) or (AutoGPTQQuantLinear is not None):
            raise ValueError(f"GPTQ quantization not supported for {config.peft_type.value} (yet).")

        # `pop` removes the flags so they are not forwarded to the tuner-specific factory below.
        loaded_in_8bit = kwargs.pop("loaded_in_8bit", False)
        loaded_in_4bit = kwargs.pop("loaded_in_4bit", False)
        if loaded_in_8bit or loaded_in_4bit:
            raise ValueError(f"8bit and 4bit quantization not supported for {config.peft_type.value} (yet).")

        if isinstance(config, adalora.AdaLoraConfig):
            new_module = adalora.AdaLoraModel._create_new_module(config, adapter_name, target, **kwargs)
        elif isinstance(config, lora.LoraConfig):
            new_module = lora.LoraModel._create_new_module(config, adapter_name, target, **kwargs)
        elif isinstance(config, loha.LoHaConfig):
            new_module = loha.LoHaModel._create_new_module(config, adapter_name, target, **kwargs)
        elif isinstance(config, lokr.LoKrConfig):
            new_module = lokr.LoKrModel._create_new_module(config, adapter_name, target, **kwargs)
        elif isinstance(config, oft.OFTConfig):
            new_module = oft.OFTModel._create_new_module(config, adapter_name, target, **kwargs)
        elif isinstance(config, shira.ShiraConfig):
            new_module = shira.ShiraModel._create_new_module(config, adapter_name, target, **kwargs)
        else:
            raise ValueError(f"Unknown config type {type(config)}, should be one of {COMPATIBLE_TUNER_TYPES}.")
        return new_module

    def set_adapter(self, adapter_name: Union[str, list[str]], inference_mode: bool = False) -> None:
        """
        Activate `adapter_name` on the auxiliary adapters and on every supported adapter layer of the model.

        Layers that are currently merged are unmerged first (with a warning).
        """
        self.set_auxiliary_adapters(adapter_name, inference_mode=inference_mode)
        for module in self.model.modules():
            if isinstance(module, Layers):
                if module.merged:
                    warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.")
                    module.unmerge()
                module.set_adapter(adapter_name, inference_mode=inference_mode)
        self.active_adapter = adapter_name

    @staticmethod
    def _prepare_adapter_config(peft_config, model_config):
        """
        Fill in `peft_config.target_modules` from the LoRA default mapping when the user did not set it.

        Raises:
            ValueError: If `target_modules` is unset and `model_config["model_type"]` has no default mapping.
        """
        if peft_config.target_modules is None:
            if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING:
                raise ValueError("Please specify `target_modules` in `peft_config`")

            peft_config.target_modules = set(
                TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]]
            )
        return peft_config

    def _unload_and_optionally_merge(
        self,
        merge=True,
        progressbar: bool = False,
        safe_merge: bool = False,
        adapter_names: Optional[list[str]] = None,
    ):
        """
        Replace every adapter layer by its base layer, optionally merging the adapter weights first.

        Args:
            merge: Whether to call `merge` on the adapter layers before they are replaced.
            progressbar: Whether to show a tqdm progress bar while walking the modules.
            safe_merge: Forwarded to each layer's `merge`.
            adapter_names: Forwarded to each layer's `merge`.

        Returns:
            `self.model`, after the replacement.

        Raises:
            ValueError: If `merge` is requested and the model's `quantization_method` is "gptq".
        """
        if merge:
            if getattr(self.model, "quantization_method", None) == "gptq":
                raise ValueError("Cannot merge layers when the model is gptq quantized")

        def merge_recursively(module):
            # helper function to recursively merge the base_layer of the target
            path = []
            layer = module
            while hasattr(layer, "base_layer"):
                path.append(layer)
                layer = layer.base_layer
            # Walk the chain of nested wrappers pairwise: merge the inner wrapper, then splice it out of the chain.
            for layer_before, layer_after in zip(path[:-1], path[1:]):
                layer_after.merge(safe_merge=safe_merge, adapter_names=adapter_names)
                layer_before.base_layer = layer_after.base_layer
            module.merge(safe_merge=safe_merge, adapter_names=adapter_names)

        # Only visit modules that are not themselves adapter sub-modules (their names contain a tuner prefix).
        key_list = [key for key, _ in self.model.named_modules() if not any(prefix in key for prefix in PREFIXES)]
        desc = "Unloading " + ("and merging " if merge else "") + "model"

        for key in tqdm(key_list, disable=not progressbar, desc=desc):
            try:
                parent, target, target_name = _get_submodules(self.model, key)
            except AttributeError:
                # NOTE(review): presumably the key went stale because an ancestor was already replaced -- confirm.
                continue

            if hasattr(target, "base_layer"):
                if merge:
                    merge_recursively(target)
                self._replace_module(parent, target_name, target.get_base_layer(), target)
            elif isinstance(target, ModulesToSaveWrapper):
                # save any additional trainable modules part of `modules_to_save`
                new_module = target.modules_to_save[target.active_adapter]
                if hasattr(new_module, "base_layer"):
                    # check if the module is itself a tuner layer
                    if merge:
                        new_module.merge(safe_merge=safe_merge, adapter_names=adapter_names)
                    new_module = new_module.get_base_layer()
                setattr(parent, target_name, new_module)

        return self.model

    def add_weighted_adapter(self, *args: Any, **kwargs: Any) -> None:
        """Not available for mixed models; always raises `NotImplementedError`."""
        raise NotImplementedError(f"Weighted adapters are not supported for {self.__class__.__name__} (yet).")

    def delete_adapter(self, adapter_name: Union[str, list[str]]) -> None:
        """
        Deletes an existing adapter.

        Args:
            adapter_name (Union[str, list[str]]): Name of the adapter(s) to delete.

        Raises:
            ValueError: If any of the given names is not a key of `self.peft_config`.
        """
        if isinstance(adapter_name, str):
            adapter_names = [adapter_name]
        else:
            adapter_names = adapter_name

        mismatched = set(adapter_names) - set(self.peft_config.keys())
        if mismatched:
            raise ValueError(
                f"Adapter(s) {sorted(mismatched)} not found, available adapters: {sorted(self.peft_config.keys())}"
            )

        # NOTE(review): the per-layer clean-up below is assumed to sit inside this loop so that every requested
        # adapter is removed from the layers; the nesting should be confirmed against the upstream file.
        for adapter_name in adapter_names:
            del self.peft_config[adapter_name]

            key_list = [key for key, _ in self.model.named_modules() if not any(prefix in key for prefix in PREFIXES)]
            new_adapter = None
            for key in key_list:
                _, target, _ = _get_submodules(self.model, key)
                if isinstance(target, BaseTunerLayer):
                    target.delete_adapter(adapter_name)
                    # The first tuner layer encountered decides which adapters are reported as active afterwards.
                    if new_adapter is None:
                        new_adapter = target.active_adapters[:]

            self.active_adapter = new_adapter or []
            _delete_auxiliary_adapter(self.model, adapter_name, new_active_adapters=new_adapter)

    def generate(self, *args: Any, **kwargs: Any):
        """Forward all arguments to `self.model.generate` and return its result."""
        return self.model.generate(*args, **kwargs)
b/peft/src/peft/tuners/multitask_prompt_tuning/__init__.py @@ -0,0 +1,25 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from peft.utils import register_peft_method + +from .config import MultitaskPromptTuningConfig, MultitaskPromptTuningInit +from .model import MultitaskPromptEmbedding + + +__all__ = ["MultitaskPromptEmbedding", "MultitaskPromptTuningConfig", "MultitaskPromptTuningInit"] + +register_peft_method( + name="multitask_prompt_tuning", config_cls=MultitaskPromptTuningConfig, model_cls=MultitaskPromptEmbedding +) diff --git a/peft/src/peft/tuners/multitask_prompt_tuning/config.py b/peft/src/peft/tuners/multitask_prompt_tuning/config.py new file mode 100644 index 0000000000000000000000000000000000000000..6cb279573559e4fa33a973aa35e052647a6ab8f9 --- /dev/null +++ b/peft/src/peft/tuners/multitask_prompt_tuning/config.py @@ -0,0 +1,62 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class MultitaskPromptTuningInit(str, enum.Enum):
    """
    Names of the initialization strategies accepted by `MultitaskPromptTuningConfig.prompt_tuning_init`.

    The members are also `str` instances, so each one compares equal to its plain string value.
    """

    # initialize prompt with text
    TEXT = "TEXT"
    # initialize prompt with random matrix
    RANDOM = "RANDOM"
    # average the prefix and column matrices obtained during source training
    AVERAGE_SOURCE_TASKS = "AVERAGE_SOURCE_TASKS"
    # pick prefix and column matrices for a particular task obtained during source training
    EXACT_SOURCE_TASK = "EXACT_SOURCE_TASK"
    # only use the prompt embeddings trained during source training
    ONLY_SOURCE_SHARED = "ONLY_SOURCE_SHARED"


@dataclass
class MultitaskPromptTuningConfig(PromptTuningConfig):
    """
    Configuration for multitask prompt tuning; extends `PromptTuningConfig` with the fields declared below.

    `__post_init__` runs the parent hook and then sets `peft_type` to `PeftType.MULTITASK_PROMPT_TUNING`.
    """

    # Initialization strategy; defaults to RANDOM. A plain string is accepted as well as the enum member.
    prompt_tuning_init: Union[MultitaskPromptTuningInit, str] = field(
        default=MultitaskPromptTuningInit.RANDOM,
        metadata={
            "help": (
                "How to initialize the prompt tuning parameters. Can be one of TEXT, RANDOM, AVERAGE_SOURCE_TASKS, "
                "EXACT_SOURCE_TASK, ONLY_SOURCE_SHARED."
            ),
        },
    )
    # Location of the source-training state dict; unset by default.
    prompt_tuning_init_state_dict_path: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The path of source state dict. This is required when training the downstream target prompt from "
                "the pretrained source prompt"
            ),
        },
    )
    # Source task id used for initialization (see help text); defaults to task 0.
    prompt_tuning_init_task: Optional[int] = field(default=0, metadata={"help": "source task id for initialization"})
    # Number of ranks (see help text); defaults to 1.
    num_ranks: Optional[int] = field(default=1, metadata={"help": "ranks"})
    # Number of tasks (see help text); defaults to 1.
    num_tasks: Optional[int] = field(default=1, metadata={"help": "number of tasks"})

    def __post_init__(self):
        # Let the parent finish its own post-init first, then pin the PEFT type for this config class.
        super().__post_init__()
        self.peft_type = PeftType.MULTITASK_PROMPT_TUNING
class MultitaskPromptEmbedding(PromptEmbedding):
    """
    Prompt embedding whose output is scaled element-wise by a per-task low-rank matrix.

    For every task a `(total_virtual_tokens, num_ranks)` column factor and a `(num_ranks, token_dim)` row factor
    are kept as parameters; their product multiplies the shared prompt embedding in `forward`.
    """

    def __init__(self, config: MultitaskPromptTuningConfig, word_embeddings):
        super().__init__(config, word_embeddings)

        self.num_tasks = config.num_tasks
        self.num_ranks = config.num_ranks
        self.num_virtual_tokens = config.num_virtual_tokens

        # Fall back to 2 sub-modules for seq2seq tasks and 1 otherwise when the config leaves the value open.
        submodule_count = config.num_transformer_submodules
        if submodule_count is None:
            submodule_count = 2 if config.task_type == TaskType.SEQ_2_SEQ_LM else 1
        self.num_transformer_submodules = submodule_count

        self.token_dim = config.token_dim

        total_virtual_tokens = self.num_virtual_tokens * self.num_transformer_submodules

        # Column factors are drawn before row factors so that the random stream is consumed in a fixed order.
        cols_shape = (self.num_tasks, total_virtual_tokens, self.num_ranks)
        rows_shape = (self.num_tasks, self.num_ranks, self.token_dim)
        self.prefix_task_cols = torch.nn.Parameter(torch.normal(mean=0, std=0.02, size=cols_shape))
        self.prefix_task_rows = torch.nn.Parameter(torch.normal(mean=0, std=0.02, size=rows_shape))

        init_method = config.prompt_tuning_init
        factorized_inits = [
            MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS,
            MultitaskPromptTuningInit.EXACT_SOURCE_TASK,
        ]
        if init_method not in [*factorized_inits, MultitaskPromptTuningInit.ONLY_SOURCE_SHARED]:
            # TEXT / RANDOM (or anything else): nothing has to be loaded from a source checkpoint.
            return

        source_path = config.prompt_tuning_init_state_dict_path
        if source_path is None:
            raise ValueError(
                f"prompt_tuning_init_state_dict_path needs to be specified with {config.prompt_tuning_init} "
                "init method"
            )

        if source_path.endswith(".safetensors"):
            # Imported lazily so that safetensors is only needed for this file format.
            from safetensors.torch import load_file

            source_state: dict = load_file(source_path)
        else:
            source_state: dict = torch_load(source_path, map_location=word_embeddings.weight.device)

        if init_method not in factorized_inits:
            # ONLY_SOURCE_SHARED: reuse the shared prompt only; the task factors keep their random init.
            self.load_state_dict({"embedding.weight": source_state["prompt_embeddings"]}, strict=False)
            return

        source_cols: torch.Tensor = source_state["prefix_task_cols"]
        source_rows: torch.Tensor = source_state["prefix_task_rows"]

        if init_method == MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS:
            # Collapse the task axis to a single averaged task.
            source_cols = source_cols.mean(0, keepdim=True)
            source_rows = source_rows.mean(0, keepdim=True)
        else:
            # EXACT_SOURCE_TASK: keep the factors of one source task, restoring the leading task axis.
            task_index = config.prompt_tuning_init_task
            source_cols = source_cols[task_index, ...].unsqueeze(0)
            source_rows = source_rows[task_index, ...].unsqueeze(0)

        self.load_state_dict(
            {
                "embedding.weight": source_state["prompt_embeddings"],
                "prefix_task_cols": source_cols,
                "prefix_task_rows": source_rows,
            },
            strict=True,
        )

    def forward(self, indices, task_ids):
        """
        Embed `indices` and scale the result in place by the low-rank prompt of each entry in `task_ids`.

        Raises:
            ValueError: If `task_ids` is None.
        """
        if task_ids is None:
            raise ValueError("task_ids cannot be None")

        prompt_embeddings = self.embedding(indices)

        # Select the factors of the requested tasks and multiply them into one matrix per task.
        task_prompts = torch.matmul(
            torch.index_select(self.prefix_task_cols, 0, task_ids),
            torch.index_select(self.prefix_task_rows, 0, task_ids),
        )

        prompt_embeddings *= task_prompts
        return prompt_embeddings
def __getattr__(name):
    """
    Resolve the optional quantized layer classes lazily on module attribute access.

    Each class is imported only when it is asked for by name and its backend is reported as available; every
    other lookup ends in `AttributeError`.
    """
    if name == "Linear8bitLt":
        if is_bnb_available():
            from .bnb import Linear8bitLt

            return Linear8bitLt
    elif name == "Linear4bit":
        if is_bnb_4bit_available():
            from .bnb import Linear4bit

            return Linear4bit
    elif name == "EetqOFTLinear":
        if is_eetq_available():
            from .eetq import EetqOFTLinear

            return EetqOFTLinear

    # Unknown name, or a known name whose backend is missing.
    raise AttributeError(f"module {__name__} has no attribute {name}")
class AqlmOFTLinear(torch.nn.Module, OFTLayer):
    """
    OFT adapter layer around an AQLM-quantized linear base layer.

    The rotation module of each active adapter (`self.oft_R[adapter]`) is applied to the input before the base
    layer is called. Merging is not supported for this layer type.
    """

    def __init__(
        self,
        base_layer,
        adapter_name: str,
        r: int = 0,
        oft_block_size: int = 32,
        module_dropout: float = 0.0,
        init_weights: bool = True,
        coft: bool = False,
        eps: float = 6e-5,
        block_share: bool = False,
        fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        use_cayley_neumann: bool = False,
        num_cayley_neumann_terms: int = 5,
        **kwargs,
    ):
        # `fan_in_fan_out` and `**kwargs` are accepted for signature compatibility but not used in this body.
        super().__init__()
        OFTLayer.__init__(self, base_layer)

        self._active_adapter = adapter_name
        self.update_layer(
            adapter_name,
            r,
            oft_block_size=oft_block_size,
            module_dropout=module_dropout,
            init_weights=init_weights,
            coft=coft,
            eps=eps,
            block_share=block_share,
            use_cayley_neumann=use_cayley_neumann,
            num_cayley_neumann_terms=num_cayley_neumann_terms,
        )

    def forward(self, x: torch.Tensor):
        """
        Apply the rotations of all active adapters to `x`, then run the quantized base layer.

        When at least one adapter was applied outside of autocast, the result is cast back to the dtype the
        caller passed in. When no active adapter belongs to this layer, the base layer output is returned as is.
        """
        # note: logic differs from default Linear because merging is not supported
        if self.disable_adapters:
            return self.base_layer(x)

        # Dtype of the caller's input, recorded once before the first cast. It stays None when no adapter had to
        # convert the input, which also covers the case that no active adapter is present on this layer
        # (previously this path failed on an unbound local variable).
        expected_dtype = None

        for active_adapter in self.active_adapters:
            if active_adapter not in self.oft_R.keys():
                continue
            oft_R = self.oft_R[active_adapter]

            if not torch.is_autocast_enabled():
                if expected_dtype is None:
                    # Must be captured before the cast below; on later adapters `x` already has the adapter dtype.
                    expected_dtype = x.dtype
                x = self._cast_input_dtype(x, oft_R.weight.dtype)

            x = oft_R(x)

        result = self.base_layer(x)
        if expected_dtype is not None:
            result = result.to(expected_dtype)
        return result

    def __repr__(self) -> str:
        rep = super().__repr__()
        return "oft." + rep


def dispatch_aqlm(
    target: torch.nn.Module,
    adapter_name: str,
    **kwargs: Any,
) -> Optional[torch.nn.Module]:
    """
    Wrap `target` in an `AqlmOFTLinear` if its base layer is an AQLM `QuantizedLinear`.

    Returns:
        The new adapter module, or None when aqlm is not available or the base layer has a different type. On
        success `target.qweight` is additionally set to the base layer's `codes`.
    """
    new_module = None

    if isinstance(target, BaseTunerLayer):
        target_base_layer = target.get_base_layer()
    else:
        target_base_layer = target

    # `QuantizedLinear` is only imported when aqlm is installed; the availability check short-circuits first.
    if is_aqlm_available() and isinstance(target_base_layer, QuantizedLinear):
        new_module = AqlmOFTLinear(target, adapter_name, **kwargs)
        target.qweight = target_base_layer.codes

    return new_module
class AwqOFTLinear(torch.nn.Module, OFTLayer):
    """OFT adapter layer that rotates the input before handing it to an AWQ-quantized linear layer."""

    def __init__(
        self,
        base_layer,
        adapter_name,
        r: int = 0,
        oft_block_size: int = 32,
        module_dropout: float = 0.0,
        coft: bool = False,
        eps: float = 6e-5,
        block_share: bool = False,
        fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        init_weights: bool = True,
        use_cayley_neumann: bool = False,
        num_cayley_neumann_terms: int = 5,
        **kwargs,
    ):
        super().__init__()
        OFTLayer.__init__(self, base_layer)

        # self.base_layer and self.quant_linear_module are the same; we need the former for consistency and the latter
        # for backwards compatibility
        self.quant_linear_module = base_layer

        self._active_adapter = adapter_name

        # Collect the adapter hyper-parameters once and hand them over in a single call.
        layer_options = {
            "oft_block_size": oft_block_size,
            "module_dropout": module_dropout,
            "coft": coft,
            "eps": eps,
            "block_share": block_share,
            "init_weights": init_weights,
            "use_cayley_neumann": use_cayley_neumann,
            "num_cayley_neumann_terms": num_cayley_neumann_terms,
        }
        self.update_layer(adapter_name, r, **layer_options)

    def forward(self, x: torch.Tensor):
        """Rotate `x` with every active adapter present on this layer, then apply the quantized linear module."""
        if self.disable_adapters:
            return self.quant_linear_module(x)

        for adapter in self.active_adapters:
            if adapter not in self.oft_R.keys():
                continue
            rotation = self.oft_R[adapter]

            if torch.is_autocast_enabled():
                # Under autocast no manual dtype handling is done.
                x = rotation(x)
            else:
                # Run the rotation in the adapter's dtype and restore the incoming dtype right afterwards.
                incoming_dtype = x.dtype
                x = self._cast_input_dtype(x, rotation.weight.dtype)
                x = rotation(x).to(incoming_dtype)

        return self.quant_linear_module(x)

    def __repr__(self) -> str:
        return "oft." + super().__repr__()


def dispatch_awq(
    target: torch.nn.Module,
    adapter_name: str,
    **kwargs: Any,
) -> Optional[torch.nn.Module]:
    """
    Return an `AwqOFTLinear` around `target` when its base layer is an AutoAWQ `WQLinear_GEMM`, else None.

    Raises:
        ImportError: If the installed autoawq version is older than 0.2.0.
    """
    target_base_layer = target.get_base_layer() if isinstance(target, BaseTunerLayer) else target

    if not is_auto_awq_available():
        return None

    from awq.modules.linear import WQLinear_GEMM

    if not isinstance(target_base_layer, WQLinear_GEMM):
        return None

    # Raise the error only at the dispatch level
    minimum_version = packaging.version.parse("0.2.0")
    installed_version = packaging.version.parse(importlib_metadata.version("autoawq"))
    if installed_version < minimum_version:
        raise ImportError(
            f"Found an incompatible version of auto-awq. Found version {installed_version}, "
            f"but only versions above {minimum_version} are supported for PEFT."
        )

    new_module = AwqOFTLinear(target, adapter_name, **kwargs)
    target.qweight = target_base_layer.qweight
    return new_module
def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
    """
    Merge the active adapter weights into the base weights

    Args:
        safe_merge (`bool`, *optional*):
            If True, the merge operation will be performed in a copy of the original weights and check for NaNs
            before merging the weights. This is useful if you want to check if the merge operation will produce
            NaNs. Defaults to `False`.
        adapter_names (`list[str]`, *optional*):
            The list of adapter names that should be merged. If None, all active adapters will be merged.
            Defaults to `None`.

    Raises:
        ValueError: If `safe_merge` is set and the merged weight contains non-finite values.
    """
    adapter_names = check_adapters_to_merge(self, adapter_names)
    if not adapter_names:
        # no adapter to merge
        return

    for active_adapter in adapter_names:
        if active_adapter not in self.oft_R.keys():
            continue

        warnings.warn("Merge oft module to 8-bit linear may get different generations due to rounding errors.")

        weight = self.get_base_layer().weight
        state = self.get_base_layer().state
        if state.SCB is None:
            state.SCB = weight.SCB

        # Dequantize the result of identity matrix and int8 weight because bitsandbytes does not support int8
        # dequantization directly
        output = dequantize_bnb_weight(weight, state=state)
        oft_data = self.get_delta_weight(active_adapter)

        # Rotate on the transposed weight, then transpose back so the result has the base weight's layout.
        output = torch.transpose(output, 0, 1)
        w_data = torch.mm(oft_data, output.to(oft_data.dtype))
        w_data = torch.transpose(w_data, 0, 1)
        # Cast the *rotated* weight. Casting `output` here would throw the rotation away and store the
        # transposed, un-rotated weight instead.
        w_data = w_data.to(oft_data.dtype).to(oft_data.device)

        if safe_merge and not torch.isfinite(w_data).all():
            raise ValueError(
                f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
            )

        # Re-quantize: the parameter is built on CPU and quantized again when moved to the original device.
        self.get_base_layer().weight = bnb.nn.Int8Params(
            w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights
        ).to(weight.device)

        state.reset_grads()
        self.merged_adapters.append(active_adapter)
def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
    """
    Run the base layer on `x`, rotating the input with the active adapters first when that is required.

    With adapters disabled, any merged adapter is unmerged and the base layer is called directly. With merged
    adapters the base layer already contains them, so the input is passed through unchanged.
    """
    if self.disable_adapters:
        # Adapters are switched off: make sure nothing stays folded into the base weights.
        if self.merged:
            self.unmerge()
        return self.base_layer(x, *args, **kwargs)

    if not self.merged:
        for adapter in self.active_adapters:
            if adapter not in self.oft_R.keys():
                continue
            rotation = self.oft_R[adapter]

            if torch.is_autocast_enabled():
                # Under autocast no manual dtype handling is done.
                x = rotation(x)
            else:
                # Rotate in the adapter's dtype, then go back to the dtype the input arrived in.
                incoming_dtype = x.dtype
                x = self._cast_input_dtype(x, rotation.weight.dtype)
                x = rotation(x).to(incoming_dtype)

    return self.base_layer(x, *args, **kwargs)
def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
    """
    Merge the active adapter weights into the base weights

    Args:
        safe_merge (`bool`, *optional*):
            If True, the merge operation will be performed in a copy of the original weights and check for NaNs
            before merging the weights. This is useful if you want to check if the merge operation will produce
            NaNs. Defaults to `False`.
        adapter_names (`list[str]`, *optional*):
            The list of adapter names that should be merged. If None, all active adapters will be merged.
            Defaults to `None`.

    Raises:
        ValueError: If `safe_merge` is set and the merged weight contains non-finite values.
    """
    adapter_names = check_adapters_to_merge(self, adapter_names)
    if not adapter_names:
        # no adapter to merge
        return

    for active_adapter in adapter_names:
        if active_adapter not in self.oft_R.keys():
            continue

        warnings.warn("Merge oft module to 4-bit linear may get different generations due to rounding errors.")
        # Refer to https://gist.github.com/ChrisHayduk/1a53463331f52dca205e55982baf9930
        weight = self.get_base_layer().weight
        kwargs = weight.__dict__

        output = dequantize_bnb_weight(weight, state=weight.quant_state)

        oft_data = self.get_delta_weight(active_adapter)
        # Rotate on the transposed weight, then transpose back so the result has the base weight's layout.
        output = torch.transpose(output, 0, 1)
        w_data = torch.mm(oft_data, output.to(oft_data.dtype))
        w_data = torch.transpose(w_data, 0, 1)
        # Cast the *rotated* weight. Casting `output` here would throw the rotation away and store the
        # transposed, un-rotated weight instead.
        w_data = w_data.to(oft_data.dtype).to(oft_data.device)

        if safe_merge and not torch.isfinite(w_data).all():
            raise ValueError(
                f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
            )

        # Reuse the old parameter's attributes for the new one, but mark it as not yet quantized so that moving
        # it to the device quantizes the merged data again.
        if "bnb_quantized" in kwargs:
            kwargs["bnb_quantized"] = False
        kwargs["requires_grad"] = False
        kwargs.pop("data", None)
        # torch.compile can introduce attributes preceded by '_', remove them
        kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")}
        self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), **kwargs).to(weight.device)

        self.merged_adapters.append(active_adapter)


def unmerge(self) -> None:
    """
    This method unmerges all merged adapter layers from the base weights.
    """
    if not self.merged:
        warnings.warn("Already unmerged. Nothing to do.")
        return

    # Undo the merges in reverse order of application.
    while len(self.merged_adapters) > 0:
        active_adapter = self.merged_adapters.pop()
        if active_adapter not in self.oft_R.keys():
            continue
        warnings.warn(
            "Unmerge oft module to 4-bit linear may get different generations due to rounding errors."
        )

        weight = self.get_base_layer().weight
        kwargs = weight.__dict__
        output = dequantize_bnb_weight(weight, state=weight.quant_state)

        oft_data = self.get_delta_weight(active_adapter)

        # Apply the transposed adapter matrix to reverse the product applied in `merge`.
        output = torch.transpose(output, 0, 1)
        w_data = torch.mm(oft_data.t(), output.to(oft_data.dtype))
        w_data = torch.transpose(w_data, 0, 1)
        # Cast the un-rotated result; casting `output` would leave the merged (and transposed) weight in place.
        w_data = w_data.to(oft_data.dtype).to(oft_data.device)

        if "bnb_quantized" in kwargs:
            kwargs["bnb_quantized"] = False
        kwargs["requires_grad"] = False
        kwargs.pop("data", None)
        self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), **kwargs).to(weight.device)
+ rep + + def dispatch_bnb_4bit(target: torch.nn.Module, adapter_name: str, **kwargs): + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + loaded_in_4bit = kwargs.get("loaded_in_4bit", False) + if loaded_in_4bit and is_bnb_4bit_available() and isinstance(target_base_layer, bnb.nn.Linear4bit): + fourbit_kwargs = kwargs.copy() + fourbit_kwargs.update( + { + "compute_dtype": target_base_layer.compute_dtype, + "compress_statistics": target_base_layer.weight.compress_statistics, + "quant_type": target_base_layer.weight.quant_type, + } + ) + new_module = Linear4bit(target, adapter_name, **fourbit_kwargs) + + return new_module diff --git a/peft/src/peft/tuners/oft/config.py b/peft/src/peft/tuners/oft/config.py new file mode 100644 index 0000000000000000000000000000000000000000..4b33f13cb362a8eec851c5750e16bd50f109ddc6 --- /dev/null +++ b/peft/src/peft/tuners/oft/config.py @@ -0,0 +1,196 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Literal, Optional, Union + +from peft.config import PeftConfig +from peft.utils import PeftType + + +@dataclass +class OFTConfig(PeftConfig): + """ + This is the configuration class to store the configuration of a [`OFTModel`]. 
+ + Args: + r (`int`): OFT rank, number of OFT blocks per injected layer. + oft_block_size (`int`): OFT block size across different layers. + module_dropout (`float`): + The multiplicative dropout probability, by setting OFT blocks to identity during training, similar to the + dropout layer in LoRA. + target_modules (`Optional[Union[list[str], str]]`): + The names of the modules to apply the adapter to. If this is specified, only the modules with the specified + names will be replaced. When passing a string, a regex match will be performed. When passing a list of + strings, either an exact match will be performed or it is checked if the name of the module ends with any + of the passed strings. If this is specified as 'all-linear', then all linear modules are chosen, excluding + the output layer. If this is not specified, modules will be chosen according to the model architecture. If + the architecture is not known, an error will be raised -- in this case, you should specify the target + modules manually. + fan_in_fan_out (`bool`): Set this to True if the layer to replace stores weight like (fan_in, fan_out). + bias (`str`): Bias type for OFT. Can be 'none', 'all' or 'oft_only'. If 'all' or 'oft_only', the + corresponding biases will be updated during training. Be aware that this means that, even when disabling + the adapters, the model will not produce the same output as the base model would have without adaptation. + exclude_modules (`Optional[Union[List[str], str]]`): + The names of the modules to not apply the adapter. When passing a string, a regex match will be performed. + When passing a list of strings, either an exact match will be performed or it is checked if the name of the + module ends with any of the passed strings. + init_weights (`bool`): + Whether to perform initialization of OFT weights. + layers_to_transform (`Union[List[int], int]`): + The layer indices to transform. 
If a list of ints is passed, it will apply the adapter to the layer indices + that are specified in this list. If a single integer is passed, it will apply the transformations on the + layer at this index. + layers_pattern (`Optional[Union[List[str], str]]`): + The layer pattern name, used only if `layers_to_transform` is different from `None`. This should target the + `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`. + modules_to_save (`List[str]`): + List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. + coft (`bool`): + Whether to use the constrained variant of OFT or not, off by default. + eps (`float`): + The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True. + block_share (`bool`): + Whether to share the OFT parameters between blocks or not. This is `False` by default. + """ + + r: int = field(default=0, metadata={"help": "OFT rank, number of OFT blocks per injected layer."}) + oft_block_size: int = field( + default=32, + metadata={ + "help": "OFT block size across different layers.", + "note": "You can only specify either r or oft_block_size, but not both simultaneously, because r x oft_block_size = layer dimension.", + }, + ) + module_dropout: float = field( + default=0.0, + metadata={ + "help": "OFT multiplicative dropout, randomly setting blocks of OFT to be identity matrix, similar to the dropout layer in LoRA." + }, + ) + target_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to replace with OFT." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." 
+ }, + ) + fan_in_fan_out: bool = field( + default=False, + metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, + ) + bias: Literal["none", "all", "oft_only"] = field( + default="none", metadata={"help": "Bias type for OFT. Can be 'none', 'all' or 'oft_only'"} + ) + exclude_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={"help": "List of module names or regex expression of the module names to exclude from OFT."}, + ) + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "Whether to initialize the weights of the OFT layers with their default initialization. Don't change " + "this setting, except if you know exactly what you're doing." + ), + }, + ) + layers_to_transform: Optional[Union[list[int], int]] = field( + default=None, + metadata={ + "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index." + }, + ) + layers_pattern: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern. " + "This should target the `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`." + }, + ) + modules_to_save: Optional[list[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from OFT layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." 
+ }, + ) + coft: bool = field( + default=False, + metadata={"help": "Whether to use the constrained variant of OFT or not."}, + ) + eps: float = field( + default=6e-5, + metadata={ + "help": "The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True." + }, + ) + block_share: bool = field( + default=False, + metadata={"help": "Whether to share the OFT parameters between blocks or not."}, + ) + use_cayley_neumann: bool = field( + default=True, + metadata={ + "help": "Whether to use the Cayley-Neumann Formulation of OFT or not. Set to True to improve computational efficiency but comes at costs of bigger approximation error for orthogonality." + }, + ) + num_cayley_neumann_terms: int = field( + default=5, + metadata={ + "help": "Number of Cayley-Neumann terms to use. Higher number results in less approximation error for orthogonality." + }, + ) + + def __post_init__(self): + super().__post_init__() + self.peft_type = PeftType.OFT + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) + self.exclude_modules = ( + set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules + ) + # check for layers_to_transform and layers_pattern + if self.layers_pattern and not self.layers_to_transform: + raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ") + if self.r == 0 and self.oft_block_size == 0: + raise ValueError( + f"Either `r` or `oft_block_size` must be non-zero. Currently, r = {self.r} and oft_block_size = {self.oft_block_size}." + ) + if not (self.r != 0) ^ (self.oft_block_size != 0): + raise ValueError( + f"You can only specify either r ({self.r}) or oft_block_size ({self.oft_block_size}), but not both simultaneously, because r x oft_block_size == in_features." + ) + + @classmethod + def check_kwargs(cls, **kwargs): + r""" + Check if the kwargs are valid for the configuration. 
+ + Args: + kwargs (additional keyword arguments, *optional*): + Additional keyword arguments passed along to the child class initialization. + """ + if "oft_block_size" not in kwargs: + raise ValueError( + "OFT has been updated since PEFT 0.14.0. Your trained adapter weights are incompatible " + "with the latest version of OFT. Please retrain your adapter weights with newer PEFT versions. " + "Alternatively, downgrade PEFT to version 0.13.0 to use the old adapter weights." + ) + return super().check_kwargs(**kwargs) diff --git a/peft/src/peft/tuners/oft/eetq.py b/peft/src/peft/tuners/oft/eetq.py new file mode 100644 index 0000000000000000000000000000000000000000..2d6538165a4f040617e68ff1bceca494f112c3e5 --- /dev/null +++ b/peft/src/peft/tuners/oft/eetq.py @@ -0,0 +1,116 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from typing import Any, Optional

import torch

from peft.import_utils import is_eetq_available
from peft.tuners.oft.layer import OFTLayer
from peft.tuners.tuners_utils import BaseTunerLayer


if is_eetq_available():
    from eetq import EetqLinear

    class EetqOFTLinear(torch.nn.Module, OFTLayer):
        """OFT adapter wrapped around an EETQ-quantized linear layer.

        The adapter transforms the *input* with the per-adapter ``oft_R`` module and then
        feeds it to the unchanged quantized layer. Merging is not supported.
        """

        def __init__(
            self,
            base_layer,
            adapter_name,
            r: int = 0,
            oft_block_size: int = 0,
            module_dropout: float = 0.0,
            init_weights: bool = True,
            coft: bool = False,
            eps: float = 6e-5,
            block_share: bool = False,
            use_cayley_neumann: bool = False,
            num_cayley_neumann_terms: int = 5,
            fan_in_fan_out: bool = False,
            **kwargs,
        ):
            """Register ``adapter_name`` on top of ``base_layer``.

            All OFT hyper-parameters are forwarded verbatim to ``update_layer``;
            extra ``kwargs`` are accepted and ignored.
            """
            super().__init__()
            OFTLayer.__init__(self, base_layer)

            # self.base_layer and self.quant_linear_module are the same; we need the former for consistency and the latter
            # for backwards compatibility
            self.quant_linear_module = base_layer

            self._active_adapter = adapter_name
            self.update_layer(
                adapter_name,
                r,
                oft_block_size=oft_block_size,
                module_dropout=module_dropout,
                init_weights=init_weights,
                coft=coft,
                eps=eps,
                block_share=block_share,
                fan_in_fan_out=fan_in_fan_out,
                use_cayley_neumann=use_cayley_neumann,
                num_cayley_neumann_terms=num_cayley_neumann_terms,
            )

        def forward(self, x: torch.Tensor):
            """Apply every active OFT adapter to ``x``, then run the quantized layer.

            With adapters disabled the quantized layer is called on ``x`` directly.
            Outside autocast the result is cast back to the dtype ``x`` had on entry.
            """
            if self.disable_adapters:
                return self.quant_linear_module(x)

            # Decide once, before the loop: these used to be bound only inside the loop body, so a
            # forward pass in which no active adapter has an ``oft_R`` entry failed on an unbound
            # local, and with several adapters the "expected" dtype was overwritten by an
            # intermediate one.
            requires_conversion = not torch.is_autocast_enabled()
            expected_dtype = x.dtype

            for active_adapter in self.active_adapters:
                if active_adapter not in self.oft_R.keys():
                    continue
                oft_R = self.oft_R[active_adapter]

                if requires_conversion:
                    x = self._cast_input_dtype(x, oft_R.weight.dtype)

                x = oft_R(x)

            result = self.quant_linear_module(x)
            if requires_conversion:
                result = result.to(expected_dtype)
            return result

        def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
            """Always raises ``AttributeError``: merging is not supported for EETQ layers."""
            raise AttributeError("Merging OFT layers is not supported for Eetq layers.")

        def unmerge(self) -> None:
            """Always raises ``AttributeError``: unmerging is not supported for EETQ layers."""
            raise AttributeError("Unmerging OFT layers is not supported for Eetq layers.")

        def __repr__(self) -> str:
            rep = super().__repr__()
            return "oft." + rep


def dispatch_eetq(
    target: torch.nn.Module,
    adapter_name: str,
    **kwargs: Any,
) -> Optional[torch.nn.Module]:
    """Return an ``EetqOFTLinear`` wrapping ``target`` if its base layer is an ``EetqLinear``.

    Returns ``None`` when EETQ is not installed or the base layer has a different type.
    On a match, ``target.weight`` (and ``target.bias`` if ``target`` has that attribute)
    are re-pointed at the base layer's tensors.
    """
    new_module = None

    if isinstance(target, BaseTunerLayer):
        target_base_layer = target.get_base_layer()
    else:
        target_base_layer = target

    # `EetqLinear` only exists when eetq is importable; the short-circuit keeps this safe.
    if is_eetq_available() and isinstance(target_base_layer, EetqLinear):
        new_module = EetqOFTLinear(target, adapter_name, **kwargs)
        target.weight = target_base_layer.weight

        if hasattr(target, "bias"):
            target.bias = target_base_layer.bias

    return new_module


# ---------------------------------------------------------------------------
# Patch metadata and license header of the next file in this diff, kept verbatim:
# diff --git a/peft/src/peft/tuners/oft/gptq.py b/peft/src/peft/tuners/oft/gptq.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..e64c1319522a94f366149ca4cb028cfc918bd124
# --- /dev/null
# +++ b/peft/src/peft/tuners/oft/gptq.py
# @@ -0,0 +1,118 @@
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Optional

import torch

from peft.import_utils import is_gptqmodel_available
from peft.tuners.oft.layer import OFTLayer
from peft.tuners.tuners_utils import BaseTunerLayer
from peft.utils import get_auto_gptq_quant_linear


class GPTQOFTLinear(torch.nn.Module, OFTLayer):
    """OFT adapter wrapped around a GPTQ-quantized linear layer.

    The adapter transforms the *input* with the per-adapter ``oft_R`` module and then
    feeds it to the unchanged quantized layer. This class defines no merge/unmerge.
    """

    def __init__(
        self,
        base_layer,
        adapter_name: str,
        r: int = 8,
        oft_block_size: int = 0,
        module_dropout: float = 0.0,
        coft: bool = False,
        eps: float = 6e-5,
        block_share: bool = False,
        use_cayley_neumann: bool = False,
        num_cayley_neumann_terms: int = 5,
        fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        init_weights: bool = True,
        **kwargs,
    ):
        """Register ``adapter_name`` on top of ``base_layer``.

        The OFT hyper-parameters are forwarded to ``update_layer``. ``fan_in_fan_out``
        and extra ``kwargs`` are accepted but not used here.
        """
        super().__init__()
        OFTLayer.__init__(self, base_layer)

        # self.base_layer and self.quant_linear_module are the same; we need the former for consistency and the latter
        # for backwards compatibility
        self.quant_linear_module = base_layer
        self._active_adapter = adapter_name
        self.update_layer(
            adapter_name,
            r,
            oft_block_size=oft_block_size,
            module_dropout=module_dropout,
            coft=coft,
            eps=eps,
            block_share=block_share,
            init_weights=init_weights,
            use_cayley_neumann=use_cayley_neumann,
            num_cayley_neumann_terms=num_cayley_neumann_terms,
        )

    def forward(self, x: torch.Tensor):
        """Apply every active OFT adapter to ``x``, then run the quantized layer.

        With adapters disabled the quantized layer is called on ``x`` directly.
        Outside autocast the result is cast back to the dtype ``x`` had on entry.
        """
        # note: logic differs from default Linear because merging is not supported
        # The quantized layer is evaluated exactly once per call. An earlier version ran it on the
        # raw input first and threw that result away, doubling the cost of every forward pass.
        if self.disable_adapters:
            return self.quant_linear_module(x)

        # Decide once, before the loop: these used to be bound only inside the loop body, so a
        # forward pass in which no active adapter has an ``oft_R`` entry failed on an unbound
        # local, and with several adapters the "expected" dtype was overwritten by an
        # intermediate one.
        requires_conversion = not torch.is_autocast_enabled()
        expected_dtype = x.dtype

        for active_adapter in self.active_adapters:
            if active_adapter not in self.oft_R.keys():
                continue

            oft_R = self.oft_R[active_adapter]

            if requires_conversion:
                x = self._cast_input_dtype(x, oft_R.weight.dtype)

            x = oft_R(x)

        result = self.quant_linear_module(x)
        if requires_conversion:
            result = result.to(expected_dtype)
        return result

    def __repr__(self) -> str:
        rep = super().__repr__()
        return "oft." + rep


def dispatch_gptq(
    target: torch.nn.Module,
    adapter_name: str,
    **kwargs: Any,
) -> Optional[torch.nn.Module]:
    """Return a ``GPTQOFTLinear`` wrapping ``target`` if its base layer is a GPTQ linear.

    When ``gptqmodel`` is available the base layer is matched against its
    ``BaseQuantLinear``; otherwise against the class returned by
    ``get_auto_gptq_quant_linear`` for ``kwargs["gptq_quantization_config"]``.
    Returns ``None`` when nothing matches. On a match, ``target.qweight`` is
    re-pointed at the base layer's ``qweight``.
    """
    new_module = None

    if isinstance(target, BaseTunerLayer):
        target_base_layer = target.get_base_layer()
    else:
        target_base_layer = target

    cfg = kwargs.get("gptq_quantization_config", None)

    if is_gptqmodel_available():
        from gptqmodel.nn_modules.qlinear import BaseQuantLinear

        if isinstance(target_base_layer, BaseQuantLinear):
            new_module = GPTQOFTLinear(target, adapter_name, **kwargs)
            target.qweight = target_base_layer.qweight
    else:
        quant_linear = get_auto_gptq_quant_linear(cfg)

        if quant_linear is not None and isinstance(target_base_layer, quant_linear):
            new_module = GPTQOFTLinear(target, adapter_name, **kwargs)
            target.qweight = target_base_layer.qweight

    return new_module


# ---------------------------------------------------------------------------
# Patch metadata and license header of the next file in this diff, kept verbatim:
# diff --git a/peft/src/peft/tuners/oft/hqq.py b/peft/src/peft/tuners/oft/hqq.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..5f83dd11b0bb4f4c349dd1beb95733b5e617c158
# --- /dev/null
# +++ b/peft/src/peft/tuners/oft/hqq.py
# @@ -0,0 +1,186 @@
# Copyright 2024-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from __future__ import annotations + +import copy +import warnings +from typing import Optional + +import torch + +from peft.import_utils import is_hqq_available +from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge + +from .layer import OFTLayer + + +if is_hqq_available(): + from hqq.core.quantize import HQQLinear + + class HqqOFTLinear(torch.nn.Module, OFTLayer): + # Lora implemented in a dense layer + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + r: int = 8, + oft_block_size: int = 0, + module_dropout: float = 0.0, + init_weights: bool = True, + coft: bool = False, + eps: float = 6e-5, + block_share: bool = False, + use_cayley_neumann: bool = False, + num_cayley_neumann_terms: int = 5, + **kwargs, + ) -> None: + super().__init__() + OFTLayer.__init__(self, base_layer) + self.fan_in_fan_out = False + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + oft_block_size=oft_block_size, + module_dropout=module_dropout, + init_weights=init_weights, + coft=coft, + eps=eps, + block_share=block_share, + use_cayley_neumann=use_cayley_neumann, + num_cayley_neumann_terms=num_cayley_neumann_terms, + ) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. + Defaults to `None`. 
+ """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter not in self.lora_A.keys(): + continue + + layer = self.get_base_layer() + quant_config = {**copy.deepcopy(layer.quant_config), "offload_meta": layer.offload_meta} + + output = layer.dequantize() + oft_data = self.get_delta_weight(active_adapter) + + output = torch.transpose(output, 0, 1) + w_data = torch.mm(oft_data, output.to(oft_data.dtype)) + w_data = torch.transpose(w_data, 0, 1) + w_data = output.to(oft_data.dtype).to(oft_data.device) + + if safe_merge and not torch.isfinite(w_data).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + new_hqq_layer = HQQLinear(None, quant_config, compute_dtype=layer.compute_dtype, device=layer.device) + quant_config.pop("offload_meta", None) + new_hqq_layer.quantize(w_data, **quant_config) + self.base_layer = new_hqq_layer + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. 
Nothing to do.") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter not in self.oft_R.keys(): + continue + + layer = self.get_base_layer() + quant_config = {**copy.deepcopy(layer.quant_config), "offload_meta": layer.offload_meta} + output = layer.dequantize() + + oft_data = self.get_delta_weight(active_adapter) + + output = torch.transpose(output, 0, 1) + w_data = torch.mm(oft_data.t(), output.to(oft_data.dtype)) + w_data = torch.transpose(w_data, 0, 1) + w_data = w_data.to(oft_data.dtype).to(oft_data.device) + + new_hqq_layer = HQQLinear(None, quant_config, compute_dtype=layer.compute_dtype, device=layer.device) + quant_config.pop("offload_meta", None) + new_hqq_layer.quantize(w_data, **quant_config) + self.base_layer = new_hqq_layer + + def get_delta_weight(self, adapter): + return self.oft_R[adapter].get_weight() + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + self._check_forward_args(x, *args, **kwargs) + adapter_names = kwargs.pop("adapter_names", None) + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + for active_adapter in self.active_adapters: + if active_adapter not in self.oft_R.keys(): + continue + oft_R = self.oft_R[active_adapter] + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = x.dtype + x = self._cast_input_dtype(x, oft_R.weight.dtype) + + x = oft_R(x) + + result = self.base_layer(x, *args, **kwargs) + if requires_conversion: + result = result.to(expected_dtype) + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "oft." 
+ rep + + +def dispatch_hqq(target: torch.nn.Module, adapter_name: str, **kwargs): + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if is_hqq_available() and isinstance(target_base_layer, HQQLinear): + new_module = HqqOFTLinear(target_base_layer, adapter_name, **kwargs) + + return new_module diff --git a/peft/src/peft/tuners/oft/inc.py b/peft/src/peft/tuners/oft/inc.py new file mode 100644 index 0000000000000000000000000000000000000000..6ed855bc7dc65af10d77a2f486abb8fc27c12dd5 --- /dev/null +++ b/peft/src/peft/tuners/oft/inc.py @@ -0,0 +1,78 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# NOTE: PEFT tests related to INC are handled under Optimum-Habana repository: +# - LLMs: https://github.com/huggingface/optimum-habana/blob/main/tests/test_peft_inference.py +# - Diffusers: https://github.com/huggingface/optimum-habana/blob/main/tests/test_diffusers.py + +from typing import Optional + +import torch + +from peft.import_utils import is_inc_available +from peft.tuners.tuners_utils import BaseTunerLayer + +from .layer import Linear + + +if is_inc_available(): + + class IncOFTLinear(Linear): + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + **kwargs, + ): + super().__init__(base_layer, adapter_name, **kwargs) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. + Defaults to `None`. + """ + raise NotImplementedError("Merging OFT with INC layers is not yet implemented") + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. 
+ """ + raise NotImplementedError("Unmerging OFT from INC layers is not yet implemented") + + +def dispatch_inc(target: torch.nn.Module, adapter_name: str, **kwargs): + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if is_inc_available(): + from neural_compressor.torch.algorithms.fp8_quant._quant_common.helper_modules import ( + PatchedLinear, + ) + + if isinstance(target_base_layer, PatchedLinear): + new_module = IncOFTLinear(target, adapter_name, **kwargs) + + return new_module diff --git a/peft/src/peft/tuners/oft/layer.py b/peft/src/peft/tuners/oft/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..6b14d015ae9b98f5baf34f5d4ba343a4900c32b4 --- /dev/null +++ b/peft/src/peft/tuners/oft/layer.py @@ -0,0 +1,938 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import warnings +from typing import Any, Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge + +from .config import OFTConfig + + +class MultiplicativeDropoutLayer(nn.Module): + """ + Implements the multiplicative dropout layer for OFT. + """ + + def __init__(self, p=0.0): + """ + Initializes the multiplicative dropout layer. 
class OFTRotationModule(nn.Module):
    """
    Holds the trainable OFT parameters of one adapter and applies the resulting block-wise rotation.

    `weight` has shape `(r, n_elements)`. Each row stores the strictly upper-triangular entries of one
    `block_size x block_size` skew-symmetric matrix, which `_cayley_batch` turns into a rotation block.

    Args:
        r (`int`): Number of stored blocks (rows of `weight`); callers pass 1 when `block_share` is set.
        n_elements (`int`): Number of strictly upper-triangular entries per block.
        block_size (`int`): Side length of each square block.
        in_features (`int`): Size of the rotated (last) input dimension.
        coft (`bool`): If True, `weight` is projected with `_project_batch` before every use.
        eps (`float`): Radius passed to `_project_batch`; only used when `coft` is True.
        block_share (`bool`): If True, the single stored block is reused for every block position.
        kernel_size (`int` or `tuple[int, int]`): Patch size used to unfold/fold 4D (Conv2d) inputs.
        use_cayley_neumann (`bool`): Use the truncated Neumann series instead of an exact linear solve.
        num_cayley_neumann_terms (`int`): Number of series terms (including the identity) when the series is used.
    """

    def __init__(
        self,
        r,
        n_elements,
        block_size,
        in_features,
        coft=False,
        eps=6e-5,
        block_share=False,
        kernel_size=(0, 0),
        use_cayley_neumann=True,
        num_cayley_neumann_terms=5,
    ):
        super().__init__()
        self.r = r
        self.n_elements = n_elements
        self.block_size = block_size
        self.in_features = in_features
        # Left uninitialized here; the owning layer is expected to initialize it after construction.
        self.weight = nn.Parameter(torch.empty(r, n_elements))
        self.coft = coft
        self.eps = eps
        self.block_share = block_share
        # Conv2d specific parameters
        self.kernel_size = kernel_size
        self.use_cayley_neumann = use_cayley_neumann
        self.num_cayley_neumann_terms = num_cayley_neumann_terms
        # Indices of the upper triangle (excluding the diagonal). These are plain tensor attributes, not
        # registered buffers, so they do not appear in the state dict and are not moved by `.to(...)`.
        self.rows, self.cols = torch.triu_indices(block_size, block_size, 1)

    def _kernel_hw(self):
        """Return `kernel_size` as a `(height, width)` pair, accepting a single int for square kernels."""
        if isinstance(self.kernel_size, int):
            return self.kernel_size, self.kernel_size
        return self.kernel_size

    def _pytorch_skew_symmetric(self, vec, block_size):
        """Expand `(b, n_elements)` vectors into `(b, block_size, block_size)` skew-symmetric matrices."""
        batch_size = vec.shape[0]
        matrix = torch.zeros(batch_size, block_size, block_size, device=vec.device, dtype=vec.dtype)

        matrix[:, self.rows, self.cols] = vec
        return matrix - matrix.transpose(-2, -1)

    def _pytorch_skew_symmetric_inv(self, matrix, block_size):
        """
        Extract the strictly upper-triangular entries of each matrix, i.e. the inverse of
        `_pytorch_skew_symmetric`. `block_size` is unused and kept only for a symmetric signature.
        """
        return matrix[:, self.rows, self.cols]

    def _cayley_batch(
        self, Q: torch.Tensor, block_size: int, use_cayley_neumann: bool = True, num_neumann_terms: int = 5
    ) -> torch.Tensor:
        """
        Apply the Cayley parametrization to a batch of skew-symmetric parameter vectors.

        Args:
            Q: Tensor of shape `(b, n_elements)` holding the upper-triangular entries of each block.
            block_size: Side length of the square blocks.
            use_cayley_neumann: If True, use the truncated series `I + 2Q + 2Q^2 + ...`; otherwise solve exactly.
            num_neumann_terms: Number of series terms including the identity; values <= 1 return the identity.

        Returns:
            Tensor of shape `(b, block_size, block_size)` in the dtype of `Q`.
        """
        b, _ = Q.shape
        previous_dtype = Q.dtype

        Q_skew = self._pytorch_skew_symmetric(Q, block_size)

        if use_cayley_neumann:
            R = torch.eye(block_size, device=Q.device, dtype=Q.dtype).repeat(b, 1, 1)
            if num_neumann_terms > 1:
                R.add_(Q_skew, alpha=2.0)
            if num_neumann_terms > 2:
                Q_power = torch.bmm(Q_skew, Q_skew)
                R.add_(Q_power, alpha=2.0)
                for _ in range(3, num_neumann_terms):
                    Q_power = torch.bmm(Q_power, Q_skew)
                    R.add_(Q_power, alpha=2.0)
        else:
            # The identity is created in the default dtype, so reduced-precision inputs are promoted for the solve.
            id_mat = (
                torch.eye(Q_skew.shape[-1], device=Q_skew.device)
                .unsqueeze(0)
                .expand(b, Q_skew.shape[-1], Q_skew.shape[-1])
            )
            # NOTE(review): with left=False this solves X (I + Q) = (I - Q), i.e. X = (I - Q)(I + Q)^-1, whereas
            # the series above expands (I + Q)(I - Q)^-1 -- the transposed rotation. Confirm this is intended.
            R = torch.linalg.solve(id_mat + Q_skew, id_mat - Q_skew, left=False)

        return R.to(previous_dtype)

    # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L52
    def _project_batch(self, Q, eps=1e-5):
        """
        Project each skew-symmetric block onto the Frobenius ball of radius `eps / sqrt(num_blocks)` around zero.

        Blocks already inside the ball are returned unchanged; the others are rescaled onto its surface.
        Returns the projected blocks in the same `(b, n_elements)` vector form as `Q`.
        """
        oft_R = self._pytorch_skew_symmetric(Q, self.block_size)
        # scaling factor for each of the smaller block matrices
        eps = eps / (oft_R.shape[0] ** 0.5)
        norm = torch.norm(oft_R, dim=(1, 2), keepdim=True)
        # A zero-norm block yields NaN in the rescaled candidate, but `where` keeps the original for it.
        out = torch.where(norm <= eps, oft_R, eps * (oft_R / norm))

        return self._pytorch_skew_symmetric_inv(out, self.block_size)

    # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L155
    def _block_diagonal(self, oft_R: torch.Tensor, rank: int) -> torch.Tensor:
        """Assemble `rank` blocks into one block-diagonal matrix, reusing block 0 when only one is stored."""
        if oft_R.shape[0] == 1:
            # block share
            blocks = [oft_R[0, ...]] * rank
        else:
            blocks = [oft_R[i, ...] for i in range(rank)]

        return torch.block_diag(*blocks)

    def _unfold(self, x):
        """
        Extract all `kernel_size` patches of a `[B, C, H, W]` tensor with stride 1 and no padding.

        Returns a 2D tensor of shape `[B * H_out * W_out, C * K_H * K_W]`.
        """
        batch_size, _, in_height, in_width = x.shape
        kernel_height, kernel_width = self._kernel_hw()

        # output dimensions for stride=1, padding=0
        out_height = in_height - kernel_height + 1
        out_width = in_width - kernel_width + 1

        # [B, C, H, W] -> [B, C, H_out, W_out, K_H, K_W] -> [B, H_out, W_out, C, K_H, K_W]
        x_unfolded = x.unfold(2, kernel_height, 1).unfold(3, kernel_width, 1)
        x_unfolded = x_unfolded.permute(0, 2, 3, 1, 4, 5).contiguous()
        return x_unfolded.view(batch_size * out_height * out_width, -1)

    def _fold(self, x_unfolded, orig_shape):
        """
        Fold the patches produced by `_unfold` back into a tensor of shape `orig_shape` (`[B, C, H, W]`).
        """
        batch_size, in_channels, in_height, in_width = orig_shape
        kernel_height, kernel_width = self._kernel_hw()

        # With stride=1, padding=0:
        out_height = in_height - kernel_height + 1
        out_width = in_width - kernel_width + 1

        # Reshape: [B*H_out*W_out, C*K_H*K_W] -> [B, H_out, W_out, C, K_H, K_W]
        x_reshaped = x_unfolded.view(batch_size, out_height, out_width, in_channels, kernel_height, kernel_width)

        # Permute to: [B, C, H_out, W_out, K_H, K_W]
        x_reshaped = x_reshaped.permute(0, 3, 1, 2, 4, 5).contiguous()

        # NOTE(review): the memory layout here is [B, C, H_out, W_out, K_H, K_W], while F.fold documents its input
        # as (N, C * K_H * K_W, L) and sums overlapping patches; the view below reinterprets memory rather than
        # permuting. This is exact for 1x1 kernels -- verify the behaviour for larger kernels.
        return F.fold(
            x_reshaped.view(batch_size, in_channels * kernel_height * kernel_width, out_height * out_width),
            output_size=(in_height, in_width),
            kernel_size=(kernel_height, kernel_width),
            stride=(1, 1),
        )

    def forward(self, x):
        """
        Rotate the last dimension of `x` block by block.

        4D inputs are first unfolded into patches with `_unfold`, rotated, and folded back with `_fold`. The
        computation runs in the dtype of `weight`; the result is cast back to the dtype of `x`. When `coft` is
        set, `weight` is overwritten in place with its projection before it is used.
        """
        required_dtype = x.dtype
        if required_dtype != self.weight.dtype:
            x = x.to(self.weight.dtype)

        orig_shape = x.shape

        if self.coft:
            with torch.no_grad():
                self.weight.copy_(self._project_batch(self.weight, eps=self.eps))

        orth_rotate = self._cayley_batch(
            self.weight, self.block_size, self.use_cayley_neumann, self.num_cayley_neumann_terms
        )

        # Unfold the input for Conv2d layer
        is_conv_input = len(orig_shape) == 4
        if is_conv_input:
            x = self._unfold(x)

        folded_shape = x.shape
        rank = self.in_features // self.block_size if self.block_share else self.r
        x_blocks = x.reshape(*x.shape[:-1], rank, self.block_size)

        if self.block_share:
            # Broadcast the single shared block over all block positions without copying it.
            orth_rotate = orth_rotate.expand(rank, -1, -1)
        x_rotated = torch.einsum("...rk,rkc->...rc", x_blocks, orth_rotate).reshape(folded_shape)

        if is_conv_input:
            x_rotated = self._fold(x_rotated, orig_shape)

        return x_rotated.to(required_dtype)

    def get_weight(self):
        """
        Return the full block-diagonal rotation matrix built from the current parameters.

        When `coft` is set, `weight` is first projected and overwritten in place, and the returned matrix is
        computed from the projected values without gradient tracking.
        """
        weight = self.weight

        if self.coft:
            with torch.no_grad():
                weight = self._project_batch(weight, eps=self.eps)
                self.weight.copy_(weight)

        orth_rotate = self._cayley_batch(
            weight, self.block_size, self.use_cayley_neumann, self.num_cayley_neumann_terms
        )

        rank = self.r if not self.block_share else self.in_features // self.block_size
        return self._block_diagonal(orth_rotate, rank)
+ + Parameters: + base_layer: the pretrained model layer + """ + self.base_layer = base_layer + self.oft_R = nn.ModuleDict({}) + self.oft_block_size = {} + self.r = {} + self.oft_block_size = {} + self.oft_dropout = nn.ModuleDict({}) + # Mark the weight as unmerged + self._disable_adapters = False + self.merged_adapters = [] + # flag to enable/disable casting of input to weight dtype during forward call + self.cast_input_dtype_enabled = True + self.kwargs = kwargs + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + elif isinstance(base_layer, nn.Conv2d): + in_features, out_features = base_layer.in_channels, base_layer.out_channels + elif hasattr(base_layer, "infeatures") and hasattr(base_layer, "outfeatures"): + # QuantLinear + in_features, out_features = base_layer.infeatures, base_layer.outfeatures + elif hasattr(base_layer, "input_size") and hasattr(base_layer, "output_size"): + # Megatron ColumnParallelLinear,RowParallelLinear + in_features, out_features = base_layer.input_size, base_layer.output_size + elif hasattr(base_layer, "codebooks") and base_layer.__class__.__name__ == "QuantizedLinear": + # AQLM QuantLinear + in_features, out_features = base_layer.in_features, base_layer.out_features + elif hasattr(base_layer, "w_bit") and base_layer.__class__.__name__ == "WQLinear_GEMM": + # Awq layers + in_features, out_features = base_layer.in_features, base_layer.out_features + elif base_layer.__class__.__name__ == "EetqLinear": + # Eetq layers + in_features, out_features = base_layer.in_features, base_layer.out_features + elif hasattr(base_layer, "W_q") and base_layer.__class__.__name__ == "HQQLinear": + # HQQ layers + in_features, out_features = base_layer.in_features, base_layer.out_features + else: + # possibly support user provided custom layer types using dynamic dispatch + if hasattr(base_layer, "in_features") and hasattr(base_layer, "out_features"): + 
def set_scale(self, adapter, scale):
    """
    Warn that scaling is not supported for OFT; the requested `scale` is ignored.

    Args:
        adapter (`str`): Name of the adapter whose scale was requested to change.
        scale: Ignored. OFT has no scaling factor.

    Adapters that are not present on this layer are skipped silently. Membership is checked against
    `self.oft_R`, the same container `scale_layer` and `unscale_layer` consult, because this layer's
    `__init__` never creates a `scaling` attribute.
    """
    if adapter not in self.oft_R:
        # Ignore the case where the adapter is not in the layer
        return

    warnings.warn("Scaling operation for OFT not supported! Automatically set scale to 1.")
+ block_share (`bool`): Whether to share the OFT parameters between blocks or not. + init_weights (`bool`): Whether to initialize weights. + """ + # Initialize the MultiplicativeDropoutLayer for module_dropout > 0.0. + if module_dropout > 0.0: + oft_dropout_layer = MultiplicativeDropoutLayer(p=module_dropout) + else: + oft_dropout_layer = nn.Identity() + self.oft_dropout.update(nn.ModuleDict({adapter_name: oft_dropout_layer})) + + if r == 0 and oft_block_size != 0: + if self.in_features % oft_block_size != 0 or oft_block_size > self.in_features: + old_oft_block_size = oft_block_size + oft_block_size = self.adjust_oft_parameters(self.in_features, oft_block_size) + warnings.warn( + f"Invalid `oft_block_size` ({old_oft_block_size})! Adjusted `oft_block_size` to ({oft_block_size})." + ) + r = int(self.in_features // oft_block_size) + elif r != 0 and oft_block_size == 0: + if self.in_features % r != 0 or r > self.in_features: + old_r = r + r = self.adjust_oft_parameters(self.in_features, r) + warnings.warn(f"Invalid `r` ({old_r})! 
def adjust_oft_parameters(self, in_features, params):
    """
    Snap `params` to a nearby value that divides `in_features` evenly.

    Values greater than or equal to `in_features` map to `in_features`. Otherwise the closest divisor at or
    above `params` and the closest one at or below it are compared, and the nearer one is returned; ties go
    to the lower candidate.
    """
    if params >= in_features:
        return in_features

    # `in_features` divides itself, so the upward search always finds a candidate.
    upper = next(c for c in range(params, in_features + 1) if in_features % c == 0)
    # The downward search stops above 1; when nothing matches it settles on 1 (or on `params` if already <= 1).
    lower = next((c for c in range(params, 1, -1) if in_features % c == 0), min(params, 1))

    return lower if (params - lower) <= (upper - params) else upper
def unmerge(self) -> None:
    """
    Undo every merge recorded in `merged_adapters`, most recent first.

    Each adapter's rotation is removed by applying the transpose of its rotation matrix to the base weight.
    Emits a warning and returns when nothing is merged.
    """
    if not self.merged:
        warnings.warn("Already unmerged. Nothing to do.")
        return

    base = self.get_base_layer()
    weight_dtype = base.weight.dtype
    while self.merged_adapters:
        name = self.merged_adapters.pop()
        if name not in self.oft_R:
            continue

        rotation = self.get_delta_weight(name)
        # Work on the (in, out) view of the weight, matching the orientation used when merging.
        weight_t = base.weight.data.t()
        restored = (rotation.t() @ weight_t.to(rotation.dtype)).t()
        base.weight.data = restored.to(weight_dtype)
def __init__(
    self,
    base_layer: nn.Module,
    adapter_name: str,
    r: int = 8,
    oft_block_size: int = 0,
    fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
    module_dropout: float = 0.0,
    coft: bool = False,
    eps: float = 6e-5,
    block_share: bool = False,
    init_weights: Union[bool, str] = True,
    use_cayley_neumann: bool = False,
    num_cayley_neumann_terms: int = 5,
    **kwargs,
) -> None:
    """
    Wrap `base_layer` and register a first OFT adapter named `adapter_name` on it.

    All adapter hyper-parameters are handed to `update_layer`; extra `**kwargs` are accepted but not used here.
    """
    super().__init__()
    OFTLayer.__init__(self, base_layer)
    self.fan_in_fan_out = fan_in_fan_out
    self._active_adapter = adapter_name

    # Create adapter and set it active
    adapter_options = {
        "oft_block_size": oft_block_size,
        "module_dropout": module_dropout,
        "coft": coft,
        "eps": eps,
        "block_share": block_share,
        "init_weights": init_weights,
        "use_cayley_neumann": use_cayley_neumann,
        "num_cayley_neumann_terms": num_cayley_neumann_terms,
    }
    self.update_layer(adapter_name, r, **adapter_options)
+ if module_dropout > 0.0: + oft_dropout_layer = MultiplicativeDropoutLayer(p=module_dropout) + else: + oft_dropout_layer = nn.Identity() + self.oft_dropout.update(nn.ModuleDict({adapter_name: oft_dropout_layer})) + + # layer information from the base layer + base_layer = self.get_base_layer() + if base_layer.dilation[0] > 1: + raise ValueError("Conv2d with dilation > 1 is not supported by OFT.") + + conv_filter_dim = self.in_features * base_layer.kernel_size[0] * base_layer.kernel_size[0] + + if r == 0 and oft_block_size != 0: + if conv_filter_dim % oft_block_size != 0 or oft_block_size > conv_filter_dim: + old_oft_block_size = oft_block_size + oft_block_size = self.adjust_oft_parameters(conv_filter_dim, oft_block_size) + warnings.warn( + f"Invalid `oft_block_size` ({old_oft_block_size})! Adjusted `oft_block_size` to ({oft_block_size})." + ) + r = int(conv_filter_dim // oft_block_size) + elif r != 0 and oft_block_size == 0: + if conv_filter_dim % r != 0 or r > conv_filter_dim: + old_r = r + r = self.adjust_oft_parameters(conv_filter_dim, r) + warnings.warn(f"Invalid `r` ({old_r})! 
def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
    """
    Merge the active adapter weights into the base weights

    Args:
        safe_merge (`bool`, *optional*):
            If True, the merged weights are checked for non-finite values before they are written back to the
            base layer, and a `ValueError` is raised if any are found. Defaults to `False`.
        adapter_names (`List[str]`, *optional*):
            The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
            to `None`.

    Raises:
        ValueError: If `safe_merge` is True and the merged weights contain NaN or infinite values.
    """
    adapter_names = check_adapters_to_merge(self, adapter_names)
    if not adapter_names:
        # no adapter to merge
        return

    for active_adapter in adapter_names:
        if active_adapter not in self.oft_R.keys():
            continue

        base_layer = self.get_base_layer()
        orig_dtype = base_layer.weight.dtype
        # NOTE(review): kernel_size[0] is used for both spatial dimensions, so square kernels are assumed.
        kernel = base_layer.kernel_size[0]
        oft_mat = self.get_delta_weight(active_adapter)

        # Flatten to (out, in * k * k), rotate the input dimension, and restore the 4D kernel shape.
        merged = base_layer.weight.data.clone()
        merged = merged.view(self.out_features, self.in_features * kernel * kernel)
        merged = torch.transpose(merged, 0, 1)
        merged = torch.mm(oft_mat, merged.to(oft_mat.dtype))
        merged = torch.transpose(merged, 0, 1)
        merged = merged.view(self.out_features, self.in_features, kernel, kernel)

        # Validate before touching the base layer so a broken adapter leaves the weights untouched.
        if safe_merge and not torch.isfinite(merged).all():
            raise ValueError(
                f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
            )

        base_layer.weight.data = merged.contiguous().to(orig_dtype)
        self.merged_adapters.append(active_adapter)
def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
    """
    Run the base layer on `x`, rotating the input with every active OFT adapter first.

    When adapters are disabled, any merged adapters are unmerged and the base layer runs on the raw input.
    When adapters are merged, the base layer already contains them and also runs on the raw input.
    The result is cast back to the dtype of the incoming `x`.
    """
    input_dtype = x.dtype
    disabled = self.disable_adapters

    if disabled and self.merged:
        self.unmerge()
    elif not disabled and not self.merged:
        for name in self.active_adapters:
            if name not in self.oft_R:
                continue
            rotation = self.oft_R[name]
            x = rotation(self._cast_input_dtype(x, rotation.weight.dtype))
        # The base layer expects the caller's dtype, not the adapter's.
        x = x.to(input_dtype)

    return self.base_layer(x, *args, **kwargs).to(input_dtype)
def dispatch_default(
    target: torch.nn.Module,
    adapter_name: str,
    oft_config: OFTConfig,
    **kwargs,
) -> Optional[torch.nn.Module]:
    """
    Create the plain PyTorch OFT wrapper for `target`, if its base layer is a `Conv2d` or `Linear`.

    For `Linear` targets, a truthy `fan_in_fan_out` is reset to False both in `kwargs` and on `oft_config`,
    with a warning. Returns `None` for any other base layer type.
    """
    base = target.get_base_layer() if isinstance(target, BaseTunerLayer) else target

    if isinstance(base, torch.nn.Conv2d):
        return Conv2d(target, adapter_name, **kwargs)

    if not isinstance(base, torch.nn.Linear):
        return None

    if kwargs["fan_in_fan_out"]:
        warnings.warn(
            "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
            "Setting fan_in_fan_out to False."
        )
        kwargs["fan_in_fan_out"] = oft_config.fan_in_fan_out = False
    return Linear(target, adapter_name, **kwargs)
class OFTModel(BaseTuner):
    """
    Creates Orthogonal Finetuning model from a pretrained model. The method is described in
    https://huggingface.co/papers/2306.07280

    Args:
        model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached.
        config ([`OFTConfig`]): The configuration of the OFT model.
        adapter_name (`str`): The name of the adapter, defaults to `"default"`.
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            Create empty adapter weights on meta device. Useful to speed up the loading process.

    Returns:
        `torch.nn.Module`: The OFT model.

    Example:
        ```py
        >>> from diffusers import StableDiffusionPipeline
        >>> from peft import OFTModel, OFTConfig

        >>> config_te = OFTConfig(
        ...     r=8,
        ...     target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"],
        ...     module_dropout=0.0,
        ...     init_weights=True,
        ... )
        >>> config_unet = OFTConfig(
        ...     r=8,
        ...     target_modules=[
        ...         "proj_in",
        ...         "proj_out",
        ...         "to_k",
        ...         "to_q",
        ...         "to_v",
        ...         "to_out.0",
        ...         "ff.net.0.proj",
        ...         "ff.net.2",
        ...     ],
        ...     module_dropout=0.0,
        ...     init_weights=True,
        ... )

        >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        >>> model.text_encoder = OFTModel(model.text_encoder, config_te, "default")
        >>> model.unet = OFTModel(model.unet, config_unet, "default")
        ```

    **Attributes**:
        - **model** ([`~torch.nn.Module`]) -- The model to be adapted.
        - **peft_config** ([`OFTConfig`]): The configuration of the OFT model.
    """

    prefix: str = "oft_"
    tuner_layer_cls = OFTLayer
    target_module_mapping = TRANSFORMERS_MODELS_TO_OFT_TARGET_MODULES_MAPPING

    def _create_and_replace(
        self,
        oft_config,
        adapter_name,
        target,
        target_name,
        parent,
        current_key,
        **optional_kwargs,
    ):
        """Wrap `target` in a new OFT layer, or register `adapter_name` on it if it already is an `OFTLayer`.

        Raises:
            ValueError: If `current_key` is `None`.
        """
        if current_key is None:
            raise ValueError("Current Key shouldn't be `None`")

        # Hyper-parameters that both a brand-new layer and `update_layer` receive.
        shared_fields = (
            "r",
            "oft_block_size",
            "module_dropout",
            "coft",
            "eps",
            "block_share",
            "use_cayley_neumann",
            "num_cayley_neumann_terms",
            "init_weights",
        )
        adapter_params = {field_name: getattr(oft_config, field_name) for field_name in shared_fields}

        # Extra arguments that only matter when a new module has to be built.
        layer_kwargs = dict(adapter_params)
        layer_kwargs["fan_in_fan_out"] = oft_config.fan_in_fan_out
        layer_kwargs["loaded_in_8bit"] = getattr(self.model, "is_loaded_in_8bit", False)
        layer_kwargs["loaded_in_4bit"] = getattr(self.model, "is_loaded_in_4bit", False)

        for method in ("gptq", "aqlm", "awq"):
            quant_cfg = get_quantization_config(self.model, method=method)
            if quant_cfg is not None:
                layer_kwargs[f"{method}_quantization_config"] = quant_cfg

        if isinstance(target, OFTLayer):
            # Already an OFT layer: just add the new adapter to it.
            target.update_layer(adapter_name, **adapter_params)
            return

        # Not an OFT layer yet: build a replacement module and swap it in.
        device_map = getattr(self.model, "hf_device_map", None)
        replacement = self._create_new_module(
            oft_config, adapter_name, target, device_map=device_map, **layer_kwargs
        )
        if adapter_name not in self.active_adapters:
            # adding an additional adapter: it is not automatically trainable
            replacement.requires_grad_(False)
        self._replace_module(parent, target_name, replacement, target)

    @staticmethod
    def _create_new_module(oft_config, adapter_name, target, **kwargs):
        """Return the OFT wrapper for `target` produced by the first dispatcher that accepts it.

        Raises:
            ValueError: If no dispatcher returns a module for `target`.
        """
        # The order matters: the first dispatcher that returns a module wins, so the default
        # (non-quantized) dispatcher has to come last.
        candidates = []

        # avoid eager bnb import
        if is_bnb_available():
            from .bnb import dispatch_bnb_8bit

            candidates.append(dispatch_bnb_8bit)

        if is_bnb_4bit_available():
            from .bnb import dispatch_bnb_4bit

            candidates.append(dispatch_bnb_4bit)

        candidates += [
            dispatch_eetq,
            dispatch_aqlm,
            dispatch_awq,
            dispatch_gptq,
            dispatch_hqq,
            dispatch_inc,
            dispatch_default,
        ]

        for dispatch in candidates:
            module = dispatch(target, adapter_name, oft_config=oft_config, **kwargs)
            if module is not None:
                return module

        # no module could be matched
        raise ValueError(
            f"Target module {target} is not supported. Currently, only the following modules are supported: "
            "`torch.nn.Linear`, `torch.nn.Conv2d`."
        )

    def _check_merge_allowed(self):
        """Verify that the configuration supports merging.

        Currently gptq quantization and replicated layers do not support merging.
        """
        super()._check_merge_allowed()
        quantization_method = getattr(self.model, "quantization_method", None)
        if quantization_method == "gptq":
            raise ValueError("Cannot merge OFT layers when the model is gptq quantized")
        if self.peft_config.get("layer_replication"):
            raise ValueError("Cannot merge OFT layers when base model layers are replicated")
class PromptEncoderReparameterizationType(str, enum.Enum):
    """Re-parameterization heads that `PromptEncoder` knows how to build."""

    MLP = "MLP"
    LSTM = "LSTM"


@dataclass
class PromptEncoderConfig(PromptLearningConfig):
    """
    This is the configuration class to store the configuration of a [`PromptEncoder`].

    Args:
        encoder_reparameterization_type (Union[[`PromptEncoderReparameterizationType`], `str`]):
            The type of reparameterization to use.
        encoder_hidden_size (`int`): The hidden size of the prompt encoder.
        encoder_num_layers (`int`): The number of layers of the prompt encoder.
        encoder_dropout (`float`): The dropout probability of the prompt encoder.
    """

    encoder_reparameterization_type: Union[str, PromptEncoderReparameterizationType] = field(
        default=PromptEncoderReparameterizationType.MLP,
        metadata={"help": "How to reparameterize the prompt encoder"},
    )
    encoder_hidden_size: int = field(
        default=None,
        metadata={"help": "The hidden size of the prompt encoder"},
    )
    encoder_num_layers: int = field(
        default=2,
        metadata={"help": "The number of layers of the prompt encoder"},
    )
    encoder_dropout: float = field(
        default=0.0,
        metadata={"help": "The dropout of the prompt encoder"},
    )

    def __post_init__(self):
        super().__post_init__()
        self.peft_type = PeftType.P_TUNING


class PromptEncoder(torch.nn.Module):
    """
    The prompt encoder network that is used to generate the virtual token embeddings for p-tuning.

    Args:
        config ([`PromptEncoderConfig`]): The configuration of the prompt encoder.

    Example:

    ```py
    >>> from peft import PromptEncoder, PromptEncoderConfig

    >>> config = PromptEncoderConfig(
    ...     peft_type="P_TUNING",
    ...     task_type="SEQ_2_SEQ_LM",
    ...     num_virtual_tokens=20,
    ...     token_dim=768,
    ...     num_transformer_submodules=1,
    ...     num_attention_heads=12,
    ...     num_layers=12,
    ...     encoder_reparameterization_type="MLP",
    ...     encoder_hidden_size=768,
    ... )

    >>> prompt_encoder = PromptEncoder(config)
    ```

    **Attributes**:
        - **embedding** (`torch.nn.Embedding`) -- The embedding layer of the prompt encoder.
        - **mlp_head** (`torch.nn.Sequential`) -- The MLP head of the prompt encoder if `inference_mode=False`.
        - **lstm_head** (`torch.nn.LSTM`) -- The LSTM head of the prompt encoder if `inference_mode=False` and
        `encoder_reparameterization_type="LSTM"`.
        - **token_dim** (`int`) -- The hidden embedding dimension of the base transformer model.
        - **input_size** (`int`) -- The input size of the prompt encoder.
        - **output_size** (`int`) -- The output size of the prompt encoder.
        - **hidden_size** (`int`) -- The hidden size of the prompt encoder.
        - **total_virtual_tokens** (`int`): The total number of virtual tokens of the
        prompt encoder.
        - **encoder_type** (Union[[`PromptEncoderReparameterizationType`], `str`]): The encoder type of the prompt
          encoder.


    Input shape: (`batch_size`, `total_virtual_tokens`)

    Output shape: (`batch_size`, `total_virtual_tokens`, `token_dim`)
    """

    def __init__(self, config):
        """Build the embedding table and, unless `config.inference_mode` is set, the re-parameterization head.

        Raises:
            ValueError: If heads are built and `encoder_reparameterization_type` is neither MLP nor LSTM.
        """
        super().__init__()
        self.token_dim = config.token_dim
        self.input_size = self.token_dim
        self.output_size = self.token_dim
        self.hidden_size = config.encoder_hidden_size
        self.total_virtual_tokens = config.num_virtual_tokens * config.num_transformer_submodules
        self.encoder_type = config.encoder_reparameterization_type

        # embedding
        self.embedding = torch.nn.Embedding(self.total_virtual_tokens, self.token_dim)
        if not config.inference_mode:
            if self.encoder_type == PromptEncoderReparameterizationType.LSTM:
                lstm_dropout = config.encoder_dropout
                num_layers = config.encoder_num_layers
                # LSTM
                self.lstm_head = torch.nn.LSTM(
                    input_size=self.input_size,
                    hidden_size=self.hidden_size,
                    num_layers=num_layers,
                    dropout=lstm_dropout,
                    bidirectional=True,
                    batch_first=True,
                )

                # The LSTM is bidirectional, hence the doubled feature size fed to the MLP.
                self.mlp_head = torch.nn.Sequential(
                    torch.nn.Linear(self.hidden_size * 2, self.hidden_size * 2),
                    torch.nn.ReLU(),
                    torch.nn.Linear(self.hidden_size * 2, self.output_size),
                )

            elif self.encoder_type == PromptEncoderReparameterizationType.MLP:
                encoder_num_layers_default = PromptEncoderConfig.encoder_num_layers
                if config.encoder_num_layers != encoder_num_layers_default:
                    # `encoder_type` may be a plain `str` (the config accepts both); a str has no
                    # `.value`, so fall back to the string itself instead of raising AttributeError.
                    encoder_type_name = getattr(self.encoder_type, "value", self.encoder_type)
                    warnings.warn(
                        f"for {encoder_type_name}, the argument `encoder_num_layers` is ignored. "
                        f"Exactly {encoder_num_layers_default} MLP layers are used."
                    )
                layers = [
                    torch.nn.Linear(self.input_size, self.hidden_size),
                    torch.nn.ReLU(),
                    torch.nn.Linear(self.hidden_size, self.hidden_size),
                    torch.nn.ReLU(),
                    torch.nn.Linear(self.hidden_size, self.output_size),
                ]
                self.mlp_head = torch.nn.Sequential(*layers)

            else:
                raise ValueError("Prompt encoder type not recognized. Please use one of MLP (recommended) or LSTM.")

    def forward(self, indices):
        """Embed `indices` and, when a head was built, pass the embeddings through it.

        Raises:
            ValueError: If `encoder_type` is neither MLP nor LSTM.
        """
        input_embeds = self.embedding(indices)

        uses_lstm = self.encoder_type == PromptEncoderReparameterizationType.LSTM
        if not uses_lstm and self.encoder_type != PromptEncoderReparameterizationType.MLP:
            raise ValueError("Prompt encoder type not recognized. Please use one of MLP (recommended) or LSTM.")

        if not hasattr(self, "mlp_head"):
            # The encoder was built with `inference_mode=True`, so no head exists. The embedding
            # already has the output width (`token_dim`), so return it as is.
            # NOTE(review): presumably the table then holds already re-parameterized prompts loaded
            # from a checkpoint -- confirm against the loading code.
            return input_embeds

        if uses_lstm:
            return self.mlp_head(self.lstm_head(input_embeds)[0])
        return self.mlp_head(input_embeds)
@dataclass
class PolyConfig(PeftConfig):
    """
    This is the configuration class to store the configuration of a [`PolyModel`].
        - [Polytropon (Poly)](https://huggingface.co/papers/2202.13914)
        - [Multi-Head Routing (MHR)](https://huggingface.co/papers/2211.03831)

    Args:
        r (`int`): Attention dimension of each Lora in Poly.
        target_modules (`Union[List[str],str]`): The names of the modules to apply Poly to.
        exclude_modules (`Optional[Union[List[str], str]]`):
            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
            module ends with any of the passed strings.
        modules_to_save (`List[str]`): List of modules apart from Poly layers to be set as trainable
            and saved in the final checkpoint.
        init_weights (bool): Whether to perform initialization of Poly weights.
        poly_type (`Literal["poly"]`): The variant of the Poly module to use. Currently, only "poly"
            is supported.
        n_tasks (`int`): The number of tasks in a multitasking scenario.
        n_skills (`int`): The number of skills (LoRA) in each Poly layer.
        n_splits (`int`): The number of splits within each LoRA of a Poly layer. A value greater
            than 1 indicates the use of Multi-Head Routing (MHR).
    """

    r: int = field(default=8, metadata={"help": "Lora attention dimension"})
    target_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            # The two literals are concatenated, so the first one must end with a space.
            "help": "List of module names or regex expression of the module names to replace with Poly. "
            "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'"
        },
    )
    exclude_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={"help": "List of module names or regex expression of the module names to exclude from Poly."},
    )
    modules_to_save: Optional[list[str]] = field(
        default=None,
        metadata={
            "help": "List of modules apart from Poly layers to be set as trainable and saved in the final checkpoint. "
            "For example, in Sequence Classification or Token Classification tasks, "
            "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
        },
    )
    init_weights: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether to initialize the weights of the Poly layers with their default initialization. Don't change "
                "this setting, except if you know exactly what you're doing."
            ),
        },
    )
    poly_type: Literal["poly"] = field(
        default="poly",
        metadata={"help": 'Type of Poly modules to be used. Currently only "poly" is supported.'},
    )
    n_tasks: int = field(
        default=1,
        metadata={"help": "Number of tasks in multitasking scenario."},
    )
    n_skills: int = field(
        default=4,
        metadata={"help": "Number of skills (LoRA) in each Poly layer."},
    )
    n_splits: int = field(
        default=1,
        metadata={"help": "Number of splits within each LoRA of a Poly layer."},
    )

    def __post_init__(self):
        """Set the PEFT type and turn list-valued module selectors into sets."""
        super().__post_init__()
        self.peft_type = PeftType.POLY
        # Lists become sets; strings (regex) and None are left untouched.
        self.target_modules = (
            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
        )
        self.exclude_modules = (
            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
        )
class PolyLayer(BaseTunerLayer):
    """Adapter state shared by Poly layers: per-adapter LoRA factor banks and a router."""

    # All names of layers that may contain (trainable) adapter weights
    adapter_layer_names = ("poly_lora_A", "poly_lora_B", "poly_router")
    # All names of other parameters that may contain adapter-related parameters
    other_param_names = ("r", "n_tasks", "n_skills", "n_splits")

    def __init__(self, base_layer: nn.Module, **kwargs):
        """Record `base_layer` and create empty per-adapter containers.

        Raises:
            ValueError: If the underlying base layer is not an `nn.Linear`.
        """
        self.base_layer = base_layer
        self.r = {}
        self.n_tasks = {}
        self.n_skills = {}
        self.n_splits = {}
        self.poly_type = {}
        self.poly_router = nn.ModuleDict()
        self.poly_lora_A = nn.ParameterDict()
        self.poly_lora_B = nn.ParameterDict()
        self.kwargs = kwargs

        base_layer = self.get_base_layer()
        if isinstance(base_layer, nn.Linear):
            in_features, out_features = base_layer.in_features, base_layer.out_features
        else:
            raise ValueError(f"Unsupported layer type {type(base_layer)}")

        self.in_features = in_features
        self.out_features = out_features

    def update_layer(self, adapter_name, poly_config, inference_mode: bool = False, **kwargs):
        """Create and initialize the LoRA banks and router for `adapter_name`.

        Raises:
            ValueError: If `poly_config.r` is not positive, or if `poly_config.n_splits` does not evenly
                divide both `in_features` and `out_features`.
        """
        if poly_config.r <= 0:
            raise ValueError(f"`r` should be a positive integer value but the value passed is {poly_config.r}")

        # The banks are allocated with `features // n_splits` and later reshaped back to the full
        # feature size in `Linear.forward`; that reshape only works when the split is exact, so
        # reject a bad configuration here instead of failing in the forward pass.
        n_splits = poly_config.n_splits
        if self.in_features % n_splits != 0 or self.out_features % n_splits != 0:
            raise ValueError(
                f"`n_splits` ({n_splits}) must evenly divide both in_features ({self.in_features}) "
                f"and out_features ({self.out_features})"
            )

        self.r[adapter_name] = poly_config.r
        self.n_tasks[adapter_name] = poly_config.n_tasks
        self.n_skills[adapter_name] = poly_config.n_skills
        self.n_splits[adapter_name] = poly_config.n_splits
        self.poly_type[adapter_name] = poly_config.poly_type

        # A bank: (n_splits, n_skills, in_features // n_splits, r)
        self.poly_lora_A[adapter_name] = nn.Parameter(
            torch.empty(
                poly_config.n_splits,
                poly_config.n_skills,
                self.in_features // poly_config.n_splits,
                poly_config.r,
            )
        )
        # B bank: (n_splits, n_skills, r, out_features // n_splits)
        self.poly_lora_B[adapter_name] = nn.Parameter(
            torch.empty(
                poly_config.n_splits,
                poly_config.n_skills,
                poly_config.r,
                self.out_features // poly_config.n_splits,
            )
        )
        self.poly_router[adapter_name] = get_router(poly_config)

        self.reset_poly_parameters(adapter_name, init_weights=poly_config.init_weights)

        self._move_adapter_to_device_of_base_layer(adapter_name)
        self.set_adapter(self.active_adapters, inference_mode=inference_mode)

    def reset_poly_parameters(self, adapter_name, init_weights):
        """Initialize the A/B banks and the router of `adapter_name`; a no-op for unknown adapters."""
        if adapter_name in self.poly_lora_A.keys():
            # initialize A the same way as the default for nn.Linear
            # https://github.com/microsoft/mttl/blob/ce4ca51dbca73be656feb9b3e5233633e3c5dec7/mttl/models/poly.py#L269
            n_splits, n_skills, d, r = self.poly_lora_A[adapter_name].shape
            for skill in range(n_skills):
                for split in range(n_splits):
                    param = torch.empty((r, d))
                    torch.nn.init.kaiming_uniform_(param, a=math.sqrt(5))
                    self.poly_lora_A[adapter_name].data[split, skill, :, :] = param.T

            if init_weights:
                # initialize B to zero
                torch.nn.init.zeros_(self.poly_lora_B[adapter_name])
            else:
                # initialize B the same way as the default for nn.Linear
                n_splits, n_skills, r, d = self.poly_lora_B[adapter_name].shape
                for skill in range(n_skills):
                    for split in range(n_splits):
                        param = torch.empty((d, r))
                        torch.nn.init.kaiming_uniform_(param, a=math.sqrt(5))
                        self.poly_lora_B[adapter_name].data[split, skill, :, :] = param.T

            # initialized router
            self.poly_router[adapter_name].reset()


class Linear(nn.Module, PolyLayer):
    # Lora implemented in a dense layer
    def __init__(
        self,
        base_layer,
        adapter_name: str,
        poly_config: PolyConfig,
        **kwargs,
    ) -> None:
        """Wrap `base_layer` and register the first adapter `adapter_name`."""
        super().__init__()
        PolyLayer.__init__(self, base_layer, **kwargs)

        self._active_adapter = adapter_name
        self.update_layer(adapter_name, poly_config)

    def forward(self, x: torch.Tensor, *args: Any, task_ids: torch.Tensor = None, **kwargs: Any) -> torch.Tensor:
        """Run the base layer and, unless adapters are disabled, add each active adapter's routed update.

        `task_ids` is handed to the adapter's router to obtain the per-sample mixing weights. The result
        is cast back to the dtype `x` had on entry.
        """
        previous_dtype = x.dtype
        if self.disable_adapters:
            result = self.base_layer(x, *args, **kwargs)
        else:
            result = self.base_layer(x, *args, **kwargs)
            for active_adapter in self.active_adapters:
                if active_adapter not in self.poly_lora_A.keys():
                    continue

                r = self.r[active_adapter]
                poly_router = self.poly_router[active_adapter]
                poly_lora_A = self.poly_lora_A[active_adapter]
                poly_lora_B = self.poly_lora_B[active_adapter]

                # Combine the output of LoRAs
                # https://github.com/microsoft/mttl/blob/ce4ca51dbca73be656feb9b3e5233633e3c5dec7/mttl/models/poly.py#L293
                mixing_weights = poly_router(task_ids=task_ids, input_ids=x)
                bs, n_splits, n_skills = mixing_weights.size()

                # A is    n_splits, n_skills, D // n_splits, rank
                # we want bs,       n_splits, D // n_splits, rank
                A = torch.einsum("bqs,qsdr->bqdr", (mixing_weights, poly_lora_A))
                B = torch.einsum("bqs,qsrd->bqrd", (mixing_weights, poly_lora_B))

                # Merge the split axis back into the feature axis so each sample gets one
                # (in_features, r) and one (r, out_features) factor.
                A = A.reshape(bs, self.in_features, r)
                B = B.transpose(1, 2).reshape(bs, r, self.out_features)

                x = x.to(A.dtype)
                # `bmm` requires a 3-D `x` whose leading dimension equals `bs`.
                result += x.bmm(A).bmm(B) / r

        result = result.to(previous_dtype)
        return result

    def __repr__(self) -> str:
        rep = super().__repr__()
        return "poly." + rep
class PolyModel(BaseTuner):
    """Tuner that swaps targeted `torch.nn.Linear` modules for Poly `Linear` layers."""

    prefix: str = "poly_"
    tuner_layer_cls = PolyLayer
    target_module_mapping = TRANSFORMERS_MODELS_TO_POLY_TARGET_MODULES_MAPPING

    def _create_and_replace(
        self,
        poly_config: PolyConfig,
        adapter_name: str,
        target: nn.Module,
        target_name: str,
        parent: nn.Module,
        **optional_kwargs: Any,
    ):
        """Add `adapter_name` to `target` if it is a Poly layer, otherwise replace it with a new one."""
        if isinstance(target, PolyLayer):
            target.update_layer(adapter_name, poly_config)
            return

        wrapped = self._create_new_module(poly_config, adapter_name, target)
        if adapter_name not in self.active_adapters:
            # adding an additional adapter: it is not automatically trainable
            wrapped.requires_grad_(False)
        self._replace_module(parent, target_name, wrapped, target)

    @staticmethod
    def _create_new_module(poly_config, adapter_name, target, **kwargs):
        """Return a Poly `Linear` wrapping `target`.

        Raises:
            ValueError: If the (base layer of the) target is not a `torch.nn.Linear`.
        """
        # Look through an existing tuner wrapper to decide on the real layer type.
        underlying = target.get_base_layer() if isinstance(target, BaseTunerLayer) else target

        if not isinstance(underlying, torch.nn.Linear):
            raise ValueError(
                f"Target module {target} is not supported. Currently, only the following modules are supported: "
                "`torch.nn.Linear`."
            )
        return Linear(target, adapter_name, poly_config, **kwargs)

    def _register_pre_hooks(self, task_ids):
        """Helper method to register pre hooks."""
        if task_ids is None:
            return []

        def inject_task_ids(_module, call_args, call_kwargs):
            # Every Poly layer receives the same `task_ids` as a keyword argument.
            call_kwargs["task_ids"] = task_ids
            return call_args, call_kwargs

        return [
            submodule.register_forward_pre_hook(inject_task_ids, with_kwargs=True)
            for submodule in self.model.modules()
            if isinstance(submodule, Linear)
        ]

    @contextmanager
    def _manage_pre_hooks(self, task_ids):
        """Context manager to handle the lifecycle of pre hooks."""
        registered = self._register_pre_hooks(task_ids)
        try:
            yield
        finally:
            # Always detach the hooks, even if the wrapped call raised.
            for hook_handle in registered:
                hook_handle.remove()

    def forward(self, *args, task_ids=None, **kwargs):
        """Call the wrapped model while `task_ids` is injected into every Poly layer."""
        with self._manage_pre_hooks(task_ids):
            return self.model(*args, **kwargs)

    def generate(self, *args, task_ids=None, **kwargs):
        """Call the wrapped model's `generate` while `task_ids` is injected into every Poly layer."""
        with self._manage_pre_hooks(task_ids):
            return self.model.generate(*args, **kwargs)
# Added to the normalisation denominator in `PolyRouter.forward` to avoid division by zero.
EPS = 1e-12


def get_router(poly_config: "PolyConfig") -> nn.Module:
    """Return the router implementation selected by `poly_config.poly_type`.

    Raises:
        ValueError: If `poly_config.poly_type` is not `"poly"`.
    """
    if poly_config.poly_type == "poly":
        return PolyRouter(poly_config)
    else:
        raise ValueError(
            f"Unsupported poly_type: {poly_config.poly_type}. "
            "Currently, only the following types are supported: "
            "`poly`."
        )


class Router(nn.Module, ABC):
    """Interface of a Poly router: resettable parameters plus a forward that yields mixing weights."""

    @abstractmethod
    def reset(self): ...

    @abstractmethod
    def forward(self, task_ids: torch.Tensor, input_ids: torch.Tensor): ...


class PolyRouter(Router):
    # It's a simplified implementation of
    # https://github.com/microsoft/mttl/blob/ce4ca51dbca73be656feb9b3e5233633e3c5dec7/mttl/models/poly.py#L138
    def __init__(self, poly_config: "PolyConfig"):
        """Allocate one row of `n_splits * n_skills` routing logits per task (uninitialized until `reset`)."""
        super().__init__()

        self.poly_type = poly_config.poly_type
        self.n_tasks = poly_config.n_tasks
        self.n_skills = poly_config.n_skills
        self.n_splits = poly_config.n_splits

        self.module_logits = nn.Parameter(torch.empty((self.n_tasks, self.n_splits * self.n_skills)))

    def reset(self):
        """Re-initialize the routing logits uniformly in [-1e-3, 1e-3]."""
        torch.nn.init.uniform_(self.module_logits, -1e-3, 1e-3)

    def forward(self, task_ids: torch.Tensor, input_ids: torch.Tensor):
        """Return per-sample mixing weights of shape `(len(task_ids), n_splits, n_skills)`.

        The weights are normalised over the skill axis. `input_ids` is accepted for interface
        compatibility and is not used here.

        Raises:
            ValueError: If `task_ids` is `None` or contains an id outside `[0, n_tasks)`.
        """
        if task_ids is None:
            raise ValueError("task_ids should not be None.")

        # One combined range check. A negative id would otherwise index from the end of
        # `module_logits` and silently select another task's logits.
        if bool(((task_ids < 0) | (task_ids >= self.n_tasks)).any()):
            highest = task_ids.max().item()
            if highest >= self.n_tasks:
                raise ValueError(f"Only {self.n_tasks} tasks available. Found task id = {highest}")
            raise ValueError(f"Task ids must be non-negative. Found task id = {task_ids.min().item()}")

        # move task ids to the device of the routing logits so they can be used as an index
        task_ids = task_ids.to(self.module_logits.device)

        module_logits = self.module_logits[task_ids]
        module_logits = module_logits.view(-1, self.n_splits, self.n_skills)

        if self.training:
            # Differentiable sample in training mode.
            module_logits = RelaxedBernoulli(temperature=1.0, logits=module_logits).rsample()
        else:
            module_logits = torch.sigmoid(module_logits)

        module_weights = module_logits / (module_logits.sum(dim=-1, keepdim=True) + EPS)

        return module_weights
+ +from peft.utils import register_peft_method + +from .config import PrefixTuningConfig +from .model import PrefixEncoder + + +__all__ = ["PrefixEncoder", "PrefixTuningConfig"] + +register_peft_method(name="prefix_tuning", config_cls=PrefixTuningConfig, model_cls=PrefixEncoder) diff --git a/peft/src/peft/tuners/prefix_tuning/config.py b/peft/src/peft/tuners/prefix_tuning/config.py new file mode 100644 index 0000000000000000000000000000000000000000..6eed77167a6e0b928e59e7e07fbf842c2d7a2d83 --- /dev/null +++ b/peft/src/peft/tuners/prefix_tuning/config.py @@ -0,0 +1,42 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field + +from peft.config import PromptLearningConfig +from peft.utils import PeftType + + +@dataclass +class PrefixTuningConfig(PromptLearningConfig): + """ + This is the configuration class to store the configuration of a [`PrefixEncoder`]. + + Args: + encoder_hidden_size (`int`): The hidden size of the prompt encoder. + prefix_projection (`bool`): Whether to project the prefix embeddings. 
class PrefixEncoder(torch.nn.Module):
    r"""
    The `torch.nn` model to encode the prefix.

    Args:
        config ([`PrefixTuningConfig`]): The configuration of the prefix encoder.

    Example:

    ```py
    >>> from peft import PrefixEncoder, PrefixTuningConfig

    >>> config = PrefixTuningConfig(
    ...     peft_type="PREFIX_TUNING",
    ...     task_type="SEQ_2_SEQ_LM",
    ...     num_virtual_tokens=20,
    ...     token_dim=768,
    ...     num_transformer_submodules=1,
    ...     num_attention_heads=12,
    ...     num_layers=12,
    ...     encoder_hidden_size=768,
    ... )
    >>> prefix_encoder = PrefixEncoder(config)
    ```

    **Attributes**:
        - **embedding** (`torch.nn.Embedding`) -- The embedding layer of the prefix encoder.
        - **transform** (`torch.nn.Sequential`) -- The two-layer MLP to transform the prefix embeddings if
          `prefix_projection` is `True` and the encoder was not built in inference mode.
        - **prefix_projection** (`bool`) -- Whether to project the prefix embeddings.

    Input shape: (`batch_size`, `num_virtual_tokens`)

    Output shape: (`batch_size`, `num_virtual_tokens`, `2*layers*hidden`)
    """

    def __init__(self, config):
        """Build the embedding table and, for a trainable projected prefix, the two-layer MLP."""
        super().__init__()
        self.prefix_projection = config.prefix_projection
        token_dim = config.token_dim
        num_layers = config.num_layers
        encoder_hidden_size = config.encoder_hidden_size
        num_virtual_tokens = config.num_virtual_tokens
        if self.prefix_projection and not config.inference_mode:
            # Use a two-layer MLP to encode the prefix
            self.embedding = torch.nn.Embedding(num_virtual_tokens, token_dim)
            self.transform = torch.nn.Sequential(
                torch.nn.Linear(token_dim, encoder_hidden_size),
                torch.nn.Tanh(),
                torch.nn.Linear(encoder_hidden_size, num_layers * 2 * token_dim),
            )
        else:
            # No projection, or inference mode: the table itself has the full output width.
            self.embedding = torch.nn.Embedding(num_virtual_tokens, num_layers * 2 * token_dim)

    def forward(self, prefix: torch.Tensor):
        """Map prefix indices to values of width `num_layers * 2 * token_dim`."""
        prefix_tokens = self.embedding(prefix)
        # `transform` only exists when the encoder was built with `prefix_projection=True` and
        # `inference_mode=False`. In inference mode the embedding already has the output width,
        # so checking `prefix_projection` alone would look up a module that was never created.
        if self.prefix_projection and hasattr(self, "transform"):
            return self.transform(prefix_tokens)
        return prefix_tokens
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from peft.utils import register_peft_method + +from .config import PromptTuningConfig, PromptTuningInit +from .model import PromptEmbedding + + +__all__ = ["PromptEmbedding", "PromptTuningConfig", "PromptTuningInit"] + +register_peft_method(name="prompt_tuning", config_cls=PromptTuningConfig, model_cls=PromptEmbedding) diff --git a/peft/src/peft/tuners/prompt_tuning/config.py b/peft/src/peft/tuners/prompt_tuning/config.py new file mode 100644 index 0000000000000000000000000000000000000000..b41669efe898e88dfd015042e0c78258fb9b3a14 --- /dev/null +++ b/peft/src/peft/tuners/prompt_tuning/config.py @@ -0,0 +1,91 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class PromptTuningInit(str, enum.Enum):
    """Supported strategies for initializing the prompt embedding; members compare equal to their string value."""

    TEXT = "TEXT"
    SAMPLE_VOCAB = "SAMPLE_VOCAB"
    RANDOM = "RANDOM"


@dataclass
class PromptTuningConfig(PromptLearningConfig):
    """
    Configuration for a [`PromptEmbedding`].

    Args:
        prompt_tuning_init (Union[[`PromptTuningInit`], `str`]):
            How the prompt embedding is initialized. `TEXT` initializes from your text, `SAMPLE_VOCAB` from tokens
            randomly sampled out of the model's vocabulary, and `RANDOM` from randomly sampled continuous, soft
            tokens (warning: sampled soft tokens may fall outside of embedding manifold).
        prompt_tuning_init_text (`str`, *optional*):
            Text used to initialize the prompt embedding. Only used if `prompt_tuning_init` is `TEXT`.
        tokenizer_name_or_path (`str`, *optional*):
            Name or path of the tokenizer. Only used if `prompt_tuning_init` is `TEXT`.
        tokenizer_kwargs (`dict`, *optional*):
            Keyword arguments forwarded to `AutoTokenizer.from_pretrained`. Only used if `prompt_tuning_init` is
            `TEXT`.

    Raises:
        ValueError: If `TEXT` initialization is requested without a tokenizer or without init text, or if
            `tokenizer_kwargs` is given for a non-`TEXT` initialization.
    """

    prompt_tuning_init: Union[PromptTuningInit, str] = field(
        default=PromptTuningInit.RANDOM,
        metadata={"help": "How to initialize the prompt tuning parameters"},
    )
    prompt_tuning_init_text: Optional[str] = field(
        default=None,
        metadata={
            "help": "The text to use for prompt tuning initialization. Only used if prompt_tuning_init is `TEXT`"
        },
    )
    tokenizer_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The tokenizer to use for prompt tuning initialization. Only used if prompt_tuning_init is `TEXT`"
        },
    )

    tokenizer_kwargs: Optional[dict] = field(
        default=None,
        metadata={
            "help": (
                "The keyword arguments to pass to `AutoTokenizer.from_pretrained`. Only used if prompt_tuning_init is "
                "`TEXT`"
            ),
        },
    )

    def __post_init__(self):
        super().__post_init__()
        self.peft_type = PeftType.PROMPT_TUNING

        text_mode = PromptTuningInit.TEXT.value
        if self.prompt_tuning_init == PromptTuningInit.TEXT:
            # TEXT initialization needs both a tokenizer and the text to tokenize.
            if not self.tokenizer_name_or_path:
                raise ValueError(
                    f"When prompt_tuning_init='{text_mode}', "
                    f"tokenizer_name_or_path can't be {self.tokenizer_name_or_path}."
                )
            if self.prompt_tuning_init_text is None:
                raise ValueError(
                    f"When prompt_tuning_init='{text_mode}', "
                    f"prompt_tuning_init_text can't be {self.prompt_tuning_init_text}."
                )
        elif self.tokenizer_kwargs:
            # Tokenizer options are meaningless unless a tokenizer is actually used.
            raise ValueError(f"tokenizer_kwargs only valid when using prompt_tuning_init='{text_mode}'.")
class PromptEmbedding(torch.nn.Module):
    """
    The model to encode virtual tokens into prompt embeddings.

    Args:
        config ([`PromptTuningConfig`]): The configuration of the prompt embedding.
        word_embeddings (`torch.nn.Module`): The word embeddings of the base transformer model.

    **Attributes**:
        - **embedding** (`torch.nn.Embedding`) -- The embedding layer of the prompt embedding.

    Example:

    ```py
    >>> from peft import PromptEmbedding, PromptTuningConfig

    >>> config = PromptTuningConfig(
    ...     peft_type="PROMPT_TUNING",
    ...     task_type="SEQ_2_SEQ_LM",
    ...     num_virtual_tokens=20,
    ...     token_dim=768,
    ...     num_transformer_submodules=1,
    ...     num_attention_heads=12,
    ...     num_layers=12,
    ...     prompt_tuning_init="TEXT",
    ...     prompt_tuning_init_text="Predict if sentiment of this review is positive, negative or neutral",
    ...     tokenizer_name_or_path="t5-base",
    ... )

    >>> # t5_model.shared is the word embeddings of the base model
    >>> prompt_embedding = PromptEmbedding(config, t5_model.shared)
    ```

    Input Shape: (`batch_size`, `total_virtual_tokens`)

    Output Shape: (`batch_size`, `total_virtual_tokens`, `token_dim`)
    """

    def __init__(self, config, word_embeddings):
        super().__init__()

        total_virtual_tokens = config.num_virtual_tokens * config.num_transformer_submodules
        self.embedding = torch.nn.Embedding(total_virtual_tokens, config.token_dim)

        # Both custom initializations are skipped in inference mode; the freshly created embedding is kept.
        if config.inference_mode:
            return

        if config.prompt_tuning_init == PromptTuningInit.SAMPLE_VOCAB:
            # Randomly sample tokens from the tokenizer's vocab
            vocab_size = word_embeddings.num_embeddings
            init_token_ids = torch.randint(0, vocab_size, (total_virtual_tokens,), dtype=torch.long)
        elif config.prompt_tuning_init == PromptTuningInit.TEXT:
            init_token_ids = self._text_init_token_ids(config, total_virtual_tokens)
        else:
            # Any other setting keeps torch.nn.Embedding's default initialization.
            return

        self._copy_word_embeddings(word_embeddings, init_token_ids)

    @staticmethod
    def _text_init_token_ids(config, total_virtual_tokens):
        """Tokenize the init text and trim or repeat the ids to exactly `total_virtual_tokens` entries."""
        from transformers import AutoTokenizer

        tokenizer_kwargs = config.tokenizer_kwargs or {}
        tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name_or_path, **tokenizer_kwargs)
        init_token_ids = tokenizer(config.prompt_tuning_init_text)["input_ids"]

        num_text_tokens = len(init_token_ids)
        if num_text_tokens == 0 and total_virtual_tokens > 0:
            # Previously this fell through to `math.ceil(total / 0)` and surfaced as a ZeroDivisionError.
            raise ValueError(
                "prompt_tuning_init_text was tokenized to an empty sequence and cannot be used to initialize "
                f"{total_virtual_tokens} virtual tokens."
            )
        if num_text_tokens < total_virtual_tokens:
            # Repeat the text until it covers all virtual tokens; the slice below trims the surplus.
            num_reps = math.ceil(total_virtual_tokens / num_text_tokens)
            init_token_ids = init_token_ids * num_reps
        return torch.tensor(init_token_ids[:total_virtual_tokens], dtype=torch.long)

    def _copy_word_embeddings(self, word_embeddings, init_token_ids):
        """Replace `self.embedding.weight` with a detached float32 copy of the base embeddings of the given ids."""
        init_token_ids = init_token_ids.to(word_embeddings.weight.device)
        # NOTE(review): gather_params_ctx presumably makes sharded parameters available for the lookup — confirm.
        with gather_params_ctx(word_embeddings.parameters()):
            word_embedding_weights = word_embeddings(init_token_ids).detach().clone()
        word_embedding_weights = word_embedding_weights.to(torch.float32)
        self.embedding.weight = torch.nn.Parameter(word_embedding_weights)

    def forward(self, indices):
        """Look up the prompt embeddings for `indices`."""
        # Just get embeddings
        prompt_embeddings = self.embedding(indices)
        return prompt_embeddings
def __getattr__(name):
    """
    Module-level attribute hook that resolves the optional bitsandbytes layer classes on demand.

    `Linear8bitLt` and `Linear4bit` are imported from `.bnb` only when they are requested and the matching
    availability check passes; every other name (or an unavailable backend) raises `AttributeError`.
    """
    if name == "Linear8bitLt":
        if is_bnb_available():
            from .bnb import Linear8bitLt

            return Linear8bitLt
    elif name == "Linear4bit":
        if is_bnb_4bit_available():
            from .bnb import Linear4bit

            return Linear4bit

    raise AttributeError(f"module {__name__} has no attribute {name}")
+from __future__ import annotations + +import warnings +from typing import Optional + +import bitsandbytes as bnb +import torch + +from peft.import_utils import is_bnb_4bit_available, is_bnb_available +from peft.tuners.tuners_utils import check_adapters_to_merge +from peft.utils.integrations import dequantize_bnb_weight +from peft.utils.other import transpose + +from .layer import RandLoraLayer, UniqueBaseGrad + + +if is_bnb_available(): + + class Linear8bitLt(torch.nn.Module, RandLoraLayer): + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + randlora_A, + randlora_B, + r: int = 0, + randlora_alpha: int = 0, + randlora_dropout: float = 0.0, + fan_in_fan_out: bool = False, + init_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + RandLoraLayer.__init__(self, base_layer) + self.fan_in_fan_out = fan_in_fan_out + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + randlora_A, + randlora_B, + r, + randlora_alpha=randlora_alpha, + randlora_dropout=randlora_dropout, + init_weights=init_weights, + ) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. + Defaults to `None`. 
+ """ + + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + return + + for active_adapter in adapter_names: + if active_adapter not in self.randlora_lambda.keys(): + continue + + warnings.warn( + "Merge RandLora module to 8-bit linear may get different generations due to rounding errors." + ) + randlora_data = self.get_delta_weight(active_adapter) + + weight = self.get_base_layer().weight + state = self.get_base_layer().state + if state.SCB is None: + state.SCB = weight.SCB + + output = dequantize_bnb_weight(weight, state) + w_data = output.to(randlora_data.dtype).to(randlora_data.device) + randlora_data + + if safe_merge and not torch.isfinite(w_data).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + self.get_base_layer().weight = bnb.nn.Int8Params( + w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights + ).to(weight.device) + state.reset_grads() + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter not in self.randlora_lambda.keys(): + continue + warnings.warn( + "Unmerge randlora module to 8-bit linear may get different generations due to rounding errors." 
+ ) + randlora_data = self.get_delta_weight(active_adapter) + + weight = self.get_base_layer().weight + state = self.get_base_layer().state + if state.SCB is None: + state.SCB = weight.SCB + output = dequantize_bnb_weight(weight, state=state) + + w_data = output.to(randlora_data.dtype).to(randlora_data.device) - randlora_data + + self.get_base_layer().weight = bnb.nn.Int8Params( + w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights + ).to(weight.device) + state.reset_grads() + + def get_scaled_bases(self, adapter, device=None) -> list[torch.Tensor, torch.Tensor]: + """ + Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the + correct order to fit the target layers' dimensions + + Args: + adapter (str): + The name of the adapter for which the delta weight should be computed. + """ + + randlora_A = self.randlora_A[adapter] + randlora_B = self.randlora_B[adapter] + + if device is None: + device = randlora_B.device + dtype = randlora_B.dtype + + # In case users wants to merge the adapter weights that are in + # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to + # (b)float16 because some CPUs have slow bf16/fp16 matmuls. + cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16) + + randlora_lambda = self.randlora_lambda[adapter].to(device) + randlora_gamma = self.randlora_gamma[adapter].to(device) + + if cast_to_fp32: + randlora_A = randlora_A.float() + randlora_B = randlora_B.float() + randlora_lambda = randlora_lambda.float() + randlora_gamma = randlora_gamma.float() + + # The trainable parameters are always applied to randlora_A, the smallest basis. 
def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
    """
    Perform the forward pass using the RandLora adapter.

    Args:
        x (torch.Tensor): Input tensor.
        *args, **kwargs: Forwarded unchanged to the wrapped base layer.

    Returns:
        torch.Tensor: Base layer output plus the scaled output of every active adapter, cast to the dtype `x`
        had on entry.

    Note:
        When adapters are disabled, any merged adapters are unmerged first and only the base layer is applied.
        When adapters are already merged, the base layer alone is applied. Otherwise each active adapter
        contributes `linear(linear(dropout(x), update_B), update_A) * scaling`, with the two matrices coming
        from `get_scaled_bases`.
    """
    # Remember the caller's dtype. The previous implementation rebound `x` to the adapter compute dtype inside
    # the loop, so the final cast silently returned that dtype instead of the input's.
    input_dtype = x.dtype

    if self.disable_adapters:
        if self.merged:
            self.unmerge()
        apply_adapters = False
    else:
        apply_adapters = not self.merged

    result = self.base_layer(x, *args, **kwargs)

    if apply_adapters:
        for active_adapter in self.active_adapters:
            if active_adapter not in self.randlora_lambda.keys():
                continue

            update_B, update_A = self.get_scaled_bases(active_adapter, device=x.device)

            # Outside autocast the adapter output is cast back to the base output dtype by hand.
            requires_conversion = not torch.is_autocast_enabled()
            if requires_conversion:
                expected_dtype = result.dtype

            dropout = self.randlora_dropout[active_adapter]
            adapter_input = dropout(x.to(update_A.dtype))

            adapter_output = torch.nn.functional.linear(
                torch.nn.functional.linear(adapter_input, update_B), update_A
            )

            if requires_conversion:
                adapter_output = adapter_output.to(expected_dtype)

            scaling = self.scaling[active_adapter]
            result = result + adapter_output * scaling

    # Ensure the output tensor has the same dtype as the input tensor
    return result.to(input_dtype)
+ rep + + +if is_bnb_4bit_available(): + + class Linear4bit(torch.nn.Module, RandLoraLayer): + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + randlora_A, + randlora_B, + r: int = 0, + randlora_alpha: int = 0, + randlora_dropout: float = 0.0, + fan_in_fan_out: bool = False, + init_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + RandLoraLayer.__init__(self, base_layer) + self.fan_in_fan_out = fan_in_fan_out + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + randlora_A, + randlora_B, + r, + randlora_alpha=randlora_alpha, + randlora_dropout=randlora_dropout, + init_weights=init_weights, + ) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. + Defaults to `None`. + """ + + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + return + + for active_adapter in adapter_names: + if active_adapter not in self.randlora_lambda.keys(): + continue + + warnings.warn( + "Merge RandLora module to 4-bit linear may get different generations due to rounding errors." + ) + randlora_data = self.get_delta_weight(active_adapter) + + weight = self.get_base_layer().weight + kwargs = weight.__dict__ + w_data = bnb.functional.dequantize_4bit(weight.data, weight.quant_state) + randlora_data + + if safe_merge and not torch.isfinite(w_data).all(): + raise ValueError( + f"NaNs detected in the merged weights. 
def get_scaled_bases(self, adapter, device=None) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the
    correct order to fit the target layers' dimensions

    Args:
        adapter (str):
            The name of the adapter for which the scaled bases should be computed.
        device (`torch.device` or `str`, *optional*):
            Device the returned matrices are moved to. Defaults to the device of `randlora_B`.

    Returns:
        A 2-tuple of matrices. When `in_features` is the smaller layer dimension this is
        `(update_A, update_B)`; otherwise it is `(update_B.T, update_A.T)`.
    """
    randlora_A = self.randlora_A[adapter]
    randlora_B = self.randlora_B[adapter]
    if device is None:
        device = randlora_B.device
    else:
        # Accept plain strings such as "cpu" as well; `.type` is read below.
        device = torch.device(device)
    dtype = randlora_B.dtype

    # (b)float16 matmuls can be slow on some CPUs, so on CPU the half-precision operands are upcast to float32.
    # NOTE: nothing in this method casts the result back to the original dtype.
    cast_to_fp32 = device.type == "cpu" and dtype in (torch.float16, torch.bfloat16)

    randlora_lambda = self.randlora_lambda[adapter].to(device)
    randlora_gamma = self.randlora_gamma[adapter].to(device)

    if cast_to_fp32:
        randlora_A = randlora_A.float()
        randlora_B = randlora_B.float()
        randlora_lambda = randlora_lambda.float()
        randlora_gamma = randlora_gamma.float()

    # The trainable parameters are always applied to randlora_A, the smallest basis.
    min_dim, max_dim = min(self.out_features, self.in_features), max(self.out_features, self.in_features)

    # As adapted layers may have different shapes and RandLora contains a single shared pair of A and B matrices,
    # we initialize these matrices with the largest required size for each dimension.
    # During the forward pass, required submatrices are sliced out from the shared randlora_A and randlora_B.
    sliced_A = randlora_A[:, : self.num_bases, :min_dim].to(device)
    sliced_B = randlora_B[:max_dim, : self.num_bases, :].to(device)

    # Flattening the matrices over the rank and number of bases dimensions is more memory efficient
    update_B = sliced_B.flatten(start_dim=1)
    # NOTE(review): UniqueBaseGrad presumably applies lambda/gamma to the sliced A basis with a custom
    # backward — confirm against its definition in `.layer`.
    update_A = UniqueBaseGrad.apply(sliced_A, randlora_lambda, randlora_gamma).flatten(end_dim=1)
    if min_dim == self.in_features:
        return update_A, update_B

    return update_B.T, update_A.T
def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
    """
    Perform the forward pass of the 4-bit RandLora layer.

    Args:
        x (torch.Tensor): Input tensor.
        *args, **kwargs: Forwarded unchanged to the wrapped base layer.

    Returns:
        torch.Tensor: Base layer output plus the scaled output of every active adapter, cast to the dtype `x`
        had on entry.

    Note:
        When adapters are disabled, any merged adapters are unmerged first and only the base layer is applied.
        When adapters are already merged, the base layer alone is applied.
    """
    # Remember the caller's dtype. The previous implementation rebound `x` to the adapter compute dtype inside
    # the loop, so the final cast silently returned that dtype instead of the input's.
    input_dtype = x.dtype

    if self.disable_adapters:
        if self.merged:
            self.unmerge()
        apply_adapters = False
    else:
        apply_adapters = not self.merged

    result = self.base_layer(x, *args, **kwargs)

    if apply_adapters:
        # NOTE(review): presumably a defensive copy so the adapter sum never touches the base layer's own
        # output tensor — confirm the exact reason.
        result = result.clone()
        for active_adapter in self.active_adapters:
            if active_adapter not in self.randlora_lambda.keys():
                continue

            update_B, update_A = self.get_scaled_bases(active_adapter, device=x.device)

            # Outside autocast the adapter output is cast back to the base output dtype by hand.
            requires_conversion = not torch.is_autocast_enabled()
            if requires_conversion:
                expected_dtype = result.dtype

            dropout = self.randlora_dropout[active_adapter]
            adapter_input = dropout(x.to(update_A.dtype))

            adapter_output = torch.nn.functional.linear(
                torch.nn.functional.linear(adapter_input, update_B), update_A
            )

            if requires_conversion:
                adapter_output = adapter_output.to(expected_dtype)

            scaling = self.scaling[active_adapter]
            result = result + adapter_output * scaling

    # Ensure the output tensor has the same dtype as the input tensor
    return result.to(input_dtype)
@dataclass
class RandLoraConfig(PeftConfig):
    """
    This is the configuration class to store the configuration of a [`RandLoraModel`].

    Paper: https://huggingface.co/papers/2502.00987.

    Args:
        r (`int`, *optional*, defaults to `32`):
            RandLora's random basis rank dimension. Contrary to Lora, this parameter is inversely proportional to the
            amount of trainable parameters as reducing it increases trainable parameters.
        target_modules (`Union[list[str], str]`, *optional*, defaults to `None`):
            The names of the modules to apply RandLora to. Only linear layers are supported. A list is converted to
            a set in `__post_init__`; a string is kept as is.
        projection_prng_key (`int`, *optional*, defaults to `0`):
            RandLora PRNG init key. Used for initialising basis_A and basis_B for new models or when loading a
            checkpoint that did not include these projections.
        save_projection (`bool`, *optional*, defaults to `True`):
            Whether to save the global basis_A / basis_B random basis in the state dict alongside per layer lambda /
            gamma diagonal matrices. This will increase the size of the checkpoint, but guarantee that we can reload
            the checkpoint on all system configurations. A warning is emitted when this is set to `False`.
        sparse (`bool`, *optional*, defaults to `False`):
            Whether to use sparse random bases as described in the RandLora paper. The bases are ternary sparse bases
            (only containing -1, 0 and 1) where the attribution probability is 1/6 for -1 and 1 and 2/3 for 0. These
            sparse matrices aim to be used for matmul free computation in the future, see
            https://huggingface.co/papers/2406.02528v1. The current implementation is a proof of concept however,
            where the sparseness is not used to improve speed or memory usage. Using sparse matrices typically does
            not reduce performance and can even help reduce overfitting.
        very_sparse (`bool`, *optional*, defaults to `False`):
            Whether to use highly sparse random bases as described in the RandLora paper. The very sparse bases are
            ternary sparse bases (only containing -1, 0 and 1). Given a matrix with smallest dimension D, the
            attribution probability is 1/(2√D) each for -1 and 1, and 1 - 1/√D for 0. Using these sparse matrices can
            further reduce overfitting over the `sparse` alternatives but will most likely decrease performance as a
            result. Use carefully.
        randlora_dropout (`float`, *optional*, defaults to `0.0`):
            The dropout probability for RandLora layers.
        randlora_alpha (`int`, *optional*, defaults to `640`):
            The scaling coefficient for RandLora layers, this would typically be 20 times the rank. Because the
            `randlora_alpha` coefficient is large by default, it can lead to numerical instabilities especially when
            learning rates are high. If training is unstable, consider reducing the learning rate or the
            `randlora_alpha` coefficient.
        fan_in_fan_out (`bool`, *optional*, defaults to `False`):
            Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses
            `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`.
        bias (`str`, *optional*, defaults to `"none"`):
            Bias type. Can be 'none', 'all' or 'randlora_only'. If 'all' or 'randlora_only', the corresponding biases
            will be updated during training. Be aware that this means that, even when disabling the adapters, the model
            will not produce the same output as the base model would have without adaptation.
        modules_to_save (`list[str]`, *optional*, defaults to `None`):
            list of modules apart from RandLora layers to be set as trainable and saved in the final checkpoint.
        init_weights (`bool`, *optional*, defaults to `True`):
            Whether to initialize the weights of the RandLora layers with their default initialization. Don't change
            this setting, except if you know exactly what you're doing.
        layers_to_transform (`Union[list[int], int]`, *optional*, defaults to `None`):
            The layer indexes to transform, if this argument is specified, it will apply the RandLora transformations
            on the layer indexes that are specified in this list. If a single integer is passed, it will apply the
            RandLora transformations on the layer at this index.
        layers_pattern (`str`, *optional*, defaults to `None`):
            The layer pattern name, used only if `layers_to_transform` is different from `None` and if the layer
            pattern is not in the common layers pattern.
    """

    r: int = field(default=32, metadata={"help": "RandLora random basis rank"})

    target_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": (
                # NOTE: every fragment but the last ends with a space so the implicitly concatenated
                # literals do not run words together.
                "list of module names or regex expression of the module names to replace with RandLora. "
                "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. "
                "Only linear layers are supported."
            )
        },
    )
    projection_prng_key: int = field(
        default=0,
        metadata={
            "help": (
                "RandLora PRNG init key. Used for initialising basis_A and basis_B for new models or when loading a "
                "checkpoint that did not include these projections."
            )
        },
    )
    save_projection: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether to save the basis_A / basis_B projections in the state dict alongside per layer lambda / "
                "gamma weights. This will increase the size of the checkpoint, but guarantee that we can reload "
                "the checkpoint on all system configurations."
            )
        },
    )
    sparse: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to use sparse random bases as described in the RandLora paper. "
                "The current implementation is a proof of concept where the sparseness "
                "is not used to improve speed or memory usage."
            )
        },
    )
    very_sparse: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to use very sparse random bases. "
                "The current implementation is a proof of concept where the sparseness "
                "is not used to improve speed or memory usage."
            )
        },
    )
    randlora_dropout: float = field(default=0.0, metadata={"help": "Dropout in the adapter layers"})
    fan_in_fan_out: bool = field(
        default=False,
        metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
    )
    randlora_alpha: int = field(
        default=640,
        metadata={
            "help": "Scaling coefficient in the adapter layers, typically 20 times the rank of the random bases."
        },
    )
    bias: str = field(
        default="none", metadata={"help": "Bias type for RandLora. Can be 'none', 'all' or 'randlora_only'"}
    )
    modules_to_save: Optional[list[str]] = field(
        default=None,
        metadata={
            "help": (
                "list of modules apart from RandLora layers to be set as trainable and saved in the final checkpoint. For"
                " example, in Sequence Classification or Token Classification tasks, the final layer"
                " `classifier/score` are randomly initialized and as such need to be trainable and saved."
            )
        },
    )
    init_weights: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether to initialize the weights of the RandLora layers with their default initialization. Don't change "
                "this setting, except if you know exactly what you're doing."
            ),
        },
    )
    layers_to_transform: Optional[Union[list[int], int]] = field(
        default=None,
        metadata={
            "help": (
                "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers"
                " indexes that are specified inside this list. If a single integer is passed, PEFT will transform only"
                " the layer at this index."
            )
        },
    )
    layers_pattern: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer"
                " pattern is not in the common layers pattern."
            )
        },
    )

    def __post_init__(self):
        """Tag the config as RandLora, normalise `target_modules` and warn when projections are not saved."""
        self.peft_type = PeftType.RANDLORA
        # A list of module names becomes a set; a regex string or `None` is left untouched.
        self.target_modules = (
            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
        )

        if not self.save_projection:
            warnings.warn(
                "Specified to not save basis_A and basis_B within the state dictionary, instead they will be restored "
                "using the PRNG key store in `config.projection_prng_key`. Consider setting `config.save_projection` "
                "to `True` to guarantee restoring the checkpoint correctly on all system configurations."
            )
class UniqueBaseGrad(torch.autograd.Function):
    """
    Custom autograd function computing `lambda[:, :, None] * A * gamma[None]` for a single shared basis `A`.

    The backward pass is written by hand: it returns gradients only for `randlora_lambda` and `randlora_gamma` and
    `None` for the (frozen) basis `randlora_A`.
    """

    @staticmethod
    def forward(ctx, randlora_A, randlora_lambda, randlora_gamma):
        # Keep the three inputs around; the hand-written backward needs all of them.
        ctx.save_for_backward(randlora_A, randlora_lambda, randlora_gamma)
        # Same broadcasting as `lambda[:, :, None] * A * gamma[None,]`, evaluated left to right.
        return randlora_lambda.unsqueeze(2) * randlora_A * randlora_gamma.unsqueeze(0)

    @staticmethod
    def backward(ctx, grad_output):
        # Bring every saved tensor to the dtype of the incoming gradient before contracting.
        target_dtype = grad_output.dtype
        basis, lam, gam = (saved.to(target_dtype) for saved in ctx.saved_tensors)

        # The einsum subscripts treat grad_output and the basis as rank-3 and lambda/gamma as rank-2.
        lambda_grad = torch.einsum("kbj,kvj,bj->kb", grad_output, basis, gam)
        gamma_grad = torch.einsum("kbj,kvj,kb->bj", grad_output, basis, lam)

        # One entry per forward input: no gradient flows to the basis.
        return None, lambda_grad, gamma_grad
class RandLoraLayer(BaseTunerLayer):
    """
    Mixin holding the per-adapter RandLora state of a wrapped layer.

    Trainable state lives in `randlora_lambda` / `randlora_gamma` (one entry per adapter); `randlora_A` /
    `randlora_B` are references to the random-basis `BufferDict`s handed in through `update_layer`.
    """

    # List all names of layers that may contain adapter weights
    adapter_layer_names = ("randlora_lambda", "randlora_gamma")
    other_param_names = ("randlora_A", "randlora_B")

    def __init__(self, base_layer: nn.Module, **kwargs):
        """
        Set up empty adapter containers and read the feature sizes of the wrapped layer.

        Args:
            base_layer (`nn.Module`): The layer being adapted. Feature sizes are read for `nn.Linear` and `Conv1D`.
            **kwargs: Stored unchanged on `self.kwargs`.
        """
        self.base_layer = base_layer
        self.r = {}
        self.scaling = {}
        self.randlora_dropout = nn.ModuleDict({})

        # For storing vector scale
        self.randlora_lambda = nn.ParameterDict({})
        self.randlora_gamma = nn.ParameterDict({})

        # Stores a reference to the randlora_A/B BufferDict.
        # Set to `None` otherwise to avoid computation with random weights
        self.randlora_A: Optional[BufferDict] = None
        self.randlora_B: Optional[BufferDict] = None

        # Mark the weight as unmerged
        self._disable_adapters = False
        self.merged_adapters = []

        # flag to enable/disable casting of input to weight dtype during forward call
        self.cast_input_dtype_enabled = True

        base_layer = self.get_base_layer()
        if isinstance(base_layer, nn.Linear):
            in_features, out_features = base_layer.in_features, base_layer.out_features
        elif isinstance(base_layer, Conv1D):
            # Prefer `ds_shape` when the weight exposes it, otherwise use the regular weight shape.
            in_features, out_features = (
                base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape
            )

        self.in_features = in_features
        self.out_features = out_features
        self.kwargs = kwargs

    @property
    def merged(self) -> bool:
        """Whether at least one adapter is currently recorded in `merged_adapters`."""
        return bool(self.merged_adapters)

    def update_layer(
        self,
        adapter_name,
        randlora_A: BufferDict,
        randlora_B: BufferDict,
        r,
        randlora_alpha,
        randlora_dropout,
        init_weights,
        inference_mode: bool = False,
        **kwargs,
    ):
        """
        Register (or overwrite) the adapter `adapter_name` on this layer.

        Args:
            adapter_name: Key under which the adapter state is stored.
            randlora_A (`BufferDict`): Random bases A; a reference is kept on the layer.
            randlora_B (`BufferDict`): Random bases B; a reference is kept on the layer.
            r: Rank of the random bases, must be positive.
            randlora_alpha: Scaling coefficient; the stored scaling is `randlora_alpha / r`.
            randlora_dropout: Dropout probability; `nn.Identity` is used when it is not greater than 0.
            init_weights: When truthy, `reset_randlora_parameters` is applied to the new parameters.
            inference_mode (`bool`): Forwarded to `set_adapter`.

        Raises:
            ValueError: If `r` is not positive, if the basis buffers are empty, or if the existing bases are too
                small for this layer or for the requested rank.
        """
        if r <= 0:
            raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")
        self.r[adapter_name] = r
        if randlora_dropout > 0.0:
            randlora_dropout_layer = nn.Dropout(p=randlora_dropout)
        else:
            randlora_dropout_layer = nn.Identity()

        self.randlora_dropout.update(nn.ModuleDict({adapter_name: randlora_dropout_layer}))

        # Actual trainable parameters
        # num_bases = ceil(min(in, out) / r), so that r * num_bases covers the smallest dimension.
        num_bases = min(self.in_features, self.out_features) / r
        self.num_bases = int(num_bases) if num_bases.is_integer() else int(num_bases) + 1  # Full rank
        self.randlora_lambda[adapter_name] = nn.Parameter(torch.randn(r, self.num_bases), requires_grad=True)
        self.randlora_gamma[adapter_name] = nn.Parameter(
            torch.ones(self.num_bases, min(self.out_features, self.in_features))
            / max(self.out_features, self.in_features),
            requires_grad=True,
        )

        self.scaling[adapter_name] = randlora_alpha / r

        # non trainable references to randlora_A/B buffers
        self.randlora_A = randlora_A
        self.randlora_B = randlora_B
        if adapter_name not in randlora_A:
            # This means that this is not the first RandLora adapter. We have to add an entry in the dict for this adapter.
            if len(self.randlora_A) < 1:
                raise ValueError(
                    "The `randlora_A` and `randlora_B` buffers are empty. This should not happen. Please report this issue."
                )
            # we can take any of the existing adapter's parameters, as they should all be identical
            randlora_A_param = next(iter(self.randlora_A.values()))
            randlora_B_param = next(iter(self.randlora_B.values()))

            error_tmpl = (
                "{} has a size of {} but {} or greater is required; this probably happened because an additional RandLora "
                "adapter was added after the first one with incompatible shapes."
            )
            max_dim, min_dim = max(self.in_features, self.out_features), min(self.in_features, self.out_features)
            # randlora_B must cover the largest of the two feature dimensions (its first axis)
            if randlora_B_param.shape[0] < max_dim:
                raise ValueError(error_tmpl.format("randlora_B", randlora_B_param.shape[0], max_dim))
            # randlora_A must cover the smallest of the two feature dimensions (its last axis);
            # report that same axis in the message instead of axis 1
            if randlora_A_param.shape[-1] < min_dim:
                raise ValueError(error_tmpl.format("randlora_A", randlora_A_param.shape[-1], min_dim))

            # check r
            error_tmpl = (
                "{} has a size of {} but {} or greater is required; this probably happened because an additional RandLora "
                "adapter with a lower rank was added after the first one; loading the adapters "
                "in reverse order may solve this."
            )
            if randlora_A_param.shape[0] < self.r[adapter_name]:
                raise ValueError(error_tmpl.format("randlora_A", randlora_A_param.shape[0], self.r[adapter_name]))

            if randlora_B_param.shape[-1] < self.r[adapter_name]:
                raise ValueError(error_tmpl.format("randlora_B", randlora_B_param.shape[-1], self.r[adapter_name]))

            self.randlora_A[adapter_name] = randlora_A_param
            self.randlora_B[adapter_name] = randlora_B_param

        if init_weights:
            self.reset_randlora_parameters(adapter_name)

        self._move_adapter_to_device_of_base_layer(adapter_name)
        self.set_adapter(self.active_adapters, inference_mode=inference_mode)

    def reset_randlora_parameters(self, adapter_name):
        """Zero `randlora_lambda` and fill `randlora_gamma` with `1 / max(gamma.shape)` for a known adapter."""
        if adapter_name in self.randlora_lambda.keys():
            with torch.no_grad():
                nn.init.zeros_(self.randlora_lambda[adapter_name])
                nn.init.constant_(self.randlora_gamma[adapter_name], 1 / max(self.randlora_gamma[adapter_name].shape))
class Linear(nn.Linear, RandLoraLayer):
    """
    RandLora adapter wrapped around a dense base layer.

    The base layer is kept as `self.base_layer`; the adapter contribution is either added on the fly in `forward`
    or folded into the base weight through `merge` / `unmerge`.
    """

    # RandLora implemented in a dense layer
    def __init__(
        self,
        base_layer,
        randlora_A: BufferDict,
        randlora_B: BufferDict,
        adapter_name: str,
        r: int = 0,
        randlora_alpha: int = 0,
        randlora_dropout: float = 0.0,
        fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        is_target_conv_1d_layer: bool = False,
        init_weights: bool = True,
        **kwargs,
    ) -> None:
        # this gets the init from nn.Linear's super perspective, i.e. nn.Module.__init__, which should always be called
        super(nn.Linear, self).__init__()
        RandLoraLayer.__init__(self, base_layer, **kwargs)
        self.fan_in_fan_out = fan_in_fan_out
        # The adapter passed at construction time becomes the active one.
        self._active_adapter = adapter_name
        self.update_layer(adapter_name, randlora_A, randlora_B, r, randlora_alpha, randlora_dropout, init_weights)
        self.is_target_conv_1d_layer = is_target_conv_1d_layer

    def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
        """
        Merge the active adapter weights into the base weights

        Args:
            safe_merge (`bool`, *optional*):
                If True, the merge operation will be performed in a copy of the original weights and check for NaNs
                before merging the weights. This is useful if you want to check if the merge operation will produce
                NaNs. Defaults to `False`.
            adapter_names (`list[str]`, *optional*):
                The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
                to `None`.

        Raises:
            ValueError: With `safe_merge=True`, if the merged weights contain non-finite values.
        """
        adapter_names = check_adapters_to_merge(self, adapter_names)
        if not adapter_names:
            # no adapter to merge
            return

        for active_adapter in adapter_names:
            if active_adapter in self.randlora_lambda.keys():
                base_layer = self.get_base_layer()
                orig_dtype = base_layer.weight.dtype

                if safe_merge:
                    # Note that safe_merge will be slower than the normal merge
                    # because of the copy operation.
                    orig_weights = base_layer.weight.data.clone()

                    orig_weights += self.get_delta_weight(active_adapter)

                    # The base weight is only overwritten once the merged copy is known to be finite.
                    if not torch.isfinite(orig_weights).all():
                        raise ValueError(
                            f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
                        )

                    base_layer.weight.data = orig_weights.to(orig_dtype)
                else:
                    delta_weight = self.get_delta_weight(active_adapter)
                    base_layer.weight.data += delta_weight.to(orig_dtype)

                self.merged_adapters.append(active_adapter)

    def unmerge(self) -> None:
        """
        This method unmerges all merged adapter layers from the base weights.
        """
        if not self.merged:
            warnings.warn("Already unmerged. Nothing to do.")
            return

        # Adapters are removed in reverse merge order (last merged, first unmerged).
        while len(self.merged_adapters) > 0:
            base_layer = self.get_base_layer()
            orig_dtype = base_layer.weight.dtype
            active_adapter = self.merged_adapters.pop()
            if active_adapter in self.randlora_lambda.keys():
                delta_weight = self.get_delta_weight(active_adapter)
                base_layer.weight.data -= delta_weight.to(orig_dtype)

    def get_scaled_bases(self, adapter, device=None) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Performs scaling on the smallest random base (randlora_A) and returns randlora_A and randlora_B in the correct
        order to fit the target layers' dimensions

        Args:
            adapter (str):
                The name of the adapter for which the delta weight should be computed.
            device (*optional*):
                Device the sliced bases and the trainable parameters are moved to. Defaults to the device of the
                stored `randlora_B` basis.

        Returns:
            `tuple[torch.Tensor, torch.Tensor]`: `(update_A, update_B)` when the smallest dimension is
            `in_features`, otherwise `(update_B.T, update_A.T)`.
        """

        randlora_A = self.randlora_A[adapter]
        randlora_B = self.randlora_B[adapter]
        if device is None:
            device = randlora_B.device
        dtype = randlora_B.dtype

        # In case users wants to merge the adapter weights that are in
        # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
        # (b)float16 because some CPUs have slow bf16/fp16 matmuls.
        cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16)

        randlora_lambda = self.randlora_lambda[adapter].to(device)
        randlora_gamma = self.randlora_gamma[adapter].to(device)

        if cast_to_fp32:
            randlora_A = randlora_A.float()
            randlora_B = randlora_B.float()
            randlora_lambda = randlora_lambda.float()
            randlora_gamma = randlora_gamma.float()

        # The trainable parameters are always applied to randlora_A, the smallest basis.
        min_dim, max_dim = min(self.out_features, self.in_features), max(self.out_features, self.in_features)

        # As adapted layers may have different shapes and RandLora contains a single shared pair of A and B matrices,
        # we initialize these matrices with the largest required size for each dimension.
        # During the forward pass, required submatrices are sliced out from the shared randlora_A and randlora_B.
        sliced_A = randlora_A[:, : self.num_bases, :min_dim].to(device)
        sliced_B = randlora_B[:max_dim, : self.num_bases, :].to(device)

        # Flattening the matrices over the rank and number of bases dimensions is more memory efficient
        update_B = sliced_B.flatten(start_dim=1)
        update_A = UniqueBaseGrad.apply(sliced_A, randlora_lambda, randlora_gamma).flatten(end_dim=1)

        # Since update_A is applied on the smallest dimension, test whether update_A or update_B should be applied first. This is done to reduce trainable parameters.
        if min_dim == self.in_features:
            return update_A, update_B
        return update_B.T, update_A.T

    def get_delta_weight(self, adapter) -> torch.Tensor:
        """
        Compute the delta weight for the given adapter.

        Args:
            adapter (str):
                The name of the adapter for which the delta weight should be computed.

        Returns:
            `torch.Tensor`: The product of the two scaled bases, transposed according to `fan_in_fan_out` and
            multiplied by the adapter's scaling.
        """

        # NOTE: the local names follow the order in which the bases are applied in `forward`, not the
        # names used inside `get_scaled_bases`.
        update_B, update_A = self.get_scaled_bases(adapter)

        update = (update_B.T @ update_A.T).T
        output_tensor = transpose(update, self.fan_in_fan_out)

        scaling = self.scaling[adapter]
        return output_tensor * scaling

    def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        """
        Run the base layer and, unless adapters are disabled or merged, add every active adapter's contribution.

        The result is cast back to the dtype `x` had on entry.
        """
        previous_dtype = x.dtype
        if self.disable_adapters:
            # Disabled adapters must not leak through merged weights.
            if self.merged:
                self.unmerge()
            result = self.base_layer(x, *args, **kwargs)
        elif self.merged:
            # The adapter is already folded into the base weight.
            result = self.base_layer(x, *args, **kwargs)
        else:
            result = self.base_layer(x, *args, **kwargs)
            for active_adapter in self.active_adapters:
                if active_adapter not in self.randlora_lambda.keys():
                    continue
                dropout = self.randlora_dropout[active_adapter]
                update_B, update_A = self.get_scaled_bases(active_adapter, device=x.device)
                x = x.to(update_A.dtype)
                scaling = self.scaling[active_adapter]
                result = result + F.linear(F.linear(dropout(x), update_B), update_A) * scaling
        result = result.to(previous_dtype)
        return result

    def __repr__(self) -> str:
        # Prefix the default module representation so adapted layers are easy to spot.
        rep = super().__repr__()
        return "randlora." + rep
super().__repr__() + return "randlora." + rep diff --git a/peft/src/peft/tuners/randlora/model.py b/peft/src/peft/tuners/randlora/model.py new file mode 100644 index 0000000000000000000000000000000000000000..3146438f3821aba61ac94c8e31243b00eb748011 --- /dev/null +++ b/peft/src/peft/tuners/randlora/model.py @@ -0,0 +1,356 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import math +import warnings +from typing import Union + +import torch +import torch.nn as nn +from accelerate.utils.imports import is_bf16_available +from transformers.pytorch_utils import Conv1D + +from peft.import_utils import is_bnb_4bit_available, is_bnb_available +from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer +from peft.utils import ( + TRANSFORMERS_MODELS_TO_RANDLORA_TARGET_MODULES_MAPPING, +) + +from .._buffer_dict import BufferDict +from ..tuners_utils import _maybe_include_all_linear_layers +from .config import RandLoraConfig +from .layer import Linear, RandLoraLayer + + +def _kaiming_init( + tensor_or_shape: Union[torch.Tensor, tuple[int, ...]], + generator: torch.Generator, +) -> torch.Tensor: + """ + Kaiming Uniform Initialisation adapted to accept a `torch.Generator` object for PRNG. + + Args: + tensor_or_shape (`Union[torch.Tensor, tuple[int, ...]]`): + Tensor to initialise, or shape of new tensor to create and then initialise. 
class RandLoraModel(BaseTuner):
    """
    Creates a RandLoRA model from a pretrained transformers model.

    Args:
        model ([`~transformers.PreTrainedModel`]): The model to be adapted.
        config ([`RandLoraConfig`]): The configuration of the RandLora model.
        adapter_name (`str`): The name of the adapter, defaults to `"default"`.
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            Create empty adapter weights on meta device. Useful to speed up the loading process.

    Returns:
        `torch.nn.Module`: The RandLora model.

    Example:

        ```py
        >>> from transformers import AutoModelForCausalLM
        >>> from peft import RandLoraConfig, get_peft_model

        >>> base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
        >>> config = RandLoraConfig(r=32)
        >>> model = get_peft_model(base_model, config)
        ```

    **Attributes**:
        - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted.
        - **peft_config** ([`RandLoraConfig`]): The configuration of the RandLora model.
    """

    prefix: str = "randlora_"
    tuner_layer_cls = RandLoraLayer
    target_module_mapping = TRANSFORMERS_MODELS_TO_RANDLORA_TARGET_MODULES_MAPPING

    def _find_dim(self, config) -> tuple[int, int]:
        """
        Finds the largest input and output dimensions across linear layers that have been wrapped with RandLora.

        This will be used for determining the size of the shared randlora_A and randlora_B matrices.

        Returns:
            `tuple[int, int]`: `(largest out_features, largest in_features)` over all targeted `nn.Linear` and
            `Conv1D` modules; the two maxima are taken independently.

        Raises:
            ValueError: If no targeted module is an `nn.Linear` or a `Conv1D`.
        """
        model_config = self.get_model_config(self.model)

        peft_config = self._prepare_adapter_config(config, model_config)
        peft_config = _maybe_include_all_linear_layers(peft_config, self.model)

        largest_shape = None
        for key, module in self.model.named_modules():
            if not self._check_target_module_exists(peft_config, key):
                continue

            if isinstance(module, nn.Linear):
                module_shape = module.out_features, module.in_features
            elif isinstance(module, Conv1D):
                module_shape = module.weight.ds_shape if hasattr(module.weight, "ds_shape") else module.weight.shape
                # Reverse the Conv1D weight shape so it is ordered like the (out, in) tuple built for nn.Linear.
                module_shape = module_shape[::-1]
            else:
                continue

            if largest_shape is None:
                largest_shape = module_shape
                continue

            if module_shape != largest_shape:
                # Element-wise maximum: each dimension is tracked on its own.
                largest_shape = tuple(max(a, b) for a, b in zip(largest_shape, module_shape))

        if largest_shape is None:
            msg = "No layers types compatible with RandLora were found. Please check `peft_config.target_modules`."
            raise ValueError(msg)

        return largest_shape

    def _init_randlora_A_randlora_B_sparse(self, config: RandLoraConfig, adapter_name: str, sparsity: float = 3) -> None:
        """
        Sparse random projections as described in https://cs-people.bu.edu/evimaria/cs565/kdd-rp.pdf

        Entries of the uniform draws below `1 / (2 * sparsity)` become -1, entries above `1 - 1 / (2 * sparsity)`
        become 1 and everything else stays 0; both bases are then divided by their standard deviation.

        Args:
            config (`RandLoraConfig`): Provides `r`, `projection_prng_key` and `save_projection`.
            adapter_name (`str`): Key under which the bases are stored.
            sparsity (`float`): Controls the fraction of non-zero entries (`1 / sparsity`).
        """

        linear_out_dim, linear_in_dim = self._find_dim(config)
        max_dim, min_dim = max(linear_out_dim, linear_in_dim), min(linear_out_dim, linear_in_dim)

        # use of persistent to exclude randlora_A and randlora_B from the state dict if we choose not to save them.
        # NOTE(review): both dicts are rebound to fresh, empty BufferDicts on every call — confirm that bases
        # registered for earlier adapters are not lost when a second adapter is added.
        self.randlora_A = BufferDict({}, persistent=config.save_projection)
        self.randlora_B = BufferDict({}, persistent=config.save_projection)

        # deterministic init of randlora_A and randlora_B if we know the key
        generator = torch.Generator(device="cpu").manual_seed(config.projection_prng_key)

        # The gamma matrix is applied on A meaning it can be unique (shared) across the n scaling matrices.
        # We also set randlora_A as the smallest matrix to reduce trainable parameters.
        randlora_A = torch.rand((config.r, 1, min_dim), generator=generator)

        # Number of bases to ensure full rank
        num_bases = min_dim / config.r
        num_bases = int(num_bases) if num_bases.is_integer() else int(num_bases) + 1  # Ensure full rank
        randlora_B = torch.rand((max_dim, num_bases, config.r), generator=generator)

        # The current implementation is a proof of concept and does not take into consideration
        # the sparsity to reduce memory usage or speed up compute
        randlora_B_sparse = torch.zeros(randlora_B.shape)
        randlora_A_sparse = torch.zeros(randlora_A.shape)
        randlora_B_sparse[randlora_B < 1 / (2 * sparsity)] = -1
        randlora_B_sparse[randlora_B > 1 - 1 / (2 * sparsity)] = 1
        randlora_A_sparse[randlora_A < 1 / (2 * sparsity)] = -1
        randlora_A_sparse[randlora_A > 1 - 1 / (2 * sparsity)] = 1

        # Std normalization is empirically found to be the best
        randlora_A, randlora_B = (
            randlora_A_sparse / randlora_A_sparse.std(),
            randlora_B_sparse / randlora_B_sparse.std(),
        )
        self.randlora_A[adapter_name] = randlora_A
        self.randlora_B[adapter_name] = randlora_B

    def _init_randlora_A_randlora_B(self, config: RandLoraConfig, adapter_name: str) -> None:
        """
        Create the dense (Kaiming-uniform) random bases for `adapter_name` and store them on the model.

        `randlora_A` has shape `(r, 1, min_dim)` and `randlora_B` has shape `(max_dim, num_bases, r)`, where the
        dimensions come from `_find_dim`; both are divided by their standard deviation.
        """
        linear_out_dim, linear_in_dim = self._find_dim(config)
        max_dim, min_dim = max(linear_out_dim, linear_in_dim), min(linear_out_dim, linear_in_dim)

        # use of persistent to exclude randlora_A and randlora_B from the state dict if we choose not to save them.
        # NOTE(review): as in the sparse variant, the dicts are rebound on every call — confirm multi-adapter use.
        self.randlora_A = BufferDict({}, persistent=config.save_projection)
        self.randlora_B = BufferDict({}, persistent=config.save_projection)

        # deterministic init of randlora_A and randlora_B if we know the key
        generator = torch.Generator(device="cpu").manual_seed(config.projection_prng_key)

        # The gamma matrix is applied on A meaning it can be unique (shared) across the n scaling matrices.
        # We also set randlora_A as the smallest matrix to reduce trainable parameters.
        randlora_A = _kaiming_init((config.r, 1, min_dim), generator=generator)

        # Ensure full rank
        num_bases = min(linear_out_dim, linear_in_dim) / config.r
        num_bases = int(num_bases) if num_bases.is_integer() else int(num_bases) + 1
        # Each basis is initialised separately and the results are stacked along the bases axis.
        randlora_B = torch.cat(
            [_kaiming_init((max_dim, 1, config.r), generator=generator) for _ in range(num_bases)], dim=1
        )

        # Std normalization is empirically found to be the best
        randlora_A, randlora_B = randlora_A / randlora_A.std(), randlora_B / randlora_B.std()
        self.randlora_A[adapter_name] = randlora_A
        self.randlora_B[adapter_name] = randlora_B

    def _pre_injection_hook(self, model: nn.Module, config: RandLoraConfig, adapter_name: str) -> None:
        """Build the shared random bases; `very_sparse` takes precedence over `sparse`, dense is the fallback."""
        if config.very_sparse:
            linear_out_dim, linear_in_dim = self._find_dim(config)
            # Very sparse bases use sparsity = sqrt(smallest dimension).
            self._init_randlora_A_randlora_B_sparse(
                config, adapter_name, sparsity=math.sqrt(min(linear_out_dim, linear_in_dim))
            )
        elif config.sparse:
            self._init_randlora_A_randlora_B_sparse(config, adapter_name, sparsity=3)
        else:
            self._init_randlora_A_randlora_B(config, adapter_name)

    def _check_new_adapter_config(self, config: RandLoraConfig) -> None:
        """
        A helper method to check the config when a new adapter is being added.

        Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters.

        """
        super()._check_new_adapter_config(config)

        for existing_config in self.peft_config.values():
            if existing_config is config:
                # skip the current config
                continue

            if existing_config.projection_prng_key != config.projection_prng_key:
                raise ValueError(
                    f"RandLora PRNG initialisation key must be the same for all adapters. Got {config.projection_prng_key=} but "
                    f"previous config had {existing_config.projection_prng_key}."
                )

        # All registered configs must agree on `save_projection`.
        save_project_unique_values = sorted({config.save_projection for config in self.peft_config.values()})
        if len(save_project_unique_values) > 1:
            raise ValueError(
                "RandLora projection weights must be saved for all adapters or none, but got multiple different values: "
                f"{save_project_unique_values}"
            )

    def _create_and_replace(
        self,
        randlora_config,
        adapter_name,
        target,
        target_name,
        parent,
        current_key,
        **optional_kwargs,
    ):
        """
        Add the adapter to `target`: update it in place when it is already a RandLora `Linear`, otherwise wrap it
        in a new module and swap that module into `parent`.

        Raises:
            ValueError: If `current_key` is `None`.
        """
        if current_key is None:
            raise ValueError("Current Key shouldn't be `None`")

        r = randlora_config.r
        bias = hasattr(target, "bias") and target.bias is not None
        kwargs = {
            "r": r,
            "randlora_alpha": randlora_config.randlora_alpha,
            "randlora_dropout": randlora_config.randlora_dropout,
            "fan_in_fan_out": randlora_config.fan_in_fan_out,
            "init_weights": randlora_config.init_weights,
            "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False),
            "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False),
        }
        kwargs["bias"] = bias
        if isinstance(target, Linear):
            target.update_layer(
                adapter_name,
                self.randlora_A,
                self.randlora_B,
                r,
                randlora_config.randlora_alpha,
                randlora_config.randlora_dropout,
                randlora_config.init_weights,
            )
        else:
            new_module = self._create_new_module(
                randlora_config, self.randlora_A, self.randlora_B, adapter_name, target, **kwargs
            )
            if adapter_name not in self.active_adapter:
                # adding an additional adapter: it is not automatically trainable
                new_module.requires_grad_(False)
            self._replace_module(parent, target_name, new_module, target)

    @staticmethod
    def _create_new_module(randlora_config, randlora_A, randlora_B, adapter_name, target, **kwargs):
        """
        Build the RandLora wrapper matching the type of `target`'s base layer.

        May set `randlora_config.fan_in_fan_out` (with a warning) so that it agrees with the base layer type.

        Raises:
            ValueError: If the base layer is neither a supported bitsandbytes layer, `torch.nn.Linear` nor `Conv1D`.
        """
        # avoid eager bnb import
        if is_bnb_available():
            import bitsandbytes as bnb

            from .bnb import Linear8bitLt

        if is_bnb_4bit_available():
            from .bnb import Linear4bit

        bias = kwargs.pop("bias", False)
        loaded_in_8bit = kwargs.get("loaded_in_8bit", False)
        loaded_in_4bit = kwargs.get("loaded_in_4bit", False)

        if isinstance(target, BaseTunerLayer):
            target_base_layer = target.get_base_layer()
        else:
            target_base_layer = target

        # `bnb` is only bound when bitsandbytes is available; the `loaded_in_*` flags short-circuit first.
        if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt):
            eightbit_kwargs = kwargs.copy()
            eightbit_kwargs.update(
                {
                    "has_fp16_weights": target_base_layer.state.has_fp16_weights,
                    "threshold": target_base_layer.state.threshold,
                    "index": target_base_layer.index,
                }
            )
            return Linear8bitLt(target, adapter_name, randlora_A, randlora_B, **eightbit_kwargs)
        elif loaded_in_4bit and isinstance(target_base_layer, bnb.nn.Linear4bit):
            fourbit_kwargs = kwargs.copy()
            fourbit_kwargs.update(
                {
                    "compute_dtype": target_base_layer.compute_dtype,
                    "compress_statistics": target_base_layer.weight.compress_statistics,
                    "quant_type": target_base_layer.weight.quant_type,
                }
            )
            return Linear4bit(target, adapter_name, randlora_A, randlora_B, **fourbit_kwargs)
        elif isinstance(target_base_layer, torch.nn.Linear):
            if kwargs["fan_in_fan_out"]:
                warnings.warn(
                    "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
                    "Setting fan_in_fan_out to False."
                )
                kwargs["fan_in_fan_out"] = randlora_config.fan_in_fan_out = False
        elif isinstance(target_base_layer, Conv1D):
            kwargs["is_target_conv_1d_layer"] = True
            if not kwargs["fan_in_fan_out"]:
                warnings.warn(
                    "fan_in_fan_out is set to False but the target module is `Conv1D`. Setting fan_in_fan_out to True."
                )
                kwargs["fan_in_fan_out"] = randlora_config.fan_in_fan_out = True
        else:
            raise ValueError(
                f"Target module {target} is not supported. Currently, only the following modules are supported: "
                "`torch.nn.Linear`, `transformers.pytorch_utils.Conv1D`."
            )
        new_module = Linear(
            target,
            randlora_A,
            randlora_B,
            adapter_name,
            bias=bias,
            **kwargs,
        )

        return new_module
+ + +# Based on implementation made available in https://github.com/ppetrushkov/peft/tree/road (not from paper authors) + +from peft.import_utils import is_bnb_4bit_available, is_bnb_available +from peft.utils import register_peft_method + +from .config import RoadConfig +from .layer import Linear, RoadLayer +from .model import RoadModel + + +__all__ = [ + "Linear", + "RoadConfig", + "RoadLayer", + "RoadModel", +] + +register_peft_method(name="road", config_cls=RoadConfig, model_cls=RoadModel, is_mixed_compatible=True) + + +def __getattr__(name): + if (name == "Linear8bitLt") and is_bnb_available(): + from .bnb import Linear8bitLt + + return Linear8bitLt + + if (name == "Linear4bit") and is_bnb_4bit_available(): + from .bnb import Linear4bit + + return Linear4bit + + raise AttributeError(f"module {__name__} has no attribute {name}") diff --git a/peft/src/peft/tuners/road/bnb.py b/peft/src/peft/tuners/road/bnb.py new file mode 100644 index 0000000000000000000000000000000000000000..95e9b82b0ca7cee95b179762f6820467731a76d9 --- /dev/null +++ b/peft/src/peft/tuners/road/bnb.py @@ -0,0 +1,407 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import warnings +from typing import Any, Optional + +import bitsandbytes as bnb +import torch + +from peft.import_utils import is_bnb_4bit_available, is_bnb_available +from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft.utils.integrations import dequantize_bnb_weight + +from .config import RoadVariant +from .layer import RoadLayer, _apply_road, _get_delta_weight + + +if is_bnb_available(): + + class Linear8bitLt(torch.nn.Module, RoadLayer): + # Road implemented in a dense layer + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + variant: RoadVariant = "road_1", + group_size: int = 64, + init_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + RoadLayer.__init__(self, base_layer) + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + variant=variant, + group_size=group_size, + init_weights=init_weights, + ) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. + Defaults to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self._available_adapters: + warnings.warn( + "Merge road module to 8-bit linear may get different generations due to rounding errors." 
+ ) + + weight = self.get_base_layer().weight + state = self.get_base_layer().state + if state.SCB is None: + state.SCB = weight.SCB + + # Dequantize the result of identity matrix and int8 weight because bitsandbytes does not support int8 + # dequantization directly + output = dequantize_bnb_weight(weight, state=state) + road_R = _get_delta_weight( + self.variant[active_adapter], + self.group_size[active_adapter], + self.road_theta[active_adapter].data, + self.road_alpha[active_adapter].data, + ) + + w_data = torch.matmul(road_R, output.to(road_R.dtype)) + w_data = w_data.to(road_R.dtype).to(road_R.device).contiguous() + + if safe_merge and not torch.isfinite(w_data).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + self.get_base_layer().weight = bnb.nn.Int8Params( + w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights + ).to(weight.device) + + if self.get_base_layer().bias is not None: + bias = self.get_base_layer().bias + orig_dtype = bias.dtype + bias_data = bias.data + new_bias = torch.matmul(road_R, bias_data.to(road_R.dtype)) + bias.data = new_bias.to(orig_dtype) + + state.reset_grads() + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self._available_adapters: + warnings.warn( + "Unmerge road module to 8-bit linear may get different generations due to rounding errors." 
+ ) + + weight = self.get_base_layer().weight + state = self.get_base_layer().state + if state.SCB is None: + state.SCB = weight.SCB + output = dequantize_bnb_weight(weight, state=state) + + road_R = _get_delta_weight( + self.variant[active_adapter], + self.group_size[active_adapter], + self.road_theta[active_adapter].data, + self.road_alpha[active_adapter].data, + ) + inv_road_R = torch.linalg.inv(road_R.to(torch.float32)).to(road_R.dtype) + + w_data = torch.matmul(inv_road_R, output.to(road_R.dtype)) + w_data = w_data.to(road_R.dtype).to(road_R.device).contiguous() + + self.get_base_layer().weight = bnb.nn.Int8Params( + w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights + ).to(weight.device) + + if self.get_base_layer().bias is not None: + bias = self.get_base_layer().bias + orig_dtype = bias.dtype + bias_data = bias.data + new_bias = torch.matmul(inv_road_R, bias_data.to(inv_road_R.dtype)) # same dtype cast as in merge() + bias.data = new_bias.to(orig_dtype) + + state.reset_grads() + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + result = self._cast_input_dtype(result, self.road_theta[active_adapter].dtype) + + result = _apply_road( + self.variant[active_adapter], + self.group_size[active_adapter], + self.road_theta[active_adapter], + self.road_alpha[active_adapter], + result, + ) + + if requires_conversion: + result = result.to(expected_dtype) # restore the base layer's output dtype + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "road." 
+ rep + + def dispatch_bnb_8bit(target: torch.nn.Module, adapter_name: str, **kwargs): + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + loaded_in_8bit = kwargs.get("loaded_in_8bit", False) + if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): + eightbit_kwargs = kwargs.copy() + eightbit_kwargs.update( + { + "has_fp16_weights": target_base_layer.state.has_fp16_weights, + "threshold": target_base_layer.state.threshold, + "index": target_base_layer.index, + } + ) + new_module = Linear8bitLt(target, adapter_name, **eightbit_kwargs) + + return new_module + + +if is_bnb_4bit_available(): + + class Linear4bit(torch.nn.Module, RoadLayer): + # Road implemented in a dense layer + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + variant: RoadVariant = "road_1", + group_size: int = 64, + init_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + RoadLayer.__init__(self, base_layer) + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + variant=variant, + group_size=group_size, + init_weights=init_weights, + ) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. + Defaults to `None`. 
+ """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self._available_adapters: + warnings.warn( + "Merge road module to 4-bit linear may get different generations due to rounding errors." + ) + # Refer to https://gist.github.com/ChrisHayduk/1a53463331f52dca205e55982baf9930 + weight = self.get_base_layer().weight + kwargs = weight.__dict__ + + output = dequantize_bnb_weight(weight, state=weight.quant_state) + + road_R = _get_delta_weight( + self.variant[active_adapter], + self.group_size[active_adapter], + self.road_theta[active_adapter].data, + self.road_alpha[active_adapter].data, + ) + w_data = torch.matmul(road_R, output.to(road_R.dtype)) + w_data = w_data.to(road_R.dtype).to(road_R.device) + + if safe_merge and not torch.isfinite(w_data).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + if "bnb_quantized" in kwargs: + kwargs["bnb_quantized"] = False + kwargs["requires_grad"] = False + kwargs.pop("data", None) + # torch.compile can introduce attributes preceded by '_', remove them + kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")} + self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), **kwargs).to(weight.device) + + if self.get_base_layer().bias is not None: + bias = self.get_base_layer().bias + orig_dtype = bias.dtype + bias_data = bias.data + new_bias = torch.matmul(road_R, bias_data.to(road_R.dtype)) + bias.data = new_bias.to(orig_dtype) + + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. 
Nothing to do.") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self._available_adapters: + warnings.warn( + "Unmerge road module to 4-bit linear may get different generations due to rounding errors." + ) + + weight = self.get_base_layer().weight + kwargs = weight.__dict__ + output = dequantize_bnb_weight(weight, state=weight.quant_state) + + road_R = _get_delta_weight( + self.variant[active_adapter], + self.group_size[active_adapter], + self.road_theta[active_adapter].data, + self.road_alpha[active_adapter].data, + ) + inv_road_R = torch.linalg.inv(road_R.to(torch.float32)).to(road_R.dtype) + + w_data = torch.matmul(inv_road_R, output.to(road_R.dtype)) + w_data = w_data.to(road_R.dtype).to(road_R.device) + + if "bnb_quantized" in kwargs: + kwargs["bnb_quantized"] = False + kwargs["requires_grad"] = False + kwargs.pop("data", None) + self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), **kwargs).to(weight.device) + + if self.get_base_layer().bias is not None: + bias = self.get_base_layer().bias + orig_dtype = bias.dtype + bias_data = bias.data + new_bias = torch.matmul(inv_road_R, bias_data.to(inv_road_R.dtype)) # same dtype cast as in merge() + bias.data = new_bias.to(orig_dtype) + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + # As per Tim Dettmers, for 4bit, we need to defensively clone here. + # The reason is that in some cases, an error can occur that backprop + # does not work on a manipulated view. This issue may be solved with + # newer PyTorch versions but this would need extensive testing to be + # sure. 
+ # result = result.clone() + + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + result = self._cast_input_dtype(result, self.road_theta[active_adapter].dtype) + + result = _apply_road( + self.variant[active_adapter], + self.group_size[active_adapter], + self.road_theta[active_adapter], + self.road_alpha[active_adapter], + result, + ) + if requires_conversion: + result = result.to(expected_dtype) # restore the base layer's output dtype + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "road." + rep + + def dispatch_bnb_4bit(target: torch.nn.Module, adapter_name: str, **kwargs): + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + loaded_in_4bit = kwargs.get("loaded_in_4bit", False) + if loaded_in_4bit and is_bnb_4bit_available() and isinstance(target_base_layer, bnb.nn.Linear4bit): + fourbit_kwargs = kwargs.copy() + fourbit_kwargs.update( + { + "compute_dtype": target_base_layer.compute_dtype, + "compress_statistics": target_base_layer.weight.compress_statistics, + "quant_type": target_base_layer.weight.quant_type, + } + ) + new_module = Linear4bit(target, adapter_name, **fourbit_kwargs) + + return new_module diff --git a/peft/src/peft/tuners/road/config.py b/peft/src/peft/tuners/road/config.py new file mode 100644 index 0000000000000000000000000000000000000000..50125786c5ce10ca1546f949be30d8dd52a56422 --- /dev/null +++ b/peft/src/peft/tuners/road/config.py @@ -0,0 +1,126 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Literal, Optional, Union + +from peft.config import PeftConfig +from peft.utils import PeftType + + +RoadVariant = Literal["road_1", "road_2", "road_4"] + + +@dataclass +class RoadConfig(PeftConfig): + """ + This is the configuration class to store the configuration of a [`RoadModel`]. RoAd adapter is proposed in + https://arxiv.org/pdf/2409.00119. + + Args: + variant (Union[`RoadVariant`, `str`]): + The variant of the Road model to use. It can be one of road_1, road_2, or road_4. Refer to the paper for + more details. + - road_1: Uses the same scale and angle for all pairs of elements. + This variant has the lowest number of parameters, it stores a number equal to the output hidden size of + parameters for each layer that RoAd is applied to. + - road_2: Uses the same scale and angle for each element. + This variant has 2x the number of parameters compared to road_1. + - road_4: Uses two different scales and angles for each element. + This variant has 4x the number of parameters compared to road_1. + group_size (`int`): + Group size defines how elements are grouped together into 2D vectors for rotation. Within each group + element 0 is paired with element group_size/2, then element 1 is paired with element group_size/2+1 and so + on. This has no effect on the model performance, since elements are unordered, however it has some effect + on inference speed when used in e.g. VLLM. For best speed group size of at least 32 or 64 (the default) is + recommended. 
Note that model hidden size (or hidden size per partition when used with tensor parallelism) + must be divisible by group_size, so for very small models you might need to reduce this parameter. + init_weights (`bool`): + Whether to perform initialization of RoAd weights. + target_modules (`Optional[Union[List[str], str]]`): + The names of the modules to apply the adapter to. If this is specified, only the modules with the specified + names will be replaced. When passing a string, a regex match will be performed. When passing a list of + strings, either an exact match will be performed or it is checked if the name of the module ends with any + of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen (if + the model is a PreTrainedModel, the output layer excluded). If this is not specified, modules will be + chosen according to the model architecture. If the architecture is not known, an error will be raised -- in + this case, you should specify the target modules manually. + modules_to_save (`List[str]`): + List of modules apart from Road layers to be set as trainable and saved in the final checkpoint. + """ + + variant: Union[str, RoadVariant] = field( + default="road_1", + metadata={"help": ("Variant of the Road model to use.")}, + ) + group_size: int = field( + default=64, + metadata={ + "help": ( + "Group size defines how elements are grouped together into 2D vectors for rotation. " + "Within each group element 0 is paired with element group_size/2, " + "then element 1 is paired with element group_size/2+1 and so on. " + "This has no effect on the model performance, since elements are unordered, " + "however it has some effect on inference speed when used in e.g. VLLM. " + "For best speed group size of at least 64 is recommended. 
" + "Note that model hidden size (or hidden size per partition when used with tensor parallelism) " + "must be divisible by group_size, so for very small models you might need to reduce this parameter." + ) + }, + ) + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "Whether to initialize the weights of the RoAd layers with their default initialization. Don't change " + "this setting, except if you know exactly what you're doing." + ), + }, + ) + target_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": ( + "List of module names or regex expression of the module names to replace with Road." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'." + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D " + "(if the model is a PreTrainedModel, the output layer excluded)." + "If not specified, modules will be chosen according to the model architecture, If the architecture is " + "not known, an error will be raised -- in this case, you should specify the target modules manually." + ), + }, + ) + modules_to_save: Optional[list[str]] = field( + default=None, + metadata={ + "help": ( + "List of modules apart from RoAd layers to be set as trainable and saved in the final checkpoint. For" + " example, in Sequence Classification or Token Classification tasks, the final layer" + " `classifier/score` are randomly initialized and as such need to be trainable and saved." + ) + }, + ) + + def __post_init__(self): + super().__post_init__() + self.peft_type = PeftType.ROAD + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) + if self.variant not in ["road_1", "road_2", "road_4"]: + raise ValueError(f"Invalid variant {self.variant} specified. 
Please choose from road_1, road_2 or road_4") + if self.group_size <= 0 or self.group_size % 2 != 0: + raise ValueError(f"The group_size must be divisible by 2 when using RoadLayer, but got {self.group_size}.") diff --git a/peft/src/peft/tuners/road/layer.py b/peft/src/peft/tuners/road/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..d59dc056d5d293315aec29e089cca99a4d9a3279 --- /dev/null +++ b/peft/src/peft/tuners/road/layer.py @@ -0,0 +1,418 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import Any, Optional, Union + +import torch +import torch.nn as nn + +from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge + +from .config import RoadConfig, RoadVariant + + +class RoadLayer(BaseTunerLayer): + """ + Road layer. + + Generally the idea of RoAD is to split the input vector into many 2D vectors and rotate each 2D vector with its own + 2D rotation matrix. For additional flexibility, each rotation matrix is multiplied by a trainable scale. + + when applied to vector R @ x each pair of elements of x is transformed like this: `y₀ = x₀ * α * cosθ - xₙ * α * + sinθ` and `yₙ = x₀ * α * sinθ + xₙ * α * cosθ` + + The scales α and angles θ are learned for each pair of elements and, moreover, each of the 4 instances in the + rotation matrix may actually be different (when using variant 2 or 4). 
+ + Note that instead of using two consecutive elements x₀ x₁ we first split the whole vector into groups and pair + elements from the first with the second half of the same group, which allows for more efficient inference + implementation. + + The adapter needs to only store the angles θ and scales α, rather than the full matrix R and the inference + implementation only needs to do elementwise vector multiplications. + + For merging the weights, we make use of the following formula: R @ (W @ x + b) = (R @ W) @ x + R @ b. The lhs part + is how it is used in unmerged state (using efficient elementwise implementation instead of matrix multiplication) + and the rhs part is how it is used in merged state where (R @ W) becomes the new weight matrix and R @ b becomes + the new bias. + + """ + + adapter_layer_names: tuple[str, ...] = ("road_theta", "road_alpha") + other_param_names: tuple[str, ...] = ("variant", "group_size") + + def __init__(self, base_layer: nn.Module, ephemeral_gpu_offload: bool = False, **kwargs) -> None: + self.base_layer = base_layer + self.variant = {} + self.group_size = {} + self.road_theta = nn.ParameterDict({}) + self.road_alpha = nn.ParameterDict({}) + + self._disable_adapters = False + self.merged_adapters = [] + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + else: + raise ValueError(f"Unsupported layer type '{type(base_layer)}' encountered, cannot apply RoAd adapter.") + self.in_features = in_features + self.out_features = out_features + + @property + def _available_adapters(self) -> set[str]: + return {*self.road_theta} + + def update_layer( + self, + adapter_name, + variant, + group_size, + init_weights, + inference_mode: bool = False, + ): + self.variant[adapter_name] = variant + self.group_size[adapter_name] = group_size + + if self.out_features % group_size != 0: + raise ValueError( + f"The out_features of the base layer must be 
divisible by group_size ({group_size}) when using RoadLayer." + ) + + # Actual trainable parameters + if variant == "road_1": + size = self.out_features // 2 + elif variant == "road_2": + size = self.out_features + elif variant == "road_4": + size = self.out_features * 2 + else: + raise ValueError( + f"Unsupported variant {variant} for RoadLayer. Supported variants are road_1, road_2, and road_4." + ) + self.road_theta[adapter_name] = nn.Parameter(torch.empty(size)) + self.road_alpha[adapter_name] = nn.Parameter(torch.empty(size)) + + self.reset_parameters(adapter_name, init_weights) + self._move_adapter_to_device_of_base_layer(adapter_name) + + self.set_adapter(self.active_adapters, inference_mode=inference_mode) + + def reset_parameters(self, adapter_name, init_weights): + if init_weights is False: + nn.init.normal_(self.road_theta[adapter_name].data, mean=0.0, std=0.5) + nn.init.normal_(self.road_alpha[adapter_name].data, mean=1.0, std=0.5) + return + nn.init.zeros_(self.road_theta[adapter_name].data) + nn.init.ones_(self.road_alpha[adapter_name].data) + + +class Linear(nn.Module, RoadLayer): + # Road implemented in a dense layer + def __init__( + self, + base_layer, + adapter_name: str, + variant: RoadVariant = "road_1", + group_size: int = 64, + init_weights: Union[bool, str] = True, + **kwargs, + ) -> None: + super().__init__() + RoadLayer.__init__(self, base_layer, **kwargs) + + self._active_adapter = adapter_name + + self.update_layer( + adapter_name, + variant, + group_size, + init_weights=init_weights, + ) + + def _check_forward_args(self, x, *args, **kwargs): + """Check if the arguments are compatible with the configs and state of the model""" + adapter_names = kwargs.get("adapter_names", None) + if adapter_names is None: + return + + if len(x) != len(adapter_names): + msg = ( + "Length of `adapter_names` should be the same as the number of inputs, but got " + f"{len(adapter_names)} and {len(x)} respectively." 
+ ) + raise ValueError(msg) + + if self.merged: + # It is unclear what would be the right thing to do if users pass adapter_names and there are merged + # adapters. Therefore, it is better to raise an error in this case. + msg = "Cannot pass `adapter_names` when there are merged adapters, please call `unmerge_adapter` first." + raise ValueError(msg) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + self._check_forward_args(x, *args, **kwargs) + adapter_names = kwargs.pop("adapter_names", None) + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + elif adapter_names is not None: + result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + torch_result_dtype = result.dtype + + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + result = self._cast_input_dtype(result, self.road_theta[active_adapter].dtype) + result = _apply_road( + self.variant[active_adapter], + self.group_size[active_adapter], + self.road_theta[active_adapter], + self.road_alpha[active_adapter], + result, + ) + + result = result.to(torch_result_dtype) + + return result + + def _mixed_batch_forward( + self, x: torch.Tensor, *args: Any, adapter_names: list[str], **kwargs: Any + ) -> torch.Tensor: + # This is a special method that handles the case when users pass the argument `adapter_names`. This is an + # extra argument that allows mixing different adapters in the same batch at inference time. 
+ result = self.base_layer(x, *args, **kwargs) + + unique_adapters = set(adapter_names) + sub_batch_indices_list = [] + for adapter in unique_adapters: + sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) + + for i, active_adapter in enumerate(unique_adapters): + if active_adapter == "__base__": + continue + if active_adapter not in self._available_adapters: + continue + + dtype = self.road_theta[active_adapter].data.dtype + + # getting the sub-batch, passing it to Road layers and updating the corresponding indices of the linear + # layer output + sub_batch = result[sub_batch_indices_list[i]].to(dtype) + result[sub_batch_indices_list[i]] = _apply_road( + self.variant[active_adapter], + self.group_size[active_adapter], + self.road_theta[active_adapter], + self.road_alpha[active_adapter], + sub_batch, + ) + + return result + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If `None`, all active adapters will be merged. + Defaults to `None`. 
+ """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self._available_adapters: + base_layer = self.get_base_layer() + orig_dtype = base_layer.weight.dtype + road_R = _get_delta_weight( + self.variant[active_adapter], + self.group_size[active_adapter], + self.road_theta[active_adapter].data, + self.road_alpha[active_adapter].data, + ) + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weight = base_layer.weight.data.clone() + orig_weight = torch.matmul(road_R.to(orig_dtype), orig_weight) + + if not torch.isfinite(orig_weight).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weight.contiguous().to(orig_dtype) + + if base_layer.bias is not None: + orig_bias = base_layer.bias.clone() + orig_bias = torch.matmul(road_R.to(orig_dtype), orig_bias) + + if not torch.isfinite(orig_bias).all(): + raise ValueError( + f"NaNs detected in the merged bias. The adapter {active_adapter} seems to be broken" + ) + + base_layer.bias.data = orig_bias.contiguous().to(orig_dtype) + else: + orig_weight = base_layer.weight.data + orig_weight = torch.matmul(road_R.to(orig_dtype), orig_weight) + base_layer.weight.data = orig_weight.contiguous().to(orig_dtype) + + if base_layer.bias is not None: + orig_bias = base_layer.bias.data + orig_bias = torch.matmul(road_R.to(orig_dtype), orig_bias) + base_layer.bias.data = orig_bias.contiguous().to(orig_dtype) + + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. 
Nothing to do.") + return + while len(self.merged_adapters) > 0: + # Going in reverse order + active_adapter = self.merged_adapters.pop() + if active_adapter in self._available_adapters: + weight = self.get_base_layer().weight + orig_dtype = weight.dtype + road_R = _get_delta_weight( + self.variant[active_adapter], + self.group_size[active_adapter], + self.road_theta[active_adapter].data, + self.road_alpha[active_adapter].data, + ) + # Since our matrix are not necessarily orthogonal we need inverse instead of transpose. + # In practice we expect this to basically always work since we start from block diagonal rotation matrix. + inv_road_R = torch.linalg.inv(road_R.to(torch.float32)).to(orig_dtype) + orig_weight = torch.matmul(inv_road_R, weight.data) + weight.data = orig_weight.contiguous() + + if self.get_base_layer().bias is not None: + orig_bias = torch.matmul(inv_road_R, self.get_base_layer().bias.data) + self.get_base_layer().bias.data = orig_bias.contiguous() + + def __repr__(self) -> str: + rep = super().__repr__() + return "road." + rep + + +def _get_delta_weight(variant: RoadVariant, group_size: int, road_theta: torch.Tensor, road_alpha: torch.Tensor): + first_col, second_col = _prepare_cols(variant, group_size, road_theta, road_alpha) + + # To help understand the logic below consider how rope embeddings work + # here it is similar, but done in groups. 
+ # https://discuss.huggingface.co/t/is-llama-rotary-embedding-implementation-correct/44509/3 + + # First column is simply put on the main diagonal + output_tensor = torch.diag(first_col) + # For second column we need to swap each half groups and add minus sign + size = second_col.shape[0] + swapped_second_col = second_col.reshape(-1, 2, group_size // 2)[:, [1, 0], :].flatten() + rotated_diag_second_col = torch.diag(swapped_second_col).reshape(-1, 2, group_size // 2, size)[:, [1, 0], :, :] + rotated_diag_second_col[:, 0, :, :] *= -1 + rotated_diag_second_col = rotated_diag_second_col.reshape(size, size) + output_tensor += rotated_diag_second_col + + return output_tensor + + +def _prepare_cols( + variant: RoadVariant, group_size: int, road_theta: torch.Tensor, road_alpha: torch.Tensor +) -> tuple[torch.Tensor, torch.Tensor]: + # In inference mode, this can be cached + if variant == "road_1": + # In each group there are only group_size // 2 parameters that are reused + road_theta = road_theta.reshape(-1, group_size // 2).repeat_interleave(2, dim=0).flatten() + road_alpha = road_alpha.reshape(-1, group_size // 2).repeat_interleave(2, dim=0).flatten() + + theta_cos = road_theta.cos() + theta_sin = road_theta.sin() + + first_col = road_alpha * theta_cos + second_col = road_alpha * theta_sin + elif variant == "road_2": + # Each group has exactly group_size parameters + theta_cos = road_theta.cos() + theta_sin = road_theta.sin() + + first_col = road_alpha * theta_cos + second_col = road_alpha * theta_sin + elif variant == "road_4": + # Each group has 2*group_size parameters, first half used for first column, second half for second column + road_theta = road_theta.reshape(-1, 2, group_size) + theta_cos = road_theta[:, 0, :].cos().flatten() + theta_sin = road_theta[:, 1, :].sin().flatten() + road_alpha = road_alpha.reshape(-1, 2, group_size) + alpha_1 = road_alpha[:, 0, :].flatten() + alpha_2 = road_alpha[:, 1, :].flatten() + + first_col = alpha_1 * theta_cos + second_col = 
alpha_2 * theta_sin + else: + raise ValueError( + f"Unsupported variant {variant} for RoadLayer. Supported variants are road_1, road_2, and road_4." + ) + + return first_col, second_col + + +def _apply_road( + variant: RoadVariant, group_size: int, road_theta: torch.Tensor, road_alpha: torch.Tensor, x: torch.Tensor +): + first_col, second_col = _prepare_cols(variant, group_size, road_theta, road_alpha) + + # Split in half groups and join back + # See equation 4 in the RoAD paper + x_grouped = x.reshape(-1, 2, group_size // 2) + x1 = x_grouped[:, 0, :] + x2 = x_grouped[:, 1, :] + rotate_half_x = torch.stack((-x2, x1), dim=1).reshape(x.shape) + result = x * first_col + rotate_half_x * second_col + return result + + +def dispatch_default( + target: torch.nn.Module, + adapter_name: str, + road_config: RoadConfig, + **kwargs, +) -> Optional[torch.nn.Module]: + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if isinstance(target_base_layer, torch.nn.Linear): + new_module = Linear(target, adapter_name, **kwargs) + + return new_module diff --git a/peft/src/peft/tuners/road/model.py b/peft/src/peft/tuners/road/model.py new file mode 100644 index 0000000000000000000000000000000000000000..e2285c4451822baf990ab57cdf6af702d969dacf --- /dev/null +++ b/peft/src/peft/tuners/road/model.py @@ -0,0 +1,163 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class RoadModel(BaseTuner):
    """
    Tuner that wraps supported target modules of a model with RoAD adapter layers.

    New adapter modules are created through a chain of dispatchers (bitsandbytes 8-bit and 4-bit when available,
    then the default `torch.nn.Linear` dispatcher). Targets that already are a `RoadLayer` get the additional
    adapter registered on the existing layer instead.
    """

    prefix: str = "road_"
    tuner_layer_cls = RoadLayer
    target_module_mapping = TRANSFORMERS_MODELS_TO_ROAD_TARGET_MODULES_MAPPING

    def _create_and_replace(
        self,
        road_config: RoadConfig,
        adapter_name: str,
        target: nn.Module,
        target_name: str,
        parent: nn.Module,
        current_key,
    ) -> None:
        """
        Attach adapter `adapter_name` to `target`, replacing it inside `parent` if it is not yet a `RoadLayer`.

        Raises:
            ValueError: If `current_key` is `None`, or if no dispatcher supports `target`.
        """
        if current_key is None:
            raise ValueError("Current Key shouldn't be `None`")

        variant = road_config.variant
        group_size = road_config.group_size

        kwargs = {
            "variant": variant,
            "group_size": group_size,
            "init_weights": road_config.init_weights,
            "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False),
            "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False),
        }
        # for torchao merging, we need the get_apply_tensor_subclass from the quantization config
        try:
            kwargs["get_apply_tensor_subclass"] = operator.attrgetter(
                "hf_quantizer.quantization_config.get_apply_tensor_subclass"
            )(self.model)
        except AttributeError:
            # Deliberate best-effort: the model has no such quantization config, so the key is simply omitted.
            pass

        if isinstance(target, RoadLayer):
            # The module is already wrapped: only register the additional adapter on it.
            target.update_layer(
                adapter_name,
                variant,
                group_size,
                init_weights=road_config.init_weights,
            )
        else:
            device_map = self.model.hf_device_map if hasattr(self.model, "hf_device_map") else None
            new_module = self._create_new_module(road_config, adapter_name, target, device_map=device_map, **kwargs)
            if adapter_name not in self.active_adapters:
                # adding an additional adapter: it is not automatically trainable
                new_module.requires_grad_(False)
            self._replace_module(parent, target_name, new_module, target)

    @staticmethod
    def _create_new_module(road_config: RoadConfig, adapter_name, target, **kwargs):
        """
        Return a new adapter module wrapping `target`, built by the first dispatcher that supports it.

        Raises:
            ValueError: If no dispatcher returns a module for `target`.
        """
        dispatchers = []

        # avoid eager bnb import
        if is_bnb_available():
            from .bnb import dispatch_bnb_8bit

            dispatchers.append(dispatch_bnb_8bit)

        if is_bnb_4bit_available():
            from .bnb import dispatch_bnb_4bit

            dispatchers.append(dispatch_bnb_4bit)

        # The default dispatcher goes last so that the more specific ones get the first chance to match.
        dispatchers.append(dispatch_default)

        new_module = None
        for dispatcher in dispatchers:
            new_module = dispatcher(target, adapter_name, road_config=road_config, **kwargs)
            if new_module is not None:  # first match wins
                break

        if new_module is None:
            # no module could be matched
            raise ValueError(
                f"Target module {target} is not supported. Currently, only the following modules are supported: "
                "`torch.nn.Linear`."
            )

        return new_module

    @contextmanager
    def _enable_peft_forward_hooks(self, *args, **kwargs):
        """
        Context manager that injects `adapter_names` into the forward call of every `RoadLayer`.

        When `adapter_names` is not passed (or is `None`) this is a no-op. Otherwise a forward pre-hook is
        registered on each `RoadLayer` for the duration of the context and removed again on exit, including
        when the body of the `with` block raises.

        Raises:
            ValueError: If the model is in training mode, or if a requested adapter (other than `"__base__"`)
                does not exist on any `RoadLayer`.
        """
        # If adapter_names is passed as an argument, we inject it into the forward arguments.
        adapter_names = kwargs.pop("adapter_names", None)
        if adapter_names is None:
            # nothing to do
            yield
            return

        if self.training:
            raise ValueError("Cannot pass `adapter_names` when the model is in training mode.")

        # Check that users only passed actually existing adapters.
        # Note: We cannot do this on the layer level, as each individual layer may not have each adapter. Still, we want
        # to check that there is at least one layer with the given name, or else something like typos can easily slip.
        expected_adapters = set()
        for layer in self.modules():
            if isinstance(layer, RoadLayer):
                expected_adapters |= layer.road_theta.keys()
        unique_adapters = {name for name in adapter_names if name != "__base__"}
        unexpected_adapters = unique_adapters - expected_adapters
        if unexpected_adapters:
            raise ValueError(f"Trying to infer with non-existing adapter(s): {', '.join(sorted(unexpected_adapters))}")

        hook_handles = []
        try:
            for module in self.modules():
                if isinstance(module, RoadLayer):
                    pre_forward = partial(_adapter_names_pre_forward_hook, adapter_names=adapter_names)
                    handle = module.register_forward_pre_hook(pre_forward, with_kwargs=True)
                    hook_handles.append(handle)

            # TODO LoRA also has hooks for beam search, ignore this for now

            yield
        finally:
            # Always remove the hooks, even if the forward pass raised; otherwise they would stay registered and
            # keep injecting stale `adapter_names` into later forward calls.
            for handle in hook_handles:
                handle.remove()
@dataclass
class ShiraConfig(PeftConfig):
    """
    This is the configuration class to store the configuration of a [`ShiraModel`].

    Args:
        r (`int`, *optional*, defaults to `32`):
            For a given target module, the number of SHiRA parameters is computed as r(m+n), where the original tensor
            dimensions are m x n. This means the number of SHiRA parameters is the same as that for a LoRA adapter.
            SHiRA is a high rank adapter. Setting this r parameter does not restrict the rank to this value.
        mask_type (`str`, defaults to `random`):
            Type of mask function. Defaults to a random sparse mask. An optional user-defined mask_fn to compute the
            mask value can also be supplied by instantiating `config = ShiraConfig(...)` and then setting
            `config.mask_fn = my_mask_fn`. For a pretrained weight with shape m x n, the custom mask function must
            return only one mask (shape: m x n) which must be binary 0 or 1 with num_shira_parameters = r(m + n) for
            linear layers. Device and dtype of mask must be same as base layer's weight's device and dtype. Please see
            mask_functions.py for more details and to see the default random sparse mask implementation.
        random_seed (`int`, *optional*, defaults to `None`):
            random seed for the torch generator for random_mask.
        target_modules (`Union[List[str], str]`):
            List of module names or regex expression of the module names to replace with SHiRA. For example, ['q', 'v']
            or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. Only linear layers are supported.
        fan_in_fan_out (`bool`):
            Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses
            `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`.
        init_weights (`bool`, defaults to `True`):
            Initialize SHiRA weight to have zero values. If set to False, SHiRA weights are initialized to randn values
            instead of zeros and this is used only for testing.
        modules_to_save (`List[str]`):
            List of modules apart from SHiRA layers to be set as trainable and saved in the final checkpoint.
    """

    r: int = field(
        default=32,
        metadata={
            "help": (
                "For a given target module, the number of SHiRA parameters is computed as r(m+n), where the original "
                "tensor dimensions are m x n. This means the number of SHiRA parameters is the same as that for a LoRA adapter. "
                "SHiRA is a high rank adapter. Setting this r parameter does not restrict the rank to this value."
            )
        },
    )
    mask_type: Literal["random"] = field(
        default="random",
        metadata={
            "help": (
                "Type of mask function. Defaults to a random sparse mask. "
                "An optional user-defined mask_fn to compute the mask value can also be supplied by instantiating `config = ShiraConfig(...)` and then setting "
                "`config.mask_fn = my_mask_fn`. For a pretrained weight with shape m x n, the custom mask function must return only one mask (shape: m x n) "
                "which must be binary 0 or 1 with num_shira_parameters = r(m + n) for linear layers. Device and dtype of mask must be same as base layer's weight's device and dtype. "
                "Please see mask_functions.py for more details and to see the default random sparse mask implementation."
            )
        },
    )
    random_seed: Optional[int] = field(
        default=None, metadata={"help": "random seed for the torch generator for random_mask"}
    )
    target_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": (
                "List of module names or regex expression of the module names to replace with SHiRA. "
                "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. "
                "Only linear layers are supported."
            )
        },
    )
    fan_in_fan_out: bool = field(
        default=False,
        metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
    )
    init_weights: bool = field(
        default=True,
        metadata={
            "help": "Initialize SHiRA weight to have zero values. If set to False, SHiRA weights are initialized to randn values instead of zeros and this is used only for testing."
        },
    )
    modules_to_save: Optional[list[str]] = field(
        default=None,
        metadata={
            "help": (
                "List of modules apart from SHiRA layers to be set as trainable and saved in the final checkpoint. For"
                " example, in Sequence Classification or Token Classification tasks, the final layer"
                " `classifier/score` are randomly initialized and as such need to be trainable and saved."
            )
        },
    )

    def __post_init__(self):
        """Set the PEFT type, normalize `target_modules` and select the mask function for `mask_type`."""
        super().__post_init__()
        self.peft_type = PeftType.SHIRA
        # A list of module names is stored as a set; a string (regex) or None is kept unchanged.
        self.target_modules = (
            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
        )
        if self.mask_type == "random":
            self.mask_fn = random_mask
        else:
            # Any other mask type has no built-in mask function: the user is expected to assign `mask_fn`
            # afterwards. The warning is skipped in inference mode.
            if not self.inference_mode:
                warnings.warn(
                    f"Argument {self.mask_type=} is not recognized, please supply your own masking function by calling `config.mask_fn = my_mask_fn`."
                )
            self.mask_fn = None
+ +import copy +import warnings +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge + + +class ShiraLayer(BaseTunerLayer): + # List all names of layers that may contain trainable adapter weights + adapter_layer_names = ("shira_weight",) + # All names of other adapter-related parameters + other_param_names = ("r", "scaling", "shira_indices") + + def __init__(self, base_layer: nn.Module, **kwargs): + self.base_layer = base_layer + self.r = {} + self.scaling = {} + self.shira_weight = nn.ParameterDict({}) + self.shira_indices = {} + self.weight_shape = base_layer.weight.shape # Assumes SHiRA is on some layer with "weight" parameter + + # Mark the weight as unmerged + self._disable_adapters = False + self.merged_adapters = [] + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + else: + raise NotImplementedError("Only nn.Linear layers supported currently") + + self.in_features = in_features + self.out_features = out_features + self.kwargs = kwargs + + def update_layer( + self, + adapter_name, + mask, + r, + init_weights: bool = True, + inference_mode: bool = False, + **kwargs, + ): + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + self.r[adapter_name] = r + self.scaling[adapter_name] = ( + 1.0 # Default scale during training. Can be set to any (non-negative) value during inference. 
+ ) + # The number of shira weights in this layer is determined by r such that the total number of weights is the same as a LoRA Layer (for direct comparisons) + num_shira_weight = r * (self.in_features + self.out_features) + if num_shira_weight > self.in_features * self.out_features: + raise ValueError( + f"The set rank {r} results in more shira params than the total number of params in the base layer {self.in_features * self.out_features} and this is not allowed." + ) + + # Actual trainable parameters + # We have used a vector parameter with fixed indices that we use inside a torch.sparse_coo_tensor in get_delta_weight function. + # Directly using a torch.sparse_coo_tensor as a parameter could have been possible but we ran into some issues similar to: + # https://github.com/pytorch/pytorch/issues/79542. + shira_init_weight = torch.zeros(num_shira_weight) if init_weights else torch.randn(num_shira_weight) + self.shira_weight[adapter_name] = nn.Parameter( + shira_init_weight.to(self.base_layer.weight.dtype).to(self.base_layer.weight.device), + requires_grad=True, + ) + + if mask is not None: + # Compute the shira_indices from the mask. Make sure the mask is formed using r*(self.in_features + self.out_features) and not some other K. 
class Linear(nn.Module, ShiraLayer):
    """SHiRA adapter implemented on top of a dense (`nn.Linear`) base layer."""

    def __init__(
        self,
        base_layer,
        mask,
        adapter_name: str,
        r: int = 0,
        fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stored weight like (fan_in, fan_out)
        init_weights: bool = True,
        **kwargs,
    ) -> None:
        """
        Wrap `base_layer` and register the first adapter on it.

        Raises:
            ValueError: If `base_layer` itself wraps another base layer (nested adapters are not supported).
        """
        super().__init__()
        ShiraLayer.__init__(self, base_layer, **kwargs)
        self.fan_in_fan_out = fan_in_fan_out

        innermost_layer = self.get_base_layer()
        if self.base_layer is not innermost_layer:
            raise ValueError("SHiRA does not support nested base layers")

        self._active_adapter = adapter_name
        self.update_layer(adapter_name, mask, r, init_weights=init_weights)

    def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
        """
        Add the delta weights of the selected adapters to the base layer's weight.

        Args:
            safe_merge (`bool`, *optional*):
                If True, the sum is computed on a copy of the base weight and checked for non-finite values before
                it replaces the base weight. Slower because of the copy. Defaults to `False`.
            adapter_names (`List[str]`, *optional*):
                The list of adapter names that should be merged. If None, all active adapters will be merged.
                Defaults to `None`.

        Raises:
            ValueError: With `safe_merge=True`, if the merged weight contains non-finite values.
        """
        adapter_names = check_adapters_to_merge(self, adapter_names)
        if not adapter_names:
            # no adapter to merge
            return

        for name in adapter_names:
            if name in self.shira_weight.keys():
                target_layer = self.get_base_layer()
                delta = self.get_delta_weight(name)
                if not safe_merge:
                    target_layer.weight.data += delta
                else:
                    # Work on a copy so that the base weight stays untouched if the check below fails.
                    candidate = target_layer.weight.data.clone()
                    candidate += delta

                    if not torch.isfinite(candidate).all():
                        raise ValueError(
                            f"NaNs detected in the merged weights. The adapter {name} seems to be broken"
                        )

                    target_layer.weight.data = candidate
                self.merged_adapters.append(name)

    def unmerge(self) -> None:
        """Subtract the delta weight of every merged adapter from the base weight, most recently merged first."""
        if not self.merged:
            warnings.warn("Already unmerged. Nothing to do.")
            return

        while self.merged_adapters:
            name = self.merged_adapters.pop()
            if name in self.shira_weight.keys():
                self.get_base_layer().weight.data -= self.get_delta_weight(name)

    def get_delta_weight(self, adapter) -> torch.Tensor:
        """
        Compute the delta weight for the given adapter.

        Args:
            adapter (str):
                The name of the adapter for which the delta weight should be computed.

        Returns:
            A sparse COO tensor of shape `self.weight_shape` holding the scaled adapter values at the stored indices.
        """
        values = self.shira_weight[adapter]
        # In multi-gpu environment, the indices are at the wrong gpu. This is needed to correct this.
        self.shira_indices[adapter] = self.shira_indices[adapter].to(values.device)
        return torch.sparse_coo_tensor(self.shira_indices[adapter], values * self.scaling[adapter], self.weight_shape)

    def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        """
        Run the layer on `x`.

        With adapters disabled (unmerging first if needed) or already merged, the base layer is called directly.
        Otherwise the delta weights of the active adapters are added to a copy of the base weight and a linear
        transform with that weight and the base layer's bias is applied.
        """
        if self.disable_adapters and self.merged:
            self.unmerge()
        if self.disable_adapters or self.merged:
            return self.base_layer(x, *args, **kwargs)

        adapted_weight = copy.deepcopy(self.base_layer.weight.data)
        for name in self.active_adapters:
            if name in self.shira_weight.keys():
                adapted_weight += self.get_delta_weight(name)

        return F.linear(x, adapted_weight, bias=self.base_layer.bias)

    def __repr__(self) -> str:
        return "shira." + super().__repr__()
+
+Required positional arguments:
+    base_layer - The linear layer to which the SHiRA adapter will be attached.
+    r - Determines the number of parameters in the SHiRA adapter in a way that is consistent with LoRA sizing.
+        SHiRA is a high rank adapter, so setting this parameter does not restrict the adapter rank.
+Keyword arguments can be provided as needed by the particular mask function implementation.
+
+Return:
+    mask - A torch.Tensor of the same shape as base_layer.weight that contains only 0s and 1s, with the same
+        dtype and device as base_layer.weight.
+
+If you would like to attach SHiRA adapters to a model using PEFT methods (such as get_peft_model()), using more
+arguments than the provided positional arguments, you can create the mask function reference like the following:
+
+```
+    def create_mask_function_reference(**my_kwargs):
+        def mask_fn(base_layer, r):
+            ... your implementation here that might use my_kwargs ...
+            return mask
+        return mask_fn
+```
+Then, you can create your peft model with a custom SHiRA mask as follows:
+```
+    model = ...
+    my_kwargs = ...
+    mask_fn = create_mask_function_reference(**my_kwargs)
+    peft_config = ShiraConfig(r=4, mask_type='my_custom_mask')
+    peft_config.mask_fn = mask_fn
+    peft_model = get_peft_model(model, peft_config)
+```
+
+Complete training examples are provided in the examples/shira/ directory.
def random_mask(base_layer: nn.Module, r: int, random_seed: Optional[int] = None, **kwargs) -> torch.Tensor:
    """
    Build a random binary mask with `r * (m + n)` ones for a base layer whose weight has shape `m x n`.

    Args:
        base_layer: Layer whose `weight` determines the shape, dtype and device of the mask.
        r: Sizing parameter; the number of ones in the mask is `r * (weight.shape[0] + weight.shape[1])`.
        random_seed: Seed for the local torch generator used to draw the positions. When `None`, the generator
            is not explicitly seeded.
        **kwargs: Accepted and ignored, so that extra keyword arguments passed by the caller do not fail.

    Returns:
        A tensor with the same shape, dtype and device as `base_layer.weight`, containing only 0s and 1s.
    """
    weight = base_layer.weight
    shape = weight.shape
    num_shira_weights = r * (shape[0] + shape[1])

    random_generator = torch.Generator()
    if random_seed is not None:
        random_generator.manual_seed(random_seed)

    # Draw distinct flat positions with the local generator, then move them to the weight's device.
    idx = torch.randperm(weight.numel(), generator=random_generator)[:num_shira_weights].to(weight.device)
    val = torch.ones_like(idx, dtype=weight.dtype)

    # Scatter the ones into a flattened all-zero mask and restore the weight's shape.
    mask = torch.zeros_like(weight.view(1, -1))
    mask = mask.scatter_(1, idx.unsqueeze(0), val.unsqueeze(0)).view(shape)

    return mask
def _create_and_replace(
    self,
    shira_config,
    adapter_name,
    target,
    target_name,
    parent,
    current_key,
    **optional_kwargs,
):
    """
    Attach adapter `adapter_name` to `target`, replacing it inside `parent` if it is not yet a SHiRA `Linear`.

    The mask for the adapter is computed with `shira_config.mask_fn` when one is set; otherwise `None` is
    passed on as the mask.

    Raises:
        ValueError: If `current_key` is `None`.
    """
    if current_key is None:
        raise ValueError("Current Key shouldn't be `None`")

    bias = hasattr(target, "bias") and target.bias is not None
    kwargs = {"bias": bias}
    if shira_config.mask_type == "random":
        kwargs["random_seed"] = shira_config.random_seed
    kwargs.update(optional_kwargs)

    if isinstance(target, Linear):
        # `bias` is not an argument of the mask function: `_create_new_module` drops it before calling
        # `mask_fn`, so do the same here. Otherwise a custom `mask_fn(base_layer, r)` would fail with an
        # unexpected keyword argument when a further adapter is added to an existing SHiRA layer.
        mask_kwargs = {key: value for key, value in kwargs.items() if key != "bias"}
        mask = (
            shira_config.mask_fn(target.base_layer, shira_config.r, **mask_kwargs)
            if shira_config.mask_fn is not None
            else None
        )
        target.update_layer(
            adapter_name,
            mask,
            shira_config.r,
            init_weights=shira_config.init_weights,
        )
    else:
        new_module = self._create_new_module(shira_config, adapter_name, target, **kwargs)
        # Use the list-valued `active_adapters`: `active_adapter` may be a plain string, in which case `in`
        # would perform a substring test instead of a membership test.
        if adapter_name not in self.active_adapters:
            # adding an additional adapter: it is not automatically trainable
            new_module.requires_grad_(False)
        self._replace_module(parent, target_name, new_module, target)
adapter_name, target, **kwargs): + fan_in_fan_out = shira_config.fan_in_fan_out + + _ = kwargs.pop("bias", False) + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if isinstance(target_base_layer, torch.nn.Linear): + if fan_in_fan_out: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." + ) + fan_in_fan_out = shira_config.fan_in_fan_out = False + else: + raise ValueError( + f"Target module {target} is not supported. Currently, only the following modules are supported: " + "`torch.nn.Linear`." + ) + + mask = ( + shira_config.mask_fn(target_base_layer, shira_config.r, **kwargs) + if shira_config.mask_fn is not None + else None + ) + + new_module = Linear( + target, + mask, + adapter_name, + shira_config.r, + fan_in_fan_out, + init_weights=shira_config.init_weights, + **kwargs, + ) + + return new_module diff --git a/peft/src/peft/tuners/trainable_tokens/__init__.py b/peft/src/peft/tuners/trainable_tokens/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4aa7bf8e5fc4705cb7b190cee0de53ac8db89573 --- /dev/null +++ b/peft/src/peft/tuners/trainable_tokens/__init__.py @@ -0,0 +1,33 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from peft.utils import register_peft_method + +from .config import TrainableTokensConfig +from .layer import TrainableTokensLayer +from .model import TrainableTokensModel + + +__all__ = [ + "TrainableTokensConfig", + "TrainableTokensLayer", + "TrainableTokensModel", +] + +register_peft_method( + name="trainable_tokens", + config_cls=TrainableTokensConfig, + model_cls=TrainableTokensModel, + is_mixed_compatible=False, +) diff --git a/peft/src/peft/tuners/trainable_tokens/config.py b/peft/src/peft/tuners/trainable_tokens/config.py new file mode 100644 index 0000000000000000000000000000000000000000..7412d7f06474c510679e0f3004ae10c20910b00f --- /dev/null +++ b/peft/src/peft/tuners/trainable_tokens/config.py @@ -0,0 +1,89 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Optional, Union + +from peft.config import PeftConfig +from peft.utils import PeftType + + +@dataclass +class TrainableTokensConfig(PeftConfig): + """ + Configuration for the `TrainableTokens` method. + + Allows for training new tokens (and re-training existing ones) without training the full embedding matrix. By + marking a few select tokens (identified by their indices) trainable and leaving the rest untouched, this method can + be used to add new tokens or changing the embedding of existing tokens while saving on memory. 
Both storage as well + as working memory usage are reduced in contrast to training the embedding matrix fully. + + Note that training with FSDP/DeepSpeed might not yet be fully supported. + + Args: + token_indices (`list[int]`): + List of integers, signifying the indices of the tokens you want to be trainable. To find the index of a + token with a tokenizer, you can tokenize the string and look at the returned `input_ids`. The closer the + amount of indices is to the total amount of tokens, the less efficient this method gets. + target_modules (`Optional[Union[list[str], str]]`): + List of module names or regex expression of the module names to replace with our `TrainableTokensLayer`. If + not defined, it will attempt to get the model's input embedding layer if the model has a + `get_input_embeddings` method (transformer models usually do), if that fails the default is 'embed_tokens'. + Other example targets are `embedding`, `encoder.embeddings` or `decoder.embeddings`. + init_weights (`bool`): + By default the new token weights are initialized to be the same as the respective token embeddings. This + makes TrainableTokens a no-op when not trained. If set to `False` the weights will be random values. Do not + change this setting unless you know exactly what you're doing. + """ + + token_indices: list[int] = field( + default_factory=list, + metadata={ + "help": ( + "List of integers, signifying the indices of the tokens you want to be trainable. " + "To find the index of a token with a tokenizer, you can tokenize the string and " + "look at the returned `input_ids`. The closer the amount of indices is to the total amount of " + "tokens, the less efficient this method gets." + ) + }, + ) + target_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": ( + "List of module names or regex expression of the module names to replace with our " + "`TrainableTokensLayer`. 
If not defined, it will default to the model's input embedding layer if " + "the model has a `get_input_embeddings` method (transformer models usually do), if that fails the " + "default is 'embed_tokens'. Other example targets could be `embedding`, `encoder.embeddings` or " + "`decoder.embeddings`." + ), + }, + ) + + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "By default the new token weights are initialized to be the same as the respective token embeddings. " + "This makes TrainableTokens a no-op when not trained. If set to `False` the weights will be random " + "values. Do not change this setting unless you know exactly what you're doing. " + ) + }, + ) + + def __post_init__(self): + super().__post_init__() + self.peft_type = PeftType.TRAINABLE_TOKENS diff --git a/peft/src/peft/tuners/trainable_tokens/layer.py b/peft/src/peft/tuners/trainable_tokens/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..0f35462224e178f340d23309f6a8420914d24b31 --- /dev/null +++ b/peft/src/peft/tuners/trainable_tokens/layer.py @@ -0,0 +1,249 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import warnings +from typing import Optional + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F + +from peft.tuners._buffer_dict import BufferDict +from peft.tuners.tuners_utils import BaseTunerLayer, _get_in_out_features, check_adapters_to_merge +from peft.utils.integrations import check_deepspeed_zero3_enabled, gather_params_ctx + + +class TrainableTokensLayer(nn.Module, BaseTunerLayer): + # All names of layers that may contain (trainable) adapter weights + adapter_layer_names = ("trainable_tokens_delta",) + + # All names of other parameters that may contain adapter-related parameters + other_param_names = ("token_indices", "trainable_tokens_original") + + def __init__( + self, + base_layer: nn.Module, + adapter_name: str, + token_indices: list[int], + tied_adapter: Optional[TrainableTokensLayer] = None, + **kwargs, + ) -> None: + super().__init__() + + self.base_layer = base_layer + self._active_adapter = adapter_name + self.kwargs = kwargs + + # wrap the tied adapter in a list so that it is excluded from .(named_)modules() and, therefore, + # not included in the state dict since it would be a copy of the tied adapter anyway. + self._tied_adapter = [tied_adapter] if tied_adapter else [] + + # we store the updated weights of particular tokens and their originals. we assume + # that the count of new tokens is far smaller than the number of total tokens. + # + # In case we have weight tying with another token adapter, we'll have no actual + # references on our own but use everything from the tied adapter. 
+ if not self.tied_adapter: + self.trainable_tokens_delta = nn.ParameterDict({}) + self.trainable_tokens_original = BufferDict({}) + self.token_indices = {} + else: + self.trainable_tokens_delta = self.tied_adapter.trainable_tokens_delta + self.trainable_tokens_original = self.tied_adapter.trainable_tokens_original + self.token_indices = self.tied_adapter.token_indices + + # Mark the weight as unmerged + self.merged_adapters = [] + + in_features, out_features = _get_in_out_features(self.get_base_layer()) + self.in_features = in_features + self.out_features = out_features + + @property + def tied_adapter(self): + if self._tied_adapter: + return self._tied_adapter[0] + return None + + def _collect_token_weights(self, weight: torch.Tensor, rows: torch.Tensor, embed_dim: int) -> torch.Tensor: + """DeepSpeed zero3 specific code to initialize trainable tokens. + + Ensures that only the necessary weights are collected to a single rank, initialized, and then shared with all + ranks. + """ + src_rank = 0 + # right now, only CUDA is implemented + device = torch.device("cuda", torch.cuda.current_device()) + + with gather_params_ctx([weight], modifier_rank=None): + if dist.get_rank() == src_rank: + token_weights = weight[rows].clone() + else: + # build an empty tensor with correct shape/type/device + token_weights = torch.empty( + (len(rows), embed_dim), + dtype=weight.dtype, + device=device, + ) + + # share the weights with all ranks + dist.broadcast(token_weights, src=src_rank) + return token_weights + + def update_layer(self, adapter_name, **kwargs): + if kwargs.get("tied_adapter", None): + # as a tied adapter, we're just following whatever the adpater we're tied to does, we don't update anything. + return + + self.token_indices[adapter_name] = kwargs["token_indices"] + init_weights = kwargs.get("init_weights", True) + + # we initialize the delta embedding weights from the base embedding matrix and replace values instead of + # adding/subtracting deltas. 
we do it this way and use `embedding.weight.index_copy()` to write the updated + # values during `forward()` to avoid that the user resizing the embedding matrix, effectively filling the new + # token space with random values, training the model with TrainableTokensLayer, initializing the model anew - + # thus re-initializing the new embeddings again with new random variables. If we would add/subtract deltas + # onto the new values, we would get undefined behavior. By replacing the specific token values we always + # get defined behavior. + weight = self.get_base_layer().weight + embed_dim = self.get_base_layer().embedding_dim + + if init_weights: + if check_deepspeed_zero3_enabled(): + values = self._collect_token_weights(weight, self.token_indices[adapter_name], embed_dim) + else: + values = self.weight[self.token_indices[adapter_name]] + else: + # random init with matching dtype/device + values = torch.randn( + (len(self.token_indices[adapter_name]), embed_dim), + dtype=weight.dtype, + device=weight.device, + ) + + self.trainable_tokens_delta[adapter_name] = nn.Parameter(values.clone(), requires_grad=True) + self.trainable_tokens_original[adapter_name] = values.clone() + + self._move_adapter_to_device_of_base_layer(adapter_name) + + def _check_overlapping_tokens(self, adapter_names): + """Raises an error if the token indices of the given adapter names are overlapping. + This is currently not supported and can lead to undefined behavior of the model if no specific merging between + the overlapping indices' values is applied. + """ + if len(adapter_names) <= 1: + return + + indices = set() + + # we take already merged adapters into account as well since they can be overridden by new adapters as well. 
+ for adapter_name in set(adapter_names + self.merged_adapters): + index_set = set(self.token_indices[adapter_name]) + if len(indices.intersection(index_set)): + raise ValueError( + f"Token indices of adapter {adapter_name} are already defined and would result in " + "undefined merging behavior. Only disjunct token indices are currently supported." + ) + indices.update(index_set) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + adapter_names = check_adapters_to_merge(self, adapter_names) + + if not adapter_names: + # no adapter to merge + return + + self._check_overlapping_tokens(adapter_names) + + merged = self.base_layer.weight.data + + for adapter_name in adapter_names: + index = torch.tensor(self.token_indices[adapter_name]).to(merged.device) + deltas = self.trainable_tokens_delta[adapter_name].to(merged) + merged = merged.index_copy(dim=0, index=index, source=deltas) + + if safe_merge and not torch.isfinite(merged).all(): + raise ValueError(f"NaNs detected in the merged weights. The adapter {adapter_name} seems to be broken") + + self.base_layer.weight.data = merged + self.merged_adapters.extend(adapter_names) + + def unmerge(self) -> None: + if not self.merged: + warnings.warn("Already unmerged. 
Nothing to do.") + return + + while len(self.merged_adapters) > 0: + adapter_name = self.merged_adapters.pop() + + index = torch.tensor(self.token_indices[adapter_name]).to(self.base_layer.weight.device) + originals = self.trainable_tokens_original[adapter_name].to(self.base_layer.weight) + self.base_layer.weight.data.index_copy_(dim=0, index=index, source=originals) + + def get_merged_weights(self, active_adapters): + W = self.base_layer.weight + + for adapter_name in active_adapters: + index = torch.tensor(self.token_indices[adapter_name]).to(W.device) + deltas = self.trainable_tokens_delta[adapter_name].to(W) + W = W.index_copy(dim=0, index=index, source=deltas) + + return W + + def forward_adapters(self, x: torch.Tensor, active_adapters, *args, **kwargs) -> torch.Tensor: + if self.disable_adapters or not active_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + self._check_overlapping_tokens(active_adapters) + + W = self.get_merged_weights(active_adapters) + + # Normally it should be very clear that we're wrapping Embedding layers but there are cases, such as + # tying weights with an LM head where the layer we wrap is a Linear layer. Therefore we must choose + # accordingly. + # + # TODO: the isinstance checks, especially the one for nn.Linear, may not hold for quantized layers; + # TODO: we may need to find a better way to detect quantized layers. + if isinstance(self.base_layer, torch.nn.Embedding): + result = F.embedding( + input=x, + weight=W, + padding_idx=self.base_layer.padding_idx, + max_norm=self.base_layer.max_norm, + norm_type=self.base_layer.norm_type, + scale_grad_by_freq=self.base_layer.scale_grad_by_freq, + sparse=self.base_layer.sparse, + ) + elif isinstance(self.base_layer, torch.nn.Linear): + # Probably a tied adapter that wraps an LM head. 
+ result = F.linear( + input=x, + weight=W, + ) + else: + raise ValueError( + "TrainableTokensLayer wraps an unknown layer type, maybe you are targeting the wrong layer?" + ) + + return result + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + return self.forward_adapters(x, self.active_adapters, *args, **kwargs) diff --git a/peft/src/peft/tuners/trainable_tokens/model.py b/peft/src/peft/tuners/trainable_tokens/model.py new file mode 100644 index 0000000000000000000000000000000000000000..50abef90f5120712a65160eb70b091e6854c64e9 --- /dev/null +++ b/peft/src/peft/tuners/trainable_tokens/model.py @@ -0,0 +1,139 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import torch.nn as nn + +from peft.config import PeftConfig +from peft.tuners.tuners_utils import BaseTuner +from peft.utils import _get_input_embeddings_name, _get_submodules + +from .layer import TrainableTokensLayer + + +class TrainableTokensModel(BaseTuner): + prefix: str = "trainable_tokens_" + tuner_layer_cls = TrainableTokensLayer + + def _prepare_adapter_config(self, peft_config, model_config): + # target_modules can be none which prompts us to infer the embedding layer name ourselves. 
+ if peft_config.target_modules is None: + peft_config.target_modules = _get_input_embeddings_name(self.model, "embed_tokens") + + return peft_config + + def inject_adapter( + self, + model: nn.Module, + adapter_name: str, + autocast_adapter_dtype: bool = True, + low_cpu_mem_usage: bool = False, + **kwargs, + ) -> None: + super().inject_adapter( + model=model, + adapter_name=adapter_name, + autocast_adapter_dtype=autocast_adapter_dtype, + low_cpu_mem_usage=low_cpu_mem_usage, + **kwargs, + ) + + model_config = self.get_model_config(self) + + # In case of weight-tying we need to adapt the tied weights as well and use tie the embedding adapter. + # + # The TrainableTokensLayer supports being tied to another TrainableTokensLayer meaning that the layer will + # not do any changes on its own but solely rely on the weights from the tied adapter. We will search for the + # tied weights and put tied TrainableTokensLayer adapters on them, all tied to the adapter of the embedding + # matrix. + if ( + model_config.get("tie_word_embeddings", False) + # some models may be misconfigured to have weight tying enabled but don't define tied weights keys + and self.model._tied_weights_keys is not None + and isinstance(self.model.get_input_embeddings(), TrainableTokensLayer) + ): + module_keys = [".".join(n.split(".")[:-1]) for n in self.model._tied_weights_keys] + # disable removing of duplicates since we're essentially only dealing with duplicates (i.e. 
tied weights) + for name, module in self.model.named_modules(remove_duplicate=False): + matched_keys = [target_key for target_key in module_keys if name.endswith(target_key)] + if matched_keys: + parent, target, target_name = _get_submodules(model, name) + + peft_config = self.peft_config[adapter_name].to_dict() + peft_config["tied_adapter"] = self.model.get_input_embeddings() + + self._create_and_replace_dict( + peft_config, + adapter_name, + target, + target_name, + parent, + matched_keys[0], + ) + + def _get_tied_target_modules(self, *args, **kwargs): + # Normally this method would return the layers that target tied layers. + # + # We override this method since we explicitly support tied weights tied to the embedding layer. + # Therefore, we don't need the warning issued by returning the modules here. + return [] + + def _create_and_replace_dict( + self, + peft_config: dict, + adapter_name: str, + target: nn.Module, + target_name: str, + parent: nn.Module, + current_key: str, + ) -> None: + """ + The same as `_create_and_replace` but takes a dictionary instead of a peft config so that we can add keys that + are not present in the config, such as `tied_adapter`. + """ + kwargs = peft_config + + if isinstance(target, TrainableTokensLayer): + target.update_layer(adapter_name, **kwargs) + else: + new_module = self._create_new_module(peft_config, adapter_name, target, **kwargs) + self._replace_module(parent, target_name, new_module, target) + + def _create_and_replace( + self, + peft_config: PeftConfig, + adapter_name: str, + target: nn.Module, + target_name: str, + parent: nn.Module, + current_key: str, + ) -> None: + """ + A private method to create and replace the target module with the adapter module. 
+ """ + kwargs = peft_config.to_dict() + self._create_and_replace_dict(kwargs, adapter_name, target, target_name, parent, current_key) + + @staticmethod + def _create_new_module(peft_config, adapter_name, target, **kwargs): + new_module = TrainableTokensLayer(target, adapter_name, **kwargs) + new_module.update_layer( + adapter_name, + init_weights=kwargs["init_weights"], + token_indices=kwargs["token_indices"], + tied_adapter=kwargs.get("tied_adapter", None), + ) + + return new_module diff --git a/peft/src/peft/tuners/tuners_utils.py b/peft/src/peft/tuners/tuners_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..66903296a8f834010dd443491cc38128f2560eb8 --- /dev/null +++ b/peft/src/peft/tuners/tuners_utils.py @@ -0,0 +1,1930 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import copy +import dataclasses +import os +import re +import textwrap +import warnings +from abc import ABC, abstractmethod +from collections.abc import Sequence +from contextlib import contextmanager, nullcontext +from typing import Any, Optional, Union, overload + +import torch +from accelerate.hooks import AlignDevicesHook +from accelerate.utils import named_module_tensors, offload_state_dict +from packaging import version +from torch import nn +from tqdm import tqdm +from transformers import PreTrainedModel +from transformers.pytorch_utils import Conv1D + +from peft.mapping import PEFT_TYPE_TO_PREFIX_MAPPING +from peft.utils import INCLUDE_LINEAR_LAYERS_SHORTHAND +from peft.utils.constants import ( + DUMMY_MODEL_CONFIG, + DUMMY_TARGET_MODULES, + EMBEDDING_LAYER_NAMES, + MIN_TARGET_MODULES_FOR_OPTIMIZATION, + SEQ_CLS_HEAD_NAMES, +) +from peft.utils.integrations import init_empty_weights +from peft.utils.other import ( + AuxiliaryTrainingWrapper, + _set_adapter, + match_target_against_key, + set_additional_trainable_modules, +) +from peft.utils.peft_types import PeftType, TaskType + +from ..config import PeftConfig +from ..utils import _get_submodules +from ._buffer_dict import BufferDict + + +@contextmanager +def onload_layer(layer): + r""" + A utility for modifying a module containing one or more tuners and a base layer, any of which are offloaded to the + CPU or disk. Moves a module's sub-modules to the execution device before some action is performed, after that the + base layer state dictionary is re-assigned (if that layer was offloaded to the disk) and finally the parameters are + offloaded. + + If the module has no offloaded sub-modules, this function does nothing. 
+ + Args: + layer ('torch.nn.Module'): + layer with tuners to be merged + """ + + offloaded_modules = [] + for name, module in layer.named_modules(): + if name in ["", "base_layer"]: + continue + if hasattr(module, "_hf_hook") and isinstance(module._hf_hook, AlignDevicesHook) and module._hf_hook.offload: + module._hf_hook.pre_forward(module) + offloaded_modules.append(module) + + base_layer_offload = False + if hasattr(layer, "base_layer") and ( + hasattr(layer.base_layer, "_hf_hook") + and isinstance(layer.base_layer._hf_hook, AlignDevicesHook) + and layer.base_layer._hf_hook.offload + ): + # check if the base layer is disk-offloaded (must contain a 'dataset' and an offload index) + if torch.device("meta") in layer.base_layer._hf_hook.original_devices.values() and hasattr( + layer.base_layer._hf_hook.weights_map, "dataset" + ): + # find the disk-offload index (maps modules to safetensors) from the `dataset` (OffloadedWeightsLoader object) + index = layer.base_layer._hf_hook.weights_map.dataset.index + module_name = list(dict(layer.base_layer._hf_hook.weights_map.dataset).keys())[0] # any module will do + file_name = index[module_name]["safetensors_file"] + base_name_arr = [] + # get effective dir name + for i in os.path.split(file_name): + if "--" in i: + base_name_arr.append(i) + break + base_name_arr.append(i) + base_name = os.path.join(*base_name_arr) + safetensors_filename = base_name + "-merged" + layer.base_layer._hf_hook.pre_forward(layer.base_layer) + base_layer_offload = True + + yield + + for module in offloaded_modules: + module._hf_hook.post_forward(module, torch.tensor([])) + + if base_layer_offload: + # re-make weights map (must be on cpu to send params to the disk via memmap if disk offload) + layer.base_layer._hf_hook.weights_map = { + name: param.to("cpu") for name, param in named_module_tensors(layer.base_layer) + } + # offload weights map to disk if original device is the disk + if torch.device("meta") in 
layer.base_layer._hf_hook.original_devices.values() and hasattr( + layer.base_layer._hf_hook.weights_map, "dataset" + ): + # rewrite directory with merged weights + offload_state_dict(safetensors_filename, layer.base_layer._hf_hook.weights_map) + layer.base_layer._hf_hook.post_forward(layer.base_layer, torch.tensor([])) + + +def _check_lora_target_modules_mamba(peft_config: PeftConfig, model: nn.Module, target_name: str): + """ + Prevent applying LoRA to incompatible modules in specific architectures (e.g., Mamba). + """ + + lora_like_types = {"LORA", "ADALORA", "XLORA", "RANDLORA"} + incompatible_modules = {"out_proj", "conv1d"} + mamba_model_types = {"falcon_h1", "mamba", "mamba2", "falcon_mamba"} + + if ( + peft_config.peft_type in lora_like_types + and hasattr(model, "config") + and getattr(model.config, "model_type", None) in mamba_model_types + ): + if target_name in incompatible_modules: + raise ValueError( + f"[PEFT:{peft_config.peft_type}] Module '{target_name}' is incompatible with Mamba-based models " + f"(model_type='{model.config.model_type}'). Incompatible modules: {incompatible_modules}. " + "Please remove it from `target_modules` to avoid compatibility issues." + ) + + +def _get_in_out_features(module: nn.Module) -> tuple[int, int] | tuple[None, None]: + """ + Get the in_features and out_features of the layer. + + Returns in_features and out_features as a tuple. If they cannot be determined, return a tuple of None and None. + This function covers a broad range of layers, some of which the caller might not support. Therefore, just because + this function returns a valid result does not imply that the layer type is supported. 
+ """ + if isinstance(module, nn.Linear): + torch_supports_dtensor = version.parse(torch.__version__) >= version.parse("2.5.0") + if torch_supports_dtensor and isinstance(module.weight, torch.distributed.tensor.DTensor): + # If Tensor Parallel is used, the weight is sharded, so we need to get the local shape + out_features, in_features = module.weight.to_local().shape + else: + in_features, out_features = module.in_features, module.out_features + elif isinstance(module, nn.Conv1d): + in_features, out_features = module.in_channels, module.out_channels + elif isinstance(module, nn.Conv2d): + in_features, out_features = module.in_channels, module.out_channels + elif isinstance(module, nn.Conv3d): + in_features, out_features = module.in_channels, module.out_channels + elif isinstance(module, nn.Embedding): + in_features, out_features = module.num_embeddings, module.embedding_dim + elif isinstance(module, Conv1D): + in_features, out_features = ( + module.weight.ds_shape if hasattr(module.weight, "ds_shape") else module.weight.shape + ) + elif isinstance(module, nn.MultiheadAttention): + if not module._qkv_same_embed_dim: + raise ValueError("Only same dim for query/key/value is supported as of now for MultiheadAttention.") + in_features, out_features = module.embed_dim, 3 * module.embed_dim + elif hasattr(module, "infeatures") and hasattr(module, "outfeatures"): + # QuantLinear + in_features, out_features = module.infeatures, module.outfeatures + elif hasattr(module, "input_size") and hasattr(module, "output_size"): + # Megatron ColumnParallelLinear,RowParallelLinear + in_features, out_features = module.input_size, module.output_size + elif hasattr(module, "codebooks") and module.__class__.__name__ == "QuantizedLinear": + # AQLM QuantLinear + in_features, out_features = module.in_features, module.out_features + elif hasattr(module, "w_bit") and module.__class__.__name__ == "WQLinear_GEMM": + # Awq layers + in_features, out_features = module.in_features, 
module.out_features + elif module.__class__.__name__ == "EetqLinear": + # Eetq layers + in_features, out_features = module.in_features, module.out_features + elif hasattr(module, "W_q") and module.__class__.__name__ == "HQQLinear": + # HQQ layers + in_features, out_features = module.in_features, module.out_features + elif module.__class__.__name__ == "PatchedLinear": + # INC layers + in_features, out_features = module.in_features, module.out_features + else: + # possibly support user provided custom layer types using dynamic dispatch + if hasattr(module, "in_features") and hasattr(module, "out_features"): + in_features, out_features = module.in_features, module.out_features + else: + in_features, out_features = None, None + warnings.warn(f"Unsupported layer type '{type(module)}' encountered, proceed at your own risk.", UserWarning) + return in_features, out_features + + +class BaseTuner(nn.Module, ABC): + r""" + A base tuner model that provides the common methods and attributes for all tuners that are injectable into a + torch.nn.Module + + For adding a new Tuner class, one needs to overwrite the following methods: + + - **_prepare_adapter_config**: + A private method to eventually prepare the adapter config, for example in case the field `target_modules` is + missing. + - **_create_and_replace**: + A private method to create and replace the target module with the adapter module. + - **_check_target_module_exists**: + A private helper method to check if the passed module's key name matches any of the target modules in the + adapter_config. + + The easiest is to check what is done in the `peft.tuners.lora.LoraModel` class. + + Attributes: + model (`torch.nn.Module`): + The model to which the adapter tuner layers will be attached. + forward (`Callable`): + The forward method of the model. + peft_config (`Union[`PeftConfig`, dict[str, PeftConfig]]`): + The adapter configuration object, it should be a dictionary of `str` to `PeftConfig` objects. 
One can also + pass a PeftConfig object and a new adapter will be created with the default name `adapter` or create a new + dictionary with a key `adapter_name` and a value of that peft config. + config (`dict[str, Any]`): + The model configuration object, it should be a dictionary of `str` to `Any` objects. + targeted_module_names (`list[str]`): + The list of module names that were actually adapted. Can be useful to inspect if you want to quickly + double-check that the `config.target_modules` were specified correctly. + targeted_parameter_names (`list[str]`): + The list of parameter names that were actually adapted. Can be useful to inspect if you want to quickly + double-check that the `config.target_parameters` were specified correctly. + prefix (`str`) + The PEFT-method specific unique prefix. E.g. `"lora_"` for LoRA. + """ + + # Required attributes for child classes: + + # The unique prefix for this PEFT method, e.g. 'lora_' for LoRA. + prefix: str + # The class of the tuner layer, e.g. `LoraLayer` for LoRA. + tuner_layer_cls: type[BaseTunerLayer] + # The default target modules for various transformers model architectures, like Llama. This is useful to allow users + # to skip specifying the `target_modules` in the config of the PEFT method. The default is often something like + # `{'llama': ['q_proj', 'v_proj'], ...}`. + target_module_mapping: dict[str, list[str]] + + def __init__( + self, + model, + peft_config: Union[PeftConfig, dict[str, PeftConfig]], + adapter_name: str, + low_cpu_mem_usage: bool = False, + state_dict: Optional[dict[str, torch.Tensor]] = None, + ) -> None: + super().__init__() + + self.model = model + self.targeted_module_names: list[str] = [] + self.targeted_parameter_names: list[str] = [] + + # For advanced developers, if you want to attach multiple adapters to your + # model, just add a `peft_config` dict attribute to your model. 
+ if not hasattr(self, "peft_config"): + self.peft_config = {adapter_name: peft_config} if isinstance(peft_config, PeftConfig) else peft_config + else: + warnings.warn( + "Already found a `peft_config` attribute in the model. This will lead to having multiple adapters" + " in the model. Make sure to know what you are doing!" + ) + if isinstance(peft_config, PeftConfig): + self.peft_config[adapter_name] = peft_config + else: + # user is adding a dict of PeftConfigs + self.peft_config.update(peft_config) + + self.active_adapter: str | list[str] = adapter_name + self._pre_injection_hook(self.model, self.peft_config[adapter_name], adapter_name) + if peft_config != PeftType.XLORA or peft_config[adapter_name] != PeftType.XLORA: + self.inject_adapter(self.model, adapter_name, low_cpu_mem_usage=low_cpu_mem_usage, state_dict=state_dict) + + # Copy the peft_config in the injected model. + self.model.peft_config = self.peft_config + + @property + def active_adapters(self) -> list[str]: + if isinstance(self.active_adapter, str): + return [self.active_adapter] + # is already a list of str + return self.active_adapter + + def forward(self, *args: Any, **kwargs: Any): + return self.model.forward(*args, **kwargs) + + def _pre_injection_hook(self, model: nn.Module, config: PeftConfig, adapter_name: str) -> None: + r""" + A hook to be called before the adapter is injected into the model. This method can be overridden by child + classes to perform any pre-injection operations. + + Args: + model (`nn.Module`): + The model to be adapted. + config (`PeftConfig`): + The adapter config. + adapter_name (`str`): + The adapter name. + """ + pass + + def _prepare_adapter_config(self, peft_config: PeftConfig, model_config: dict) -> PeftConfig: + r""" + A private method to prepare the adapter config. 
+ + For transformers based models, if `peft_config.target_modules` is None, for some model architectures, we can + automatically infer the target modules from the `TRANSFORMERS_MODELS_TO_XXX_TARGET_MODULES_MAPPING`. + + Args: + peft_config (`PeftConfig`): + The adapter config. + model_config (`dict`): + The transformers model config, that config should contain the `model_type` key. + + Returns: + peft_config (`PeftConfig`): + The PEFT config with updated `target_modules`. + + Raises: + ValueError: + Raises an error if the model type was not recognized. + """ + if peft_config.target_modules is None: + target_modules = self.target_module_mapping.get(model_config["model_type"]) + if target_modules is None: + raise ValueError("Please specify `target_modules` in `peft_config`") + peft_config.target_modules = set(target_modules) + return peft_config + + def _prepare_model(self, peft_config: PeftConfig, model: nn.Module): + r""" + A private method to modify the model structure before adapter is applied. + + See `peft.tuner.lora.LoraModel._prepare_model` for an example. + + Args: + peft_config (`PeftConfig`): + The prepared adapter config. + model (`nn.Module`): + The model that is going to be adapted. + """ + pass + + @staticmethod + def _check_target_module_exists(peft_config: PeftConfig, key: str) -> bool | re.Match[str] | None: + """ + A helper method to check if the passed module's key name matches any of the target modules in the + adapter_config. + + Args: + config (`PeftConfig`): + A config to match target modules from. + key (`str`): + A key to search any matches in config. + + Returns: + `bool` | `re.Match[str]` | `None`: + True or re.Match object if key matches any target modules from config, False or None if no match found. 
+ """ + return check_target_module_exists(peft_config, key) + + @abstractmethod + def _create_and_replace( + self, + peft_config: PeftConfig, + adapter_name: str, + target: nn.Module, + target_name: str, + parent: nn.Module, + current_key: str, + parameter_name: Optional[str] = None, + ) -> None: + r""" + Inplace replacement of the target module with the adapter layer. This method needs to be overridden by all the + tuner classes. + + Check `peft.tuners.lora.LoraModel._create_and_replace` for an example. + + Args: + peft_config (`PeftConfig`): + The adapter config. + adapter_name (`str`): + The adapter name. + target (`nn.Module`): + The target module. + target_name (`str`): + The target module's name. + parent (`nn.Module`): + The parent module. + current_key (`str`): + The key of the current target being adapted. + parameter_name (`str`, *optional*) + If, and only if, an `nn.Parameter` is being targeted, this is the name of the parameter. + """ + ... + + def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: + """ + A helper method to mark only the adapter layers as trainable (i.e. module.requires_grad = False). + """ + for n, p in model.named_parameters(): + if self.prefix not in n: + p.requires_grad = False + + for active_adapter in self.active_adapters: + bias = getattr(self.peft_config[active_adapter], "bias", "none") + if bias == "none": + continue + + if bias == "all": + for n, p in model.named_parameters(): + if "bias" in n: + p.requires_grad = True + elif bias.endswith("_only"): # e.g. 
"lora_only" or "boft_only" + for m in model.modules(): + if isinstance(m, self.tuner_layer_cls) and hasattr(m, "bias") and m.bias is not None: + m.bias.requires_grad = True + else: + raise NotImplementedError(f"Requested bias: {bias}, is not implemented.") + + def _set_adapter_layers(self, enabled: bool = True) -> None: + for module in self.model.modules(): + if isinstance(module, (BaseTunerLayer, AuxiliaryTrainingWrapper)): + module.enable_adapters(enabled) + + def disable_adapter_layers(self) -> None: + """ + Disable all adapters in-place. + + When disabling all adapters, the model output corresponds to the output of the base model. + """ + # TODO: deprecate in favor of enable_adapters + for active_adapter in self.active_adapters: + bias_val = getattr(self.peft_config[active_adapter], "bias", "none") + if bias_val != "none": + msg = ( + f"Careful, disabling adapter layers with bias configured to be '{bias_val}' does not produce the " + "same output as the base model would without adaption." + ) + warnings.warn(msg) + self._set_adapter_layers(enabled=False) + + def enable_adapter_layers(self) -> None: + """ + Enable all adapters in-place + """ + # TODO: deprecate in favor of enable_adapters + self._set_adapter_layers(enabled=True) + + def delete_adapter(self, adapter_name: str) -> None: + """ + Deletes an existing adapter. + + Args: + adapter_name (str): Name of the adapter to be deleted. + """ + if adapter_name not in list(self.peft_config.keys()): + raise ValueError(f"Adapter {adapter_name} does not exist") + del self.peft_config[adapter_name] + + new_adapter = delete_adapter( + model=self.model, adapter_name=adapter_name, prefix=self.prefix, layer_cls=self.tuner_layer_cls + ) + self.active_adapter = new_adapter or [] + + def set_requires_grad(self, adapter_names: str | Sequence[str], requires_grad: bool = True) -> None: + """ + Enable or disable gradients on the given adapter(s). 
+ + Args: + adapter_name (`str` or `Sequence[str]`): + The name of the adapter(s) whose gradients should be enabled/disabled. + requires_grad (`bool`, *optional*) + Whether to enable (`True`, default) or disable (`False`). + """ + set_requires_grad(self.model, adapter_names=adapter_names, requires_grad=requires_grad) + + def _check_new_adapter_config(self, config: PeftConfig) -> None: + """ + A helper method to check the config of a new adapter being added. + + Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. + + """ + if len(self.peft_config) <= 1: + return + + # It is assumed that the config was added to self.peft_config *before* calling this check. We should thus never + # encounter the error below. Still, it is better to verify this, or else subsequent checks could be incorrect. + if not any(conf is config for conf in self.peft_config.values()): + raise ValueError( + "_check_new_peft_config was called incorrectly, this should not happen. Please open an issue and " + "report the error: https://github.com/huggingface/peft/issues" + ) + + bias_values = [getattr(conf, "bias", "none") for conf in self.peft_config.values()] + if sum(bias_value != "none" for bias_value in bias_values) > 1: + raise ValueError( + f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, " + "set bias to 'none' for all adapters." + ) + + def _cast_adapter_dtype(self, adapter_name: str, autocast_adapter_dtype: bool = True) -> None: + """ + A helper method to cast the adapter weights to the correct dtype. + + Currently, this only upcasts float16 and bfloat16 to float32. + + Args: + adapter_name (`str`): + The adapter name. + autocast_adapter_dtype (`bool`, *optional*): + Whether to autocast the adapter dtype. Defaults to `True`. 
+ + """ + cast_adapter_dtype(self.model, adapter_name=adapter_name, autocast_adapter_dtype=autocast_adapter_dtype) + + def _check_merge_allowed(self): + """Helper method to check whether the adapter can be merged. + + Raise a ValueError if it is not possible to merge the adapter with the given configuration. + """ + example_code = textwrap.dedent( + """ + ```python + from transformers import AutoModelForCausalLM + + # Load original tied model + model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b-it", tie_word_embeddings=False) + + # Set the randomly initialized lm_head to the previously tied embeddings + model.lm_head.weight.data = model.model.embed_tokens.weight.data.clone() + + # Save the untied model + untied_model_dir = "dir/for/untied/model" + model.save_pretrained(untied_model_dir) + model.config.save_pretrained(untied_model_dir) + + # Now use the original model but in untied format + model = AutoModelForCausalLM.from_pretrained(untied_model_dir) + ``` + """ + ) + tied_target_modules = self._get_tied_target_modules(self.model) + if tied_target_modules: + warnings.warn( + f"Model with `tie_word_embeddings=True` and the {tied_target_modules=} are part of the adapter. " + "This can lead to complications. " + "You can opt to merge the adapter after cloning the weights (to untie the embeddings). " + "You can untie the embeddings by loading the model with `tie_word_embeddings=False`. 
For example:" + + example_code + ) + + def _unload_and_optionally_merge( + self, + merge: bool = True, + progressbar: bool = False, + safe_merge: bool = False, + adapter_names: Optional[list[str]] = None, + ) -> None: + if merge: + self._check_merge_allowed() + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + desc = "Unloading " + ("and merging " if merge else "") + "model" + for key in tqdm(key_list, disable=not progressbar, desc=desc): + try: + parent, target, target_name = _get_submodules(self.model, key) + except AttributeError: + continue + with onload_layer(target): + if hasattr(target, "unload_and_optionally_merge_module"): + # if layers have special unloading method, like MultiheadAttention, use that + unloaded_module = target.unload_and_optionally_merge_module( + merge=merge, safe_merge=safe_merge, adapter_names=adapter_names + ) + self._replace_module(parent, target_name, unloaded_module, target) + elif hasattr(target, "base_layer"): + if merge: + target.merge(safe_merge=safe_merge, adapter_names=adapter_names) + self._replace_module(parent, target_name, target.get_base_layer(), target) + + return self.model + + def merge_and_unload( + self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None + ) -> torch.nn.Module: + r""" + This method merges the adapter layers into the base model. + + This is needed if someone wants to use the base model as a standalone model. The returned model has the same + architecture as the original base model. + + It is important to assign the returned model to a variable and use it, this is not an in-place operation! + + Args: + progressbar (`bool`): + whether to show a progressbar indicating the unload and merge process (default: False). + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights. 
+ adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + + Example: + + ```py + >>> from transformers import AutoModelForCausalLM + >>> from peft import PeftModel + + >>> model_id = ... + >>> base_model = AutoModelForCausalLM.from_pretrained(model_id) + >>> peft_model_id = ... + >>> model = PeftModel.from_pretrained(base_model, peft_model_id) + >>> merged_model = model.merge_and_unload() + ``` + """ + return self._unload_and_optionally_merge( + progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names + ) + + def unload(self) -> torch.nn.Module: + """ + Return the base model by removing all the PEFT modules. + + It is important to assign the returned model to a variable and use it, this is not an in-place operation! + """ + return self._unload_and_optionally_merge(merge=False) + + def _check_target_module_compatiblity(self, peft_config: PeftConfig, model: nn.Module, target_name: str): + """ + Prevent applying LoRA to incompatible modules in specific architectures (e.g., Mamba). + """ + _check_lora_target_modules_mamba(peft_config, model, target_name) + + def _create_and_replace_parameter( + self, peft_config, adapter_name, target, target_name, parent, current_key + ) -> None: + raise NotImplementedError(f"{self.__class__.__name__} does not support targeting nn.Parameter.") + + def inject_adapter( + self, + model: nn.Module, + adapter_name: str, + autocast_adapter_dtype: bool = True, + low_cpu_mem_usage: bool = False, + state_dict: Optional[dict[str, torch.Tensor]] = None, + ) -> None: + r""" + Creates adapter layers and replaces the target modules with the adapter layers. This method is called under the + hood by `peft.mapping.get_peft_model` if a non-prompt tuning adapter class is passed. + + The corresponding PEFT config is directly retrieved from the `peft_config` attribute of the BaseTuner class. 
+ + Args: + model (`nn.Module`): + The model to be tuned. + adapter_name (`str`): + The adapter name. + autocast_adapter_dtype (`bool`, *optional*): + Whether to autocast the adapter dtype. Defaults to `True`. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the loading process. + state_dict (`dict`, *optional*, defaults to `None`) + If a state_dict is passed here, the adapters will be injected based on the entries of the state_dict. + This can be useful when the exact `target_modules` of the PEFT method is unknown, for instance because + the checkpoint was created without meta data. Note that the values from the state_dict are not used, + only the keys are used to determine the correct layers that should be adapted. + + """ + ################################### + # PREPARATION OF MODEL AND CONFIG # + ################################### + + peft_config = self.peft_config[adapter_name] + excluded_modules = [] + unmatched_modules = [] + targeted_modules_from_peft_config: list[str] = [] # only relevant if state_dict is passed + # Note: If possible, all checks should be performed *at the start of this method*. + # This way, we can raise early if something goes wrong, without leaving the model + # in a bad (half-initialized) state. + self._check_new_adapter_config(peft_config) + + model_config = self.get_model_config(model) + + peft_config = self._prepare_adapter_config(peft_config, model_config) + + self._prepare_model(peft_config, model) + + if getattr(peft_config, "target_parameters", []) and state_dict: + raise ValueError( + "Trying to inject a PEFT adapter from a state_dict but the PEFT config uses `target_parameters`. This " + "is not supported -- when using `target_parameters`, please inject the adapter without the state_dict." 
+ ) + + named_modules = list(model.named_modules()) + key_list = [key for key, _ in named_modules] + + uses_dummy_target_modules = getattr(peft_config, "target_modules", None) == DUMMY_TARGET_MODULES + if uses_dummy_target_modules: + # dummy adapter, we allow not matching any module + named_modules = [] + key_list = [] + + # update peft_config.target_modules if required + peft_config = _maybe_include_all_linear_layers(peft_config, model) + + # This is an optimization to reduce the number of entries in the target_modules list. The reason is that in some + # circumstances, target_modules can contain hundreds of entries. Since each target module is checked against + # each module of the net (which can be thousands), this can become quite expensive when many adapters are being + # added. Often, the target_modules can be condensed in such a case, which speeds up the process. + # A context in which this can happen is when diffusers loads non-PEFT LoRAs. As there is no meta info on + # target_modules in that case, they are just inferred by listing all keys from the state_dict, which can be + # quite a lot. See: https://github.com/huggingface/diffusers/issues/9297 + # As there is a small chance for undiscovered bugs, we apply this optimization only if the list of + # target_modules is sufficiently big. + # We also exclude IA³ from this optimization. This is because IA³ has both target_modules and + # feedforward_modules, which are coupled (the latter must be a subset). It would be possible to change the logic + # to keep both in sync, but it's not quite trivial and probably not worth the effort. See #2429. + if ( + isinstance(peft_config.target_modules, (list, set)) + and (len(peft_config.target_modules) >= MIN_TARGET_MODULES_FOR_OPTIMIZATION) + and (peft_config.peft_type != PeftType.IA3) + ): + suffixes = tuple("." 
+ suffix for suffix in peft_config.target_modules) + names_no_target = [ + name for name in key_list if (name not in peft_config.target_modules) and not name.endswith(suffixes) + ] + new_target_modules = _find_minimal_target_modules(peft_config.target_modules, names_no_target) + if len(new_target_modules) < len(peft_config.target_modules): + peft_config.target_modules = new_target_modules + + ############################### + # MATCHING & CREATING MODULES # + ############################### + + existing_adapter_prefixes = [] + for key, module in named_modules: + if isinstance(module, BaseTunerLayer): + existing_adapter_prefixes.append(key + ".") + + # TODO: check if this the most robust way + module_names: set[str] = set() + if state_dict is not None: + prefix = PEFT_TYPE_TO_PREFIX_MAPPING[peft_config.peft_type] + module_names = {k.rsplit("." + prefix, 1)[0] for k in state_dict} + + for key, module in named_modules: + if not key: + continue + + # It is possible that we're adding an additional adapter, so if we encounter a key that clearly belongs to a + # previous adapter we can skip here since we don't want to interfere with adapter internals. 
+ for adapter_key in existing_adapter_prefixes: + if key.startswith(adapter_key): + excluded_modules.append(key) + break + + if excluded_modules and excluded_modules[-1] == key: + continue + + if state_dict is None: + # normal mechanism: match the modules using the peft_config + result = self._check_target_module_exists(peft_config, key) + if isinstance(result, _ExcludedModule): + excluded_modules.append(key) + elif not result: + unmatched_modules.append(key) + else: + self.targeted_module_names.append(key) + parent, target, target_name = _get_submodules(model, key) + self._check_target_module_compatiblity(peft_config, model, target_name) + ctx = init_empty_weights if low_cpu_mem_usage else nullcontext + with ctx(): + self._create_and_replace( + peft_config, adapter_name, target, target_name, parent, current_key=key + ) + else: + # use the state_dict to match modules instead + if key not in module_names: + unmatched_modules.append(key) + else: + self.targeted_module_names.append(key) + parent, target, target_name = _get_submodules(model, key) + self._check_target_module_compatiblity(peft_config, model, target_name) + ctx = init_empty_weights if low_cpu_mem_usage else nullcontext + with ctx(): + self._create_and_replace( + peft_config, adapter_name, target, target_name, parent, current_key=key + ) + + # still record what would have been matched via the config so that the two results can be compared + if self._check_target_module_exists(peft_config, key): + targeted_modules_from_peft_config.append(key) + + if getattr(peft_config, "target_parameters", []): + # Note: We don't need to check for no state_dict being passed, since we already checked this earlier. 
+ self._inject_parameters( + peft_config=peft_config, model=model, adapter_name=adapter_name, low_cpu_mem_usage=low_cpu_mem_usage + ) + + #################### + # CHECK FOR ERRORS # + #################### + + if state_dict is not None: + # in case that the state_dict was used as source of truth and it resulted in different outcomes than what + # would have been matched with the PEFT config, warn the user about that. + targeted_set_from_peft_config = set(targeted_modules_from_peft_config) + targeted_set_from_state_dict = set(self.targeted_module_names) + diff_peft_config = targeted_set_from_peft_config - targeted_set_from_state_dict + diff_state_dict = targeted_set_from_state_dict - targeted_set_from_peft_config + warning_msg = "" + if diff_peft_config or diff_state_dict: + warning_msg = ( + "While injecting the PEFT adapters, an inconsistency was discovered between the PEFT config and " + "the provided state_dict. This is not necessarily an issue and can be ignored if this was the " + "intent. " + ) + if diff_peft_config: + warning_msg += ( + f"The PEFT config contained these additional target modules: {sorted(diff_peft_config)}. " + ) + if diff_state_dict: + warning_msg += f"The state_dict contained these additional target modules: {sorted(diff_state_dict)}. " + if warning_msg: + warnings.warn(warning_msg, RuntimeWarning) + + if not self.targeted_module_names and not self.targeted_parameter_names and not uses_dummy_target_modules: + if excluded_modules and not unmatched_modules: + # All targeted modules were excluded + raise ValueError( + "All modules were excluded. This is likely unintended. " + "Check your `target_modules`, `exclude_modules` and `modules_to_save` configuration." + ) + elif not excluded_modules and unmatched_modules and not peft_config.target_modules: + raise ValueError( + "No `target_modules` passed but also no `target_parameters` found. Please check the values for " + "these arguments." 
+ ) + elif not excluded_modules and unmatched_modules: + # None of the targeted modules matched + error_msg = ( + f"Target modules {peft_config.target_modules} not found in the base model. " + f"Please check the target modules and try again." + ) + if getattr(peft_config, "layers_to_transform", None) is not None: + error_msg += f" Note: You specified 'layers_to_transform': {peft_config.layers_to_transform}." + if getattr(peft_config, "layers_pattern", None) is not None: + error_msg += f" You also specified 'layers_pattern': {peft_config.layers_pattern}." + raise ValueError(error_msg) + else: + # Some modules did not match and some matched but were excluded + error_msg = ( + "No modules were targeted for adaptation. " + "This might be caused by a combination of mismatched target modules and excluded modules. " + "Please check your `target_modules` and `exclude_modules` configuration. You may also have " + "only targeted modules that are marked to be saved (`modules_to_save`)." + ) + if getattr(peft_config, "layers_to_transform", None) is not None: + error_msg += f" Note: You specified 'layers_to_transform': {peft_config.layers_to_transform}." + if getattr(peft_config, "layers_pattern", None) is not None: + error_msg += f" You also specified 'layers_pattern': {peft_config.layers_pattern}." + raise ValueError(error_msg) + + elif hasattr(peft_config, "exclude_modules") and peft_config.exclude_modules and not excluded_modules: + # exclude_modules was passed but was not used + warnings.warn( + f"You have passed exclude_modules={peft_config.exclude_modules} but no modules were excluded. " + "Please check that exclude_modules was set correctly." + ) + + elif not uses_dummy_target_modules: + # If we landed here, it means that at least one module or parameter was adapted, so let's not raise an + # error. 
However, let's warn the user if it seems like + # - they wanted to match a module but there was no match + # - they wanted to match a parameter but there was no match + if peft_config.target_modules and not self.targeted_module_names: + warnings.warn( + f"target_modules={peft_config.target_modules} were set but no module was matched.", RuntimeWarning + ) + elif getattr(peft_config, "target_parameters", []) and not self.targeted_parameter_names: + warnings.warn( + f"target_parameters={peft_config.target_parameters} were set but no parameter was matched.", + RuntimeWarning, + ) + + tied_target_modules = self._get_tied_target_modules(model=model) + if tied_target_modules: + warnings.warn( + f"Model with `tie_word_embeddings=True` and the {tied_target_modules=} are part of the adapter. " + "This can lead to complications, for example when merging the adapter " + "or converting your model to formats other than safetensors. " + "See for example https://github.com/huggingface/peft/issues/2018." + ) + + ################ + # HOUSEKEEPING # + ################ + + # It's important to set the adapter here (again), because otherwise it can happen that if a 2nd adapter is + # added, and it targets different layer(s) than the first adapter (which is active), then those different + # layers will be activated, which we don't want. 
+ self.set_adapter(self.active_adapters, inference_mode=peft_config.inference_mode) + self._mark_only_adapters_as_trainable(model) + + if self.peft_config[adapter_name].inference_mode: + for n, p in model.named_parameters(): + if adapter_name in n: + p.requires_grad = False + + set_additional_trainable_modules( + model=model, + peft_config=peft_config, + model_config=BaseTuner.get_model_config(self), + adapter_name=adapter_name, + activate_adapter=adapter_name in self.active_adapters, + ) + + def _inject_parameters( + self, peft_config: PeftConfig, model: nn.Module, adapter_name: str, low_cpu_mem_usage: bool + ) -> None: + """Inject layers based on peft_config.target_modules""" + + def strip_base_layer_from_name(module_name): + # It is possible that the layer is already a PEFT layer and needs updating with a new adapter. In this case, + # the name of parameter would be something like `model.layers.0.experts.base_layer.weight`, i.e. there is a + # "base_layer" inserted in the name. We need to remove that, otherwise we won't be able to match correctly + # (in this case, "experts.weight" would not match). + name = ".base_layer" + while name in module_name: + prefix, _, suffix = module_name.rpartition(name) + module_name = prefix + suffix + return module_name + + def create_and_replace_param(module_name, key, param_name): + # helper function to avoid duplication + parent, target, target_name = _get_submodules(model, module_name) + unwrapped_module_name = strip_base_layer_from_name(module_name) + unwrapped_module = model.get_submodule(unwrapped_module_name) + # use the class name for checking to avoid circular import + if isinstance(unwrapped_module, BaseTunerLayer) and unwrapped_module.__class__.__name__ != "ParamWrapper": + raise ValueError( + f"Trying to wrap an `nn.Parameter` of layer '{unwrapped_module_name}' of type " + f"{type(target).__name__}, which is not a valid target. Make sure that this layer is not " + "also targeted with `target_modules`. 
For some models, PEFT will do this automatically, " + "try setting `target_modules=[]` to prevent it." + ) + + self._check_target_module_compatiblity(peft_config, model, target_name) + ctx = init_empty_weights if low_cpu_mem_usage else nullcontext + with ctx(): + self._create_and_replace( + peft_config, + adapter_name, + target, + target_name, + parent, + current_key=key, + parameter_name=param_name.rpartition(".")[-1], + ) + + # TODO very simple matching, might not cover all use cases + unsorted_target_names = set(peft_config.target_parameters) + # As the order of matching can influence the nesting of multiple params on the same module, ensure determinism + # by sorting. + target_names = sorted(unsorted_target_names) + for module_name, module in model.named_modules(): + if hasattr(module, "parametrizations"): + # Deal with the case that the parameter is already parametrized. The issue is that we would not be able + # to match `f"{module_name}.{param_name}"`, as the parameter is now something like + # `module.parametrization.weight`. + for key in target_names: + target_module_name, _, param_name = key.rpartition(".") + if target_module_name != module_name: + continue + if getattr(module, param_name, None) is None: + continue + create_and_replace_param(module_name, key, param_name) + self.targeted_parameter_names.append(key) + else: + # Standard case: the parameter is not already parametrized. Note, however, that the model could already + # be nested with lora.ParamWrapper, as this is how we allow targeting multiple Parameters on the same + # module. 
+ unwrapped_module_name = strip_base_layer_from_name(module_name) + # we're interested in finding the "lowest" module that contains the parameter, hence recurse=False + for param_name, param in module.named_parameters(recurse=False): + key = f"{unwrapped_module_name}.{param_name}" + if (key in target_names) or any(key.endswith(f".{target_key}") for target_key in target_names): + # Note: We use the unwrapped_module_name to check if the key matches, but we use the module_name for + # replacement, since we want to replace the wrapped module. + create_and_replace_param(module_name, key, param_name) + self.targeted_parameter_names.append(key) + + def _replace_module(self, parent, child_name, new_module, child) -> None: + """ + Replace the sub-module of a given moduel with a new PEFT module. + + This also deals with device placement of the new module to be in line with the child module. + + Args: + parent (`nn.Module`): + The parent module on which the replacement should take place. + child_name (`str`): + The name of the child module to be replaced. + new_module (`nn.Module`): + The new PEFT module. + child (`nn.Module`): + The original child module that is being replaced. 
+ + """ + setattr(parent, child_name, new_module) + # It's not necessary to set requires_grad here, as that is handled by + # _mark_only_adapters_as_trainable + + # child layer wraps the original module, unpack it + if hasattr(child, "base_layer"): + child = child.base_layer + + if not hasattr(new_module, "base_layer"): + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + + if getattr(child, "state", None) is not None: + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state + new_module.to(child.weight.device) + + meta = torch.device("meta") + # dispatch to correct device + for name, module in new_module.named_modules(): + if self.prefix in name: + if hasattr(child, "qweight"): + weight = child.qweight + elif hasattr(child, "W_q"): + weight = child.W_q + elif hasattr(child, "weight"): + weight = child.weight + elif getattr(child, "in_proj_weight", None) is not None: # MHA + weight = child.in_proj_weight + else: + weight = next(child.parameters()) + + if not any(p.device == meta for p in module.parameters()): + module.to(weight.device) + + def merge_adapter(self, adapter_names: Optional[list[str]] = None, safe_merge: bool = False) -> None: + """ + This method merges the adapter layers into the base model. + + Merging adapters can lead to a speed up of the forward pass. A copy of the adapter weights is still kept in + memory, which is required to unmerge the adapters. In order to merge the adapter weights without keeping them + in memory, please call `merge_and_unload`. + + Args: + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If `None`, all active adapters will be merged. + Defaults to `None`. + safe_merge (`bool`, *optional*): + If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. 
This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + """ + # Note: The order of arguments here is: + # adapter_names, safe_merge + # For layer.merge, the order is: + # safe_merge, adapter_names + # This is not so nice but this method here started with only adapter_names, thus putting safe_merge first would + # be a backwards incompatible change. + self._check_merge_allowed() + for module in self.model.modules(): + if isinstance(module, BaseTunerLayer): + with onload_layer(module): + module.merge(adapter_names=adapter_names, safe_merge=safe_merge) + + def unmerge_adapter(self): + """ + This method unmerges all merged adapter layers from the base model. + """ + for module in self.model.modules(): + if isinstance(module, BaseTunerLayer): + with onload_layer(module): + module.unmerge() + + def set_auxiliary_adapters(self, adapter_name: str | list[str], inference_mode: bool) -> None: + """ + Sets the active adapter(s) on auxiliary modules. + + If the subclass (e.g. `LoraModel`) supports auxiliary modules like `modules_to_save`, it should call this + method in `set_adapter` to ensure that those auxiliary modules are being set correctly. + + Args: + adapter_name (`str` or `list[str]`): + The name(s) of the adapter(s) to be set as active. The adapters must be loaded first. + inference_mode (bool, optional): + Whether the activated adapter should be frozen (i.e. `requires_grad=False`). Default is False. + """ + _set_adapter(self, adapter_name, inference_mode=inference_mode) + + def set_adapter(self, adapter_name: str | list[str], inference_mode: bool = False) -> None: + """Set the active adapter(s). + + Args: + adapter_name (str, list[str]): + The name(s) of the adapter(s) to set as active + inference_mode (bool, optional): + Whether the activated adapter should be frozen (i.e. `requires_grad=False`). Default is False. 
+ """ + set_adapter( + self.model, adapter_name=adapter_name, inference_mode=inference_mode, layer_cls=self.tuner_layer_cls + ) + self.active_adapter = adapter_name + + @staticmethod + def get_model_config(model: nn.Module) -> dict: + """ + This method gets the config from a model in dictionary form. If model has not attribute config, then this + method returns a default config. + + Args: + model (`nn.Module`): + Model to get the config from. + default (`dict|None`, *optional*):: + What to return if model does not have a config attribute. + """ + model_config = getattr(model, "config", DUMMY_MODEL_CONFIG) + if hasattr(model_config, "to_dict"): + model_config = model_config.to_dict() + elif dataclasses.is_dataclass(model_config): + model_config = dataclasses.asdict(model_config) + return model_config + + def _get_tied_target_modules(self, model: nn.Module) -> list[str]: + tied_target_modules = [] + model_config = self.get_model_config(model) + if model_config.get("tie_word_embeddings"): + for target_module in self.targeted_module_names: + # This potentially yields false positives since we're just looking at the layer names. So if we use a + # model that uses weight-tying of lm_head and embed_tokens, a third, unrelated, layer which is + # unfortunately named so that it is in EMBEDDING_LAYER_NAMES will be falsely reported here as well. + if target_module.split(".")[-1] in EMBEDDING_LAYER_NAMES: + tied_target_modules.append(target_module) + return tied_target_modules + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + if name == "model": # see #1892: prevent infinite recursion if class is not initialized + raise + return getattr(self.model, name) + + +class BaseTunerLayer(ABC): + r""" + A tuner layer mixin that provides the common methods and attributes for all tuners. 
+ + Args: + is_pluggable (`bool`, *optional*): + Whether the adapter layer can be plugged to any pytorch module + active_adapters (Union[List[`str`], `str`], *optional*): + The name of the active adapter. + """ + + # All names of layers that may contain adapter (trainable) weights + adapter_layer_names: tuple[str, ...] = () + # All names of other parameters that may contain adapter-related parameters + other_param_names: tuple[str, ...] = () + + # indicates whether all adapters should be disabled + _disable_adapters: bool = False + + # the currently active adapter(s) + _active_adapter: str | list[str] = "default" + + # List all merged adapters + merged_adapters: list[str] = [] + + def get_base_layer(self) -> nn.Module: + """ + (Recursively) get the base_layer. + + This is necessary for the case that the tuner layer wraps another tuner layer. + + """ + base_layer = self + while hasattr(base_layer, "base_layer"): + base_layer = base_layer.base_layer + return base_layer + + @property + def weight(self) -> torch.Tensor: + # This is required for some transformers code, e.g. for T5, weight is accessed as: + # self.wo.weight + # where "wo" is the adapter layer. 
+ # https://github.com/huggingface/transformers/blob/78f6ed6c70b29c1560780e3869a7ad4c6b3d2710/src/transformers + # /models/t5/modeling_t5.py#L292 + base_layer = self.get_base_layer() + if hasattr(base_layer, "qweight"): + # QuantLinear + weight = base_layer.qweight + else: + # Other layers + weight = base_layer.weight + return weight + + @property + def bias(self) -> torch.Tensor: + base_layer = self.get_base_layer() + return base_layer.bias + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + raise NotImplementedError + + def unmerge(self) -> None: + raise NotImplementedError + + @property + def merged(self) -> bool: + return bool(self.merged_adapters) + + @property + def disable_adapters(self) -> bool: + # use a property to ensure that disable_adapters is not set directly, instead use the enable_adapters method + return self._disable_adapters + + @property + def active_adapter(self) -> str | list[str]: + # use a property to ensure that active_adapter is not set directly, instead use the set_adapter method + return self._active_adapter + + def _get_available_adapters(self) -> set[str]: + """Return all adapter names that can be found on this module.""" + adapters = set() + for layer_name in self.adapter_layer_names: + module = getattr(self, layer_name) + if not isinstance(module, (nn.ModuleDict, nn.ParameterDict)): + continue + adapters.update(set(module.keys())) + return adapters + + @property + def active_adapters(self): + if isinstance(self.active_adapter, str): + return [self.active_adapter] + # is already a list of str + return self.active_adapter + + def enable_adapters(self, enabled: bool) -> None: + """Toggle the enabling and disabling of adapters + + Takes care of setting the requires_grad flag for the adapter weights. 
+ + Args: + enabled (bool): True to enable adapters, False to disable adapters + """ + if enabled: + self.set_adapter(self.active_adapters) + self._disable_adapters = False + else: + # disable grads on all adapter layers + for layer_name in self.adapter_layer_names: + layer = getattr(self, layer_name) + layer.requires_grad_(False) + self._disable_adapters = True + + def set_adapter(self, adapter_names: str | list[str], inference_mode: bool = False) -> None: + """Set the active adapter(s). + + Additionally, this function will set the specified adapter to trainable (i.e., requires_grad=True) unless + inference_mode is True. + + Args: + adapter_name (`str` or `list[str]`): + The name(s) of the adapter(s) to set as active. + inference_mode (bool, optional): + Whether the activated adapter should be frozen (i.e. `requires_grad=False`). Default is False. + """ + if isinstance(adapter_names, str): + adapter_names = [adapter_names] + + # Deactivate grads on the inactive adapter and activate grads on the active adapter (if not in inference mode) + for layer_name in self.adapter_layer_names: + module_dict = getattr(self, layer_name) + for key, layer in module_dict.items(): + if (key in adapter_names) and (not inference_mode): + # Note: It is possible that not a single layer is called with requires_grad_(True) here. This may + # happen if a completely different adapter layer is being activated. 
+ layer.requires_grad_(True) + else: + layer.requires_grad_(False) + + self._active_adapter = adapter_names + + def _all_available_adapter_names(self) -> list[str]: + """Return a sorted list of all available adapter names""" + adapter_names = set() + for name in self.adapter_layer_names + self.other_param_names: + # we check each possible attribute and if it's a dict or ModuleDict, we assume that the keys are the adapter + # names + attr = getattr(self, name) + if hasattr(attr, "keys"): + adapter_names.update(attr.keys()) + return sorted(adapter_names) + + def delete_adapter(self, adapter_name: str) -> None: + """ + Delete an adapter from the layer + + This should be called on all adapter layers, or else we will get an inconsistent state. + + This method will also set a new active adapter if the deleted adapter was an active adapter. It is important + that the new adapter is chosen in a deterministic way, so that the same adapter is chosen on all layers. + + Args: + adapter_name (`str`): The name of the adapter to delete + + """ + for attr in self.adapter_layer_names + self.other_param_names: + if adapter_name in getattr(self, attr): + del getattr(self, attr)[adapter_name] + + if adapter_name in self.active_adapters: + # choose a new active adapter + active_adapters = self.active_adapters[:] + active_adapters.remove(adapter_name) + if active_adapters: + self.set_adapter(active_adapters) + else: + # no active adapters left, set a new default adapter + # here we get the list of all adapters existing adapter names and choose the first one + remaining_adapters = self._all_available_adapter_names() + if not remaining_adapters: + self.set_adapter([]) + else: + new_active_adapter = remaining_adapters[0] + warnings.warn( + f"Adapter {adapter_name} was active which is now deleted. Setting active adapter to " + f"{new_active_adapter}." 
+ ) + self.set_adapter(remaining_adapters[0]) + + def set_requires_grad(self, adapter_names: str | Sequence[str], requires_grad: bool = True) -> None: + """ + Enable or disable gradients on the given adapter(s). + + Args: + adapter_name (`str` or `Sequence[str]`): + The name of the adapter(s) whose gradients should be enabled/disabled. + requires_grad (`bool`, *optional*) + Whether to enable (`True`, default) or disable (`False`). + """ + if isinstance(adapter_names, str): + adapter_names_set = {adapter_names} + else: + adapter_names_set = set(adapter_names) + + for layer_name in self.adapter_layer_names: + module_dict = getattr(self, layer_name) + for key, layer in module_dict.items(): + if key in adapter_names_set: + layer.requires_grad_(requires_grad) + + def _move_adapter_to_device_of_base_layer(self, adapter_name: str, device: Optional[torch.device] = None) -> None: + """ + Move the adapter of the given name to the device of the base layer. + """ + if device is None: + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.MultiheadAttention): + base_layer = base_layer.out_proj + # check weight and qweight (for GPTQ) + for weight_name in ("weight", "qweight"): + weight = getattr(base_layer, weight_name, None) + if weight is not None: + device = weight.device + dtype = weight.dtype + break + else: + # no break encountered: could not determine the device + return + + meta = torch.device("meta") + + # loop through all potential adapter layers and move them to the device of the base layer; be careful to only + # move this specific adapter to the device, as the other adapters could be on different devices + # see #1639 + for adapter_layer_name in self.adapter_layer_names + self.other_param_names: + adapter_layer = getattr(self, adapter_layer_name, None) + if not isinstance(adapter_layer, (nn.ModuleDict, nn.ParameterDict, BufferDict)): + continue + if adapter_name not in adapter_layer: + continue + if any(p.device == meta for p in 
adapter_layer.parameters()): + continue + + # TODO: weight is not necessarily defined here, leading to a NameError, fix that + if weight.dtype.is_floating_point or weight.dtype.is_complex: + adapter_layer[adapter_name] = adapter_layer[adapter_name].to(device, dtype=dtype) + else: + adapter_layer[adapter_name] = adapter_layer[adapter_name].to(device) + + @overload + def _cast_input_dtype(self, x: None, dtype: torch.dtype) -> None: ... + + @overload + def _cast_input_dtype(self, x: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: ... + + def _cast_input_dtype(self, x, dtype: torch.dtype): + """ + Whether to cast the dtype of the input of the forward method. + + Usually, we want to enable this to align the input dtype with the dtype of the weight, but by setting + layer.cast_input_dtype=False, this can be disabled if necessary. + + Enabling or disabling can be managed via the peft.helpers.disable_lora_input_dtype_casting context manager. + """ + if x is None: # useful e.g. if x is the bias, which can be None + return None + + cast_input_dtype_enabled = getattr(self, "cast_input_dtype_enabled", True) + if (not cast_input_dtype_enabled) or (x.dtype == dtype): + return x + return x.to(dtype=dtype) + + +def _find_minimal_target_modules( + target_modules: list[str] | set[str], other_module_names: list[str] | set[str] +) -> set[str]: + """Find the minimal set of target modules that is sufficient to separate them from the other modules. + + Sometimes, a very large list of target_modules could be passed, which can slow down loading of adapters (e.g. when + loaded from diffusers). It may be possible to condense this list from hundreds of items to just a handful of + suffixes that are sufficient to distinguish the target modules from the other modules. 
+ + Example: + ```py + >>> from peft.tuners.tuners_utils import _find_minimal_target_modules + + >>> target_modules = [f"model.decoder.layers.{i}.self_attn.q_proj" for i in range(100)] + >>> target_modules += [f"model.decoder.layers.{i}.self_attn.v_proj" for i in range(100)] + >>> other_module_names = [f"model.encoder.layers.{i}.self_attn.k_proj" for i in range(100)] + >>> _find_minimal_target_modules(target_modules, other_module_names) + {"q_proj", "v_proj"} + ``` + + Args: + target_modules (`list[str]` | `set[str]`): + The list of target modules. + other_module_names (`list[str]` | `set[str]`): + The list of other module names. They must not overlap with the target modules. + + Returns: + `set[str]`: + The minimal set of target modules that is sufficient to separate them from the other modules. + + Raises: + ValueError: + If `target_modules` is not a list or set of strings or if it contains an empty string. Also raises an error + if `target_modules` and `other_module_names` contain common elements. + """ + if isinstance(target_modules, str) or not target_modules: + raise ValueError("target_modules should be a list or set of strings.") + + target_modules = set(target_modules) + if "" in target_modules: + raise ValueError("target_modules should not contain an empty string.") + + other_module_names = set(other_module_names) + if not target_modules.isdisjoint(other_module_names): + msg = ( + "target_modules and other_module_names contain common elements, this should not happen, please " + "open a GitHub issue at https://github.com/huggingface/peft/issues with the code to reproduce this issue" + ) + raise ValueError(msg) + + # it is assumed that module name parts are separated by a "." 
+ def generate_suffixes(s): + parts = s.split(".") + return [".".join(parts[i:]) for i in range(len(parts))][::-1] + + # Create a reverse lookup for other_module_names to quickly check suffix matches + other_module_suffixes = {suffix for item in other_module_names for suffix in generate_suffixes(item)} + + # Find all potential suffixes from target_modules + target_modules_suffix_map = {item: generate_suffixes(item) for item in target_modules} + + # Initialize a set for required suffixes + required_suffixes = set() + + # We sort the target_modules_suffix_map simply to get deterministic behavior, since sets have no order. In theory + # the order should not matter but in case there is a bug, it's better for the bug to be deterministic. + for item, suffixes in sorted(target_modules_suffix_map.items(), key=lambda tup: tup[1]): + # Go through target_modules items, shortest suffixes first + for suffix in suffixes: + # If the suffix is already in required_suffixes or matches other_module_names, skip it + if suffix in required_suffixes or suffix in other_module_suffixes: + continue + # Check if adding this suffix covers the item + if not any(item.endswith("." + req_suffix) for req_suffix in required_suffixes): + required_suffixes.add(suffix) + break + + if not required_suffixes: + return set(target_modules) + return required_suffixes + + +class _ExcludedModule: + """ + A private helper method used to represent excluded modules in the check_target_module_exists function. + """ + + def __bool__(self): + return False + + +def check_target_module_exists(config, key: str) -> bool | re.Match[str] | None: + """A helper method to check if the passed module's key name matches any of the target modules in the adapter_config. + + Args: + config (`PeftConfig`): + A config to match target modules from. 
+ key (`str`): + A key to search any matches in config + + Returns: + `bool` | `re.Match[str]` | `None`: + True or re.Match object if key matches any target modules from config, False or None if no match found. + """ + if hasattr(config, "exclude_modules") and config.exclude_modules: + if isinstance(config.exclude_modules, str): + if re.fullmatch(config.exclude_modules, key): + return _ExcludedModule() + elif key in config.exclude_modules: + return _ExcludedModule() + elif any(key.endswith(f".{exclude_key}") for exclude_key in config.exclude_modules): + return _ExcludedModule() + + # Adapters should never match on modules to save modules as it is a guarantee for conflicts of behavior + # between `ModulesToSaveWrapper` internals and the potential adapter. + modules_to_save = getattr(config, "modules_to_save", None) + if modules_to_save: + if any(re.match(rf"(^|.*\.){m}($|\..*)", key) for m in modules_to_save): + return _ExcludedModule() + + if (config.target_modules is None) and (config.target_parameters is not None): + # this is allowed if config.target_parameters are specified + return False + + if isinstance(config.target_modules, str): + target_module_found = match_target_against_key(config.target_modules, key) + elif key in config.target_modules: + # this module is specified directly in target_modules + target_module_found = True + else: + target_module_found = any(key.endswith(f".{target_key}") for target_key in config.target_modules) + + layer_indexes = getattr(config, "layers_to_transform", None) + layers_pattern = getattr(config, "layers_pattern", None) + + is_using_layer_indexes = layer_indexes is not None and ( + len(layer_indexes) != 0 if isinstance(layer_indexes, list) else True + ) + if is_using_layer_indexes and target_module_found: + layer_index = None + # TODO: It's still unclear how empty layers_pattern (None, [], or "") should behave + # For now, empty layers_pattern means any layer pattern is ok + if layers_pattern is None or len(layers_pattern) 
== 0: + layer_index = re.match(r".*\.[^.]*\.(\d+)\.", key) + else: + layers_pattern = [layers_pattern] if isinstance(layers_pattern, str) else layers_pattern + for pattern in layers_pattern: + layer_index = re.match(rf".*\.{pattern}\.(\d+)\.", key) + if layer_index is not None: + break + + if layer_index is None: + target_module_found = False + else: + layer_index = int(layer_index.group(1)) + if isinstance(layer_indexes, int): + target_module_found = layer_index == layer_indexes + else: + target_module_found = layer_index in layer_indexes + + return target_module_found + + +def inspect_matched_modules(tuner: BaseTuner, adapter_name: str = "default") -> dict: + """ + A helper function to inspect the set of matched and unmatched modules for a PEFT model and the given adapter. + """ + config = tuner.peft_config[adapter_name] + key_list = [key for key, _ in tuner.model.named_modules()] + module_dict = {"matched": [], "unmatched": []} + for key in key_list: + if tuner._check_target_module_exists(config, key): + module_dict["matched"].append(key) + else: + module_dict["unmatched"].append(key) + return module_dict + + +def _maybe_include_all_linear_layers(peft_config: PeftConfig, model: nn.Module) -> PeftConfig: + """ + Helper function to update `target_modules` to all linear/Conv1D layers if provided as 'all-linear'. Adapted from + the QLoRA repository: https://github.com/artidoro/qlora/blob/main/qlora.py + """ + if not hasattr(peft_config, "target_modules"): + return peft_config + + # if `target_modules` is a string, convert to lower case and check if it matches "all-linear" + if not ( + isinstance(peft_config.target_modules, str) + and peft_config.target_modules.lower() == INCLUDE_LINEAR_LAYERS_SHORTHAND + ): + return peft_config + + linear_classes = (torch.nn.Linear, Conv1D) + linear_names = ("Linear",) + linear_module_names = set() + for name, module in model.named_modules(): + # match with all linear classes. 
+ if isinstance(module, linear_classes): + linear_module_names.add(name) + elif isinstance(module, BaseTunerLayer) and any(n in type(module).__name__ for n in linear_names): + # If the model already has adapter layers applied, then the "linear" layer is actually an adapter layer, + # e.g. lora.Linear, and not nn.Linear. To target this layer, we don't want to check the layer type, as there + # are many possible layer types (one for each PEFT method) and the list would quickly get out of date. Thus + # we rely on the name of the layer class, which by convention is something like "Linear", "Linear4bit", + # "HqqLoraLinear", ... in PEFT. It's not pretty but should generally work. + # See 2390 + linear_module_names.add(name) + + # Try to remove linear layers that should not be targeted as best as possible. We have to rely on convention as + # there are no hard rules to detect these modules. + module_names_to_exclude = set() + if isinstance(model, PreTrainedModel): + output_emb = model.get_output_embeddings() + if output_emb is not None: + # ignore the last classification head for text generation models + last_module_name = [name for name, module in model.named_modules() if module is output_emb][0] + module_names_to_exclude.add(last_module_name) + elif peft_config.task_type == TaskType.SEQ_CLS: + # ignore classifier head for classification models (issue 2027) + # there is no fix name for the classifier head, so check the common ones + for name in SEQ_CLS_HEAD_NAMES: + cls_head = getattr(model, name, None) + if cls_head is not None: + last_module_name = [name for name, module in model.named_modules() if module is cls_head][0] + module_names_to_exclude.add(last_module_name) + break + + # we don't want nested LoRA layers, i.e. LoRA being applied to possibly existing lora_A, lora_B, etc. 
+ # see 2390 + for prefix, module in model.named_modules(): + if isinstance(module, BaseTunerLayer): + for suffix, child in module.named_modules(): + if suffix: + module_names_to_exclude.add(f"{prefix}.{suffix}") + + linear_module_names -= module_names_to_exclude + peft_config.target_modules = linear_module_names + return peft_config + + +def check_adapters_to_merge(module: BaseTunerLayer, adapter_names: Optional[list[str]] = None) -> list[str]: + """ + Helper function to check which adapters should be merged. + + Only return those adapters that are not already merged. Give a warning if some or all of the adapters are already + merged. + + """ + if adapter_names is None: + adapter_names = module.active_adapters + if isinstance(adapter_names, str): + raise ValueError(f"adapter_names should be a list of strings, got {adapter_names!r}.") + + if module.merged: + merged_adapters = set(module.merged_adapters) + adapter_names = [name for name in adapter_names if name not in merged_adapters] + + if adapter_names: + warnings.warn( + f"Already following adapters were merged {','.join(module.merged_adapters)}. " + f"You are now additionally merging {','.join(adapter_names)}." + ) + else: + warnings.warn("All adapters are already merged, nothing to do.") + + return adapter_names + + +def clone_module(module: nn.Module, share_weights=False): + """Clone a module in a pytorch model. + + Clones a module of a model, optionally sharing all the parameters between the original and the clone. Simplifies + reusing a module when manipulating the architecture of a model. 
+ """ + clone = copy.deepcopy(module) + + def _share_weights(src: nn.Module, dst: nn.Module): + for name, param in src.named_parameters(recurse=False): + dst.register_parameter(name, param) + + if share_weights: + for name, submodule in module.named_modules(): + _share_weights(submodule, clone.get_submodule(name)) + + return clone + + +def replicate_layers(model: nn.Module, layer_map: list[tuple[int, int]]): + """Replicate layers in a transfomer model with weight sharing. + + This function looks for a module list attribute at model[(.model)*].layers and replicates the layers in the module + list according to the layer map. For example the map `[[0, 4], [2, 5]]` will take the set of layers `[0, 1, 2, 3, + 4]` and replace them with a module list containing `[0, 1, 2, 3, 2, 3, 4]`. + """ + while hasattr(model, "model"): + model = model.model + # Some variants of the bert model nest the main model under the bert attribute. + if hasattr(model, "bert"): + model = model.bert + + model_type = None + layers: nn.ModuleList = None + if hasattr(model, "layers"): + model_type = "llama" + layers = model.layers + elif hasattr(model, "encoder") and hasattr(model.encoder, "layer"): + model_type = "bert" + layers = model.encoder.layer + elif hasattr(model, "h"): + model_type = "falcon" + layers = model.h + if not model_type or not isinstance(layers, nn.ModuleList): + raise ValueError( + "Could not locate the layers attribute in the model. " + "Expected Llama, Bert or Falcon compatible architectures." + ) + + new_layers = [] + for start, end in layer_map: + for i in range(start, end): + current_idx = len(new_layers) + new_layers.append(clone_module(layers[i], share_weights=True)) + # This is a hack needed to work around the layer_idx introduced in HF transformers. 
+ for submodule in new_layers[-1].modules(): + if hasattr(submodule, "layer_idx"): + submodule.layer_idx = current_idx + layers = nn.ModuleList(new_layers) + if model_type == "llama": + model.layers = layers + elif model_type == "bert": + model.encoder.layer = layers + elif model_type == "falcon": + model.h = layers + else: + raise ValueError("Unexpected model type, need to handle post-processing of layers.") + if hasattr(model.config, "num_hidden_layers"): # Common to Llama, Bert, Falcon. + model.config.num_hidden_layers = len(new_layers) + + +############################### +# FUNCTIONS FOR functional.py # +############################### + + +def set_adapter( + model, + adapter_name: str | list[str], + inference_mode: bool = False, + layer_cls: type[BaseTunerLayer] = BaseTunerLayer, +) -> None: + """Set the active PEFT adapter(s) of the model. + + Active adapters are those adapters that participate in the forward pass. Use this function if you want to switch + between multiple PEFT adapters. + + Args: + model (`nn.Module`): + The model on which the adapter(s) should be set. + adapter_name (str, list[str]): + The name(s) of the adapter(s) to set as active + inference_mode (bool, optional): + Whether the activated adapter should be frozen (i.e. `requires_grad=False`). Default is False. + layer_cls (type, optional): + The class of the adapter layer. Defaults to `BaseTunerLayer`. + """ + _set_adapter(model, adapter_name, inference_mode=inference_mode) # auxiliary modules + for module in model.modules(): + if isinstance(module, layer_cls): + if module.merged: + warnings.warn("Adapter cannot be set when the model is merged. 
Unmerging the model first.") + module.unmerge() + module.set_adapter(adapter_name, inference_mode=inference_mode) + + +def _delete_auxiliary_adapter(model, adapter_name: str, new_active_adapters: Optional[list[str]]) -> None: + for module in model.modules(): + if isinstance(module, AuxiliaryTrainingWrapper): + module.delete_adapter(adapter_name, new_active_adapters=new_active_adapters) + + +def delete_adapter( + model: nn.Module, adapter_name: str, prefix: str, layer_cls: type[BaseTunerLayer] = BaseTunerLayer +) -> list[str] | None: + """ + Delete an existing PEFT adapter. + + Note: This function does not delete the PEFT config on the model, if there is one. It will also not completely + purge the PEFT layers if the last PEFT adapter is deleted. For this, consider using `model.unload()` if using a + PEFT model instance, or just reloading the base model. + + Args: + model (`nn.Module`): + The model from which the adapter should be deleted. + adapter_name (str): + The name of the adapter to be deleted. + prefix (str): + The prefix of the PEFT method, e.g. "lora_" for LoRA. + layer_cls (type, optional): + The class of the adapter layer. Defaults to `BaseTunerLayer`. + + Returns: + new_adapter (list[str] | None): + The name of remaining adapter(s) after deletion, or `None` if there are no active adapters left. Use this + to set the new active adapter of the model if necessary. 
+ """ + key_list = [key for key, _ in model.named_modules() if prefix not in key] + new_adapter = None + + for key in key_list: + _, target, _ = _get_submodules(model, key) + if isinstance(target, layer_cls): + target.delete_adapter(adapter_name) + if new_adapter is None: + new_adapter = target.active_adapters[:] + + _delete_auxiliary_adapter(model, adapter_name=adapter_name, new_active_adapters=new_adapter) + return new_adapter + + +def cast_adapter_dtype(model: nn.Module, adapter_name: str, autocast_adapter_dtype: bool = True) -> None: + """ + A helper method to cast the adapter weights to the correct dtype. + + Currently, this only upcasts float16 and bfloat16 to float32. + + Args: + adapter_name (`str`): + The adapter name. + autocast_adapter_dtype (`bool`, *optional*): + Whether to autocast the adapter dtype. Defaults to `True`. + """ + if not autocast_adapter_dtype: + return + + dtypes_to_convert_to_fp32 = {torch.float16, torch.bfloat16} + + for module in model.modules(): + if not isinstance(module, BaseTunerLayer): + continue + + for submodule in module.modules(): + if not isinstance(submodule, (nn.ModuleDict, nn.ParameterDict, BufferDict)): + continue + + if adapter_name not in submodule: + continue + + if isinstance(submodule[adapter_name], nn.Parameter): + if submodule[adapter_name].dtype in dtypes_to_convert_to_fp32: + submodule[adapter_name].data = submodule[adapter_name].data.to(torch.float32) + continue + + if isinstance(submodule[adapter_name], torch.Tensor): # e.g. from a BufferDict + if submodule[adapter_name].dtype in dtypes_to_convert_to_fp32: + submodule[adapter_name] = submodule[adapter_name].to(torch.float32) + continue + + for param in submodule[adapter_name].parameters(): + if param.dtype in dtypes_to_convert_to_fp32: + param.data = param.data.to(torch.float32) + + +def set_requires_grad(model, adapter_names: str | Sequence[str], requires_grad: bool = True) -> None: + """ + Enable or disable gradients on the given adapter(s). 
+ + Args: + model (`nn.Module`): + The model from which the adapter should be deleted. + adapter_name (`str` or `Sequence[str]`): + The name of the adapter(s) whose gradients should be enabled/disabled. + requires_grad (`bool`, *optional*) + Whether to enable (`True`, default) or disable (`False`). + """ + for module in model.modules(): + if isinstance(module, (BaseTunerLayer, AuxiliaryTrainingWrapper)): + module.set_requires_grad(adapter_names=adapter_names, requires_grad=requires_grad) diff --git a/peft/src/peft/tuners/vblora/__init__.py b/peft/src/peft/tuners/vblora/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8e71a08461e8b7cb2fb5513a3bf908a4a98c0747 --- /dev/null +++ b/peft/src/peft/tuners/vblora/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@dataclass
class VBLoRAConfig(PeftConfig):
    """
    This is the configuration class to store the configuration of a [`VBLoRAModel`].

    Paper: https://huggingface.co/papers/2405.15179

    Args:
        r (`int`):
            The rank of incremental matrices.
        num_vectors (`int`):
            Number of vectors in the vector bank. Use higher values when the model size increases.
        vector_length (`int`):
            The length of the vectors in the vector bank. The input and output feature sizes of every targeted layer
            (i.e. the hidden dimensions of the model) must be divisible by the length of the vectors.
        topk (`int`):
            The K value for top-K selection. A larger value of K increases the size of the saved model. In practice,
            setting K=2 typically provides the best performance and parameter efficiency. For more details, refer to
            the discussion in the paper.
        target_modules (`Union[List[str], str]`):
            The names of the modules to apply the adapter to. If this is specified, only the modules with the specified
            names will be replaced. When passing a string, a regex match will be performed. When passing a list of
            strings, either an exact match will be performed or it is checked if the name of the module ends with any
            of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen,
            excluding the output layer. If this is not specified, modules will be chosen according to the model
            architecture. If the architecture is not known, an error will be raised -- in this case, you should specify
            the target modules manually.
        exclude_modules (`Optional[Union[List[str], str]]`):
            The names of the modules to not apply the adapter. When passing a string, a regex match will be performed.
            When passing a list of strings, either an exact match will be performed or it is checked if the name of the
            module ends with any of the passed strings.
        save_only_topk_weights (`bool`):
            Whether to only save the topk weights. Setting `save_only_topk_weights = True` significantly reduces
            storage space. However, models saved in this mode can be used for merging or inference only, not for
            resuming training.
        vblora_dropout (`float`):
            The dropout probability for VBLoRA layers.
        fan_in_fan_out (`bool`):
            Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses
            `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`.
        bias (`str`):
            Bias type for VBLoRA. Can be 'none', 'all' or 'vblora_only'. If 'all' or 'vblora_only', the corresponding
            biases will be updated during training. Be aware that this means that, even when disabling the adapters,
            the model will not produce the same output as the base model would have without adaptation.
        modules_to_save (`List[str]`):
            List of modules apart from VBLoRA layers to be set as trainable and saved in the final checkpoint.
        init_vector_bank_bound (`float`):
            The vector bank is initialized with a uniform distribution between -init_vector_bank_bound and
            init_vector_bank_bound. Avoid initializing the vector bank with all zeros to prevent zero gradients. A
            small value, such as 0.02, is typically effective. Initializing with a large value may cause training
            instability.
        init_logits_std (`float`):
            The logits are initialized with a normal distribution with a standard deviation of init_logits_std. Default
            is 0.1.
        layers_to_transform (`Union[List[int],int]`):
            The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices
            that are specified in this list. If a single integer is passed, it will apply the transformations on the
            layer at this index.
        layers_pattern (`Optional[Union[List[str], str]]`):
            The layer pattern name, used only if `layers_to_transform` is different from `None`. This should target the
            `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`.
    """

    r: int = field(default=4, metadata={"help": "The rank of incremental matrices."})
    num_vectors: int = field(
        default=256,
        metadata={"help": "Number of vectors in the vector bank. Use higher values when the model size increases."},
    )
    vector_length: int = field(
        default=256,
        metadata={
            # The layer code rejects layers whose in/out features are not a multiple of vector_length,
            # so the divisibility constraint is stated in that direction here.
            "help": "The length of the vectors in the vector bank. The input and output feature sizes of every "
            "targeted layer must be divisible by the length of the vectors."
        },
    )
    topk: int = field(
        default=2,
        metadata={
            "help": "The K value for top-K selection. A larger value of K increases the size of the saved model. "
            "In practice, setting K=2 typically provides the best performance and parameter efficiency. "
            "For more details, refer to the discussion in the paper."
        },
    )
    target_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": (
                "List of module names or regex expression of the module names to replace with VBLoRA. "
                "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. "
                "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output "
                "layer. If not specified, modules will be chosen according to the model architecture. If the "
                "architecture is not known, an error will be raised -- in this case, you should specify the target "
                "modules manually."
            )
        },
    )
    exclude_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={"help": "List of module names or regex expression of the module names to exclude from VBLoRA."},
    )
    save_only_topk_weights: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to only save the topk weights. Setting `save_only_topk_weights = True` significantly reduces "
                "storage space. However, models saved in this mode can be used for merging or inference only, not for "
                "resuming training."
            )
        },
    )
    vblora_dropout: float = field(default=0.0, metadata={"help": "VBLoRA dropout"})
    fan_in_fan_out: bool = field(
        default=False,
        metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
    )
    bias: str = field(default="none", metadata={"help": "Bias type for VBLoRA. Can be 'none', 'all' or 'vblora_only'"})
    modules_to_save: Optional[list[str]] = field(
        default=None,
        metadata={
            "help": (
                "List of modules apart from VBLoRA layers to be set as trainable and saved in the final checkpoint. For"
                " example, in Sequence Classification or Token Classification tasks, the final layer"
                " `classifier/score` are randomly initialized and as such need to be trainable and saved."
            )
        },
    )
    init_vector_bank_bound: float = field(
        default=0.02,
        metadata={
            "help": (
                "The vector bank is initialized with a uniform distribution between -init_vector_bank_bound and"
                " init_vector_bank_bound. Avoid initializing the vector bank with all zeros to prevent zero gradients."
                " A small value, such as 0.02, is typically effective. Initializing with a large value may cause"
                " training instability."
            ),
        },
    )
    init_logits_std: float = field(
        default=0.1,
        metadata={
            "help": (
                "The logits are initialized with a normal distribution with a standard deviation of init_logits_std. "
                "Default value 0.1 typically works well."
            ),
        },
    )
    layers_to_transform: Optional[Union[list[int], int]] = field(
        default=None,
        metadata={
            "help": "The layer indexes to transform. If this argument is specified, PEFT will transform only the "
            "layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform "
            "only the layer at this index. This only works when target_modules is a list of str."
        },
    )
    layers_pattern: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the "
            "layer pattern is not in the common layers pattern. This only works when target_modules is a list of "
            "str. This should target the `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`."
        },
    )

    def __post_init__(self):
        """Set the PEFT type, normalize module lists to sets and validate the layer-selection options.

        Raises:
            ValueError: If `layers_pattern` is given without `layers_to_transform`.
        """
        super().__post_init__()
        self.peft_type = PeftType.VBLORA
        # Lists are converted to sets; strings (regex / 'all-linear') and None are kept untouched.
        self.target_modules = (
            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
        )
        self.exclude_modules = (
            set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules
        )
        # check for layers_to_transform and layers_pattern
        if self.layers_pattern and not self.layers_to_transform:
            raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ")
class VBLoRALayer(BaseTunerLayer):
    """Mixin holding the per-adapter VB-LoRA state (logits, dropout, rank, top-k) for a wrapped base layer."""

    # List all names of layers that may contain adapter weights
    adapter_layer_names = ("vblora_logits_A", "vblora_logits_B", "vblora_vector_bank")

    def __init__(self, base_layer: nn.Module, **kwargs):
        """Record the base layer and derive its input/output feature sizes.

        Args:
            base_layer: The module being adapted; the innermost base layer must be `nn.Linear` or `Conv1D`.
            **kwargs: Stored verbatim on `self.kwargs`.

        Raises:
            ValueError: If the innermost base layer is neither `nn.Linear` nor `Conv1D`.
        """
        self.base_layer = base_layer
        self.r = {}
        self.topk = {}
        self.vblora_dropout = nn.ModuleDict({})

        # For storing vector scale
        self.vblora_logits_A = nn.ParameterDict({})
        self.vblora_logits_B = nn.ParameterDict({})

        # Mark the weight as unmerged
        self._disable_adapters = False
        self.merged_adapters = []

        base_layer = self.get_base_layer()
        if isinstance(base_layer, nn.Linear):
            in_features, out_features = base_layer.in_features, base_layer.out_features
        elif isinstance(base_layer, Conv1D):
            # `ds_shape` is used instead of `shape` when the weight exposes it.
            in_features, out_features = (
                base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape
            )
        else:
            # Without this branch the assignments below would fail with an opaque UnboundLocalError.
            raise ValueError(
                f"Unsupported base layer type {type(base_layer)} for VBLoRA. Currently, only the following modules "
                "are supported: `torch.nn.Linear`, `transformers.pytorch_utils.Conv1D`."
            )

        self.in_features = in_features
        self.out_features = out_features
        self.kwargs = kwargs

    @property
    def merged(self) -> bool:
        """Whether at least one adapter is currently merged into the base weights."""
        return bool(self.merged_adapters)

    def update_layer(
        self,
        adapter_name: str,
        vblora_vector_bank,
        r: int,
        topk: int,
        num_vectors: int,
        vector_length: int,
        vblora_dropout: float = 0.0,
        init_logits_std: float = 0.01,
        inference_mode: bool = False,
        **kwargs,
    ):
        """Create (or overwrite) the VB-LoRA parameters of `adapter_name` on this layer.

        Args:
            adapter_name: Name under which the adapter state is stored.
            vblora_vector_bank: Mapping from adapter name to vector bank; stored by reference on the layer.
            r: Rank of the low-rank update; must be positive.
            topk: Number of bank vectors mixed per sub-vector; must be positive.
            num_vectors: Size of the last dimension of the logits (number of vectors in the bank).
            vector_length: Length of a bank vector; must divide both `in_features` and `out_features`.
            vblora_dropout: Dropout probability applied to the adapter input (identity when `<= 0`).
            init_logits_std: Standard deviation of the normal initialisation of the logits.
            inference_mode: Forwarded to `set_adapter`.

        Raises:
            ValueError: If `r` or `topk` is not positive or the feature sizes are not divisible by `vector_length`.
        """
        if r <= 0:
            raise ValueError(f"`r` {r} should be a positive integer value")
        if topk <= 0:
            raise ValueError(f"`topk` {topk} should be a positive integer value")

        if self.in_features % vector_length != 0:
            raise ValueError(f"`in_features` {self.in_features} must be divisible by `vector_length` {vector_length}")
        if self.out_features % vector_length != 0:
            raise ValueError(
                f"`out_features` {self.out_features} must be divisible by `vector_length` {vector_length}"
            )

        self.r[adapter_name] = r
        self.topk[adapter_name] = topk
        if vblora_dropout > 0.0:
            vblora_dropout_layer = nn.Dropout(p=vblora_dropout)
        else:
            vblora_dropout_layer = nn.Identity()
        self.vblora_dropout.update(nn.ModuleDict({adapter_name: vblora_dropout_layer}))
        # One logit row per sub-vector of A (r x in_tiles) and of B (out_tiles x r).
        self.vblora_logits_A[adapter_name] = nn.Parameter(
            torch.zeros(r, self.in_features // vector_length, num_vectors), requires_grad=True
        )
        self.vblora_logits_B[adapter_name] = nn.Parameter(
            torch.zeros(self.out_features // vector_length, r, num_vectors), requires_grad=True
        )
        self.vblora_vector_bank = vblora_vector_bank
        self.reset_vblora_logits(adapter_name, init_logits_std)
        self._move_adapter_to_device_of_base_layer(adapter_name)
        self.set_adapter(self.active_adapters, inference_mode=inference_mode)

    def reset_vblora_logits(self, adapter_name, init_logits_std):
        """Re-draw the logits of `adapter_name` from N(0, init_logits_std); no-op for unknown adapters."""
        if adapter_name in self.vblora_logits_A.keys():
            with torch.no_grad():
                nn.init.normal_(self.vblora_logits_A[adapter_name], 0, init_logits_std)
                nn.init.normal_(self.vblora_logits_B[adapter_name], 0, init_logits_std)


class Linear(nn.Linear, VBLoRALayer):
    # VBLoRA implemented in a dense layer
    def __init__(
        self,
        base_layer,
        vblora_vector_bank,
        adapter_name: str,
        r: int,
        num_vectors: int,
        vector_length: int,
        topk: int = 2,
        vblora_dropout: float = 0.0,
        init_logits_std: float = 0.01,
        fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        is_target_conv_1d_layer: bool = False,
        **kwargs,
    ) -> None:
        """Wrap `base_layer` and register the first adapter `adapter_name` on it."""
        # this gets the init from nn.Linear's super perspective, i.e. nn.Module.__init__, which should always be called
        super(nn.Linear, self).__init__()
        VBLoRALayer.__init__(self, base_layer, **kwargs)
        self.fan_in_fan_out = fan_in_fan_out
        self._active_adapter = adapter_name
        self.update_layer(
            adapter_name, vblora_vector_bank, r, topk, num_vectors, vector_length, vblora_dropout, init_logits_std
        )
        self.is_target_conv_1d_layer = is_target_conv_1d_layer

    def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
        """
        Merge the active adapter weights into the base weights

        Args:
            safe_merge (`bool`, *optional*):
                If True, the merge operation will be performed in a copy of the original weights and check for NaNs
                before merging the weights. This is useful if you want to check if the merge operation will produce
                NaNs. Defaults to `False`.
            adapter_names (`List[str]`, *optional*):
                The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
                to `None`.
        """
        adapter_names = check_adapters_to_merge(self, adapter_names)
        if not adapter_names:
            # no adapter to merge
            return

        for active_adapter in adapter_names:
            if active_adapter in self.vblora_logits_A.keys():
                base_layer = self.get_base_layer()
                if safe_merge:
                    # Note that safe_merge will be slower than the normal merge
                    # because of the copy operation.
                    orig_weights = base_layer.weight.data.clone()
                    orig_weights += self.get_delta_weight(active_adapter)
                    if not torch.isfinite(orig_weights).all():
                        raise ValueError(
                            f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
                        )
                    base_layer.weight.data = orig_weights
                else:
                    base_layer.weight.data += self.get_delta_weight(active_adapter)
                self.merged_adapters.append(active_adapter)

    def unmerge(self) -> None:
        """Subtract every merged adapter's delta weight from the base weights, most recently merged first."""
        if not self.merged:
            warnings.warn("Already unmerged. Nothing to do.")
            return

        while len(self.merged_adapters) > 0:
            active_adapter = self.merged_adapters.pop()
            if active_adapter in self.vblora_logits_A.keys():
                self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter)

    def _get_low_rank_matrix(self, logits: torch.Tensor, vblora_vector_bank, topk) -> torch.Tensor:
        """Mix the `topk` highest-scoring bank vectors per logit row, weighted by a softmax over those scores."""
        top_k_logits, indices = logits.topk(topk, dim=-1)
        topk_weights = F.softmax(top_k_logits, dim=-1)
        return (topk_weights.unsqueeze(-1) * vblora_vector_bank[indices]).sum(-2)

    def _get_lora_matrices(self, adapter, cast_to_fp32=False) -> tuple[torch.Tensor, torch.Tensor]:
        """Build the dense low-rank factors `A` (r x in_features) and `B` (out_features x r) for `adapter`.

        Raises:
            RuntimeError: If the layer is in training mode and the logits contain infinity values.
        """
        vblora_logits_A = self.vblora_logits_A[adapter]
        vblora_logits_B = self.vblora_logits_B[adapter]

        # Check for infinity values when training. If found, training was likely resumed from a `save_only_topk_weights` model.
        if self.training and vblora_logits_A[0, 0].isinf().any():
            raise RuntimeError(
                "Found infinity values in VB-LoRA logits. Ensure training was not resumed from a `save_only_topk_weights` model."
            )

        vblora_vector_bank = self.vblora_vector_bank[adapter].to(vblora_logits_A.device)
        topk = self.topk[adapter]
        # In case users wants to merge the adapter weights that are in
        # float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to
        # float16 because the `@` and matmul operation in general is not supported in torch + cpu + fp16.
        if cast_to_fp32:
            vblora_logits_A = vblora_logits_A.float()
            vblora_logits_B = vblora_logits_B.float()
            vblora_vector_bank = vblora_vector_bank.float()

        # A: (rank, in_tile, vector_length) -> (rank, in_tile x vector_length)
        A = self._get_low_rank_matrix(vblora_logits_A, vblora_vector_bank, topk).reshape(vblora_logits_A.shape[0], -1)
        # B: (out_tile, rank, vector_length) -> (out_tile, vector_length, rank) -> (out_tile x vector_length, rank)
        B = (
            self._get_low_rank_matrix(vblora_logits_B, vblora_vector_bank, topk)
            .transpose(1, 2)
            .reshape(-1, vblora_logits_B.shape[1])
        )
        return A, B

    def get_delta_weight(self, adapter) -> torch.Tensor:
        """
        Compute the delta weight for the given adapter.

        Args:
            adapter (str):
                The name of the adapter for which the delta weight should be computed.

        Returns:
            The weight update `B @ A` (transposed when `fan_in_fan_out` is set), in the dtype of the adapter logits.
        """
        device = self.vblora_logits_A[adapter].device
        dtype = self.vblora_logits_A[adapter].dtype
        cast_to_fp32 = device.type == "cpu" and dtype == torch.float16
        A, B = self._get_lora_matrices(adapter, cast_to_fp32)
        output_tensor = transpose(B @ A, self.fan_in_fan_out)
        if cast_to_fp32:
            # The matmul ran in float32 only as a CPU workaround; hand the result back in the adapter's dtype.
            output_tensor = output_tensor.to(dtype=dtype)
        return output_tensor

    def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        """Run the base layer and, unless adapters are disabled or merged, add each active adapter's update."""
        previous_dtype = x.dtype
        if self.disable_adapters:
            if self.merged:
                self.unmerge()
            result = self.base_layer(x, *args, **kwargs)
        elif self.merged:
            result = self.base_layer(x, *args, **kwargs)
        else:
            result = self.base_layer(x, *args, **kwargs)
            for active_adapter in self.active_adapters:
                if active_adapter not in self.vblora_logits_A.keys():
                    continue
                A, B = self._get_lora_matrices(active_adapter)
                x = x.to(self.vblora_vector_bank[active_adapter].dtype)
                dropout = self.vblora_dropout[active_adapter]
                result = result + F.linear(F.linear(dropout(x), A), B)
        result = result.to(previous_dtype)
        return result
class VBLoRAModel(BaseTuner):
    """
    Creates VBLoRA model from a pretrained transformers model.

    The method is described in detail in https://huggingface.co/papers/2405.15179.

    Args:
        model ([`~transformers.PreTrainedModel`]): The model to be adapted.
        config ([`VBLoRAConfig`]): The configuration of the VBLoRA model.
        adapter_name (`str`): The name of the adapter, defaults to `"default"`.
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            Create empty adapter weights on meta device. Useful to speed up the loading process.

    Returns:
        `torch.nn.Module`: The VBLoRA model.

    Example:

        ```py
        >>> from transformers import AutoModelForCausalLM
        >>> from peft import VBLoRAConfig, get_peft_model

        >>> base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
        >>> config = VBLoRAConfig(
        ...     task_type="SEQ_CLS",
        ...     r=4,
        ...     target_modules=["fc1", "fc2", "k_proj", "out_proj", "q_proj", "v_proj"],
        ...     num_vectors=60,
        ...     vector_length=256,
        ...     save_only_topk_weights=True,
        ... )
        >>> model = get_peft_model(base_model, config)
        ```

    **Attributes**:
        - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted.
        - **peft_config** ([`VBLoRAConfig`]): The configuration of the VBLoRAConfig model.
    """

    prefix: str = "vblora_"
    tuner_layer_cls = VBLoRALayer
    target_module_mapping = TRANSFORMERS_MODELS_TO_VBLORA_TARGET_MODULES_MAPPING

    def _init_vblora_vector_bank(self, config: VBLoRAConfig, adapter_name: str) -> None:
        """Create the vector bank of `adapter_name`, drawn uniformly from +/- `config.init_vector_bank_bound`."""
        vblora_vector_bank = torch.zeros(config.num_vectors, config.vector_length)
        torch.nn.init.uniform_(vblora_vector_bank, -config.init_vector_bank_bound, config.init_vector_bank_bound)
        self.vblora_vector_bank[adapter_name] = vblora_vector_bank

    def _pre_injection_hook(self, model: nn.Module, config: VBLoRAConfig, adapter_name: str) -> None:
        """Make sure the shared, per-adapter vector bank container exists before layers are injected."""
        # Only create the container once: replacing it on every injection would drop the banks of adapters
        # that were added earlier, while layers updated afterwards would point at the new, incomplete dict.
        if not isinstance(self._modules.get("vblora_vector_bank"), nn.ParameterDict):
            self.vblora_vector_bank = nn.ParameterDict({})

    def _create_and_replace(
        self,
        vblora_config,
        adapter_name,
        target,
        target_name,
        parent,
        current_key,
    ):
        """Attach the adapter to `target`: update it in place if it is already a VBLoRA layer, else wrap it.

        Raises:
            ValueError: If `current_key` is `None`.
        """
        if current_key is None:
            raise ValueError("Current Key shouldn't be `None`")

        bias = hasattr(target, "bias") and target.bias is not None
        kwargs = {
            "fan_in_fan_out": vblora_config.fan_in_fan_out,
            "bias": bias,
        }
        # The bank is shared by all layers of an adapter, so it is initialised once per adapter instead of
        # being re-drawn (and the previous tensor discarded) for every target module.
        if adapter_name not in self.vblora_vector_bank:
            self._init_vblora_vector_bank(vblora_config, adapter_name)
        # TODO: add quantization support

        if isinstance(target, Linear):
            target.update_layer(
                adapter_name=adapter_name,
                vblora_vector_bank=self.vblora_vector_bank,
                r=vblora_config.r,
                topk=vblora_config.topk,
                num_vectors=vblora_config.num_vectors,
                vector_length=vblora_config.vector_length,
                vblora_dropout=vblora_config.vblora_dropout,
                init_logits_std=vblora_config.init_logits_std,
            )
        else:
            new_module = self._create_new_module(
                vblora_config=vblora_config,
                vblora_vector_bank=self.vblora_vector_bank,
                adapter_name=adapter_name,
                target=target,
                **kwargs,
            )
            if adapter_name not in self.active_adapter:
                # adding an additional adapter: it is not automatically trainable
                new_module.requires_grad_(False)
            self._replace_module(parent, target_name, new_module, target)

    @staticmethod
    def _create_new_module(vblora_config, vblora_vector_bank, adapter_name, target, **kwargs):
        """Build the VBLoRA `Linear` wrapper for `target`, correcting `fan_in_fan_out` to match the layer type.

        Raises:
            ValueError: If the (innermost) target layer is neither `torch.nn.Linear` nor `Conv1D`.
        """
        if isinstance(target, BaseTunerLayer):
            target_base_layer = target.get_base_layer()
        else:
            target_base_layer = target

        if isinstance(target_base_layer, torch.nn.Linear):
            if kwargs["fan_in_fan_out"]:
                warnings.warn(
                    "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
                    "Setting fan_in_fan_out to False."
                )
                kwargs["fan_in_fan_out"] = vblora_config.fan_in_fan_out = False
        elif isinstance(target_base_layer, Conv1D):
            kwargs["is_target_conv_1d_layer"] = True
            if not kwargs["fan_in_fan_out"]:
                warnings.warn(
                    "fan_in_fan_out is set to False but the target module is `Conv1D`. Setting fan_in_fan_out to True."
                )
                kwargs["fan_in_fan_out"] = vblora_config.fan_in_fan_out = True
        else:
            raise ValueError(
                f"Target module {target} is not supported. Currently, only the following modules are supported: "
                "`torch.nn.Linear`, `transformers.pytorch_utils.Conv1D`."
            )
        new_module = Linear(
            base_layer=target,
            vblora_vector_bank=vblora_vector_bank,
            adapter_name=adapter_name,
            r=vblora_config.r,
            num_vectors=vblora_config.num_vectors,
            vector_length=vblora_config.vector_length,
            topk=vblora_config.topk,
            vblora_dropout=vblora_config.vblora_dropout,
            init_logits_std=vblora_config.init_logits_std,
            **kwargs,
        )

        return new_module

    def get_nb_savable_parameters(self, adapter="default") -> tuple[int, int]:
        r"""
        Returns the number of savable VB-LoRA parameters and other savable parameters.

        When `save_only_topk_weights` is enabled for `adapter`, the VB-LoRA count is expressed in float32-equivalent
        parameters: per logit row only `topk - 1` weights and `topk` indices are counted, the indices being scaled
        by a factor that depends on `num_vectors`.
        """
        logits_params = 0
        vector_bank_params = 0
        other_params = 0
        for name, param in self.named_parameters():
            if "vblora_logits" in name:
                logits_params += param.numel()
            elif "vblora_vector_bank" in name:
                vector_bank_params += param.numel()
            elif param.requires_grad:
                other_params += param.numel()
        if self.peft_config[adapter].save_only_topk_weights:
            num_vectors = self.peft_config[adapter].num_vectors
            factor = 1  # factor to count float32-equivalent parameters
            if num_vectors < 2**8:
                factor = 0.25
            elif num_vectors < 2**15:
                factor = 0.5
            elif num_vectors < 2**31:
                factor = 1
            else:
                factor = 2
            topk_weight_params = (
                logits_params / self.peft_config[adapter].num_vectors * (self.peft_config[adapter].topk - 1)
            )
            topk_indices_params = (
                logits_params / self.peft_config[adapter].num_vectors * self.peft_config[adapter].topk * factor
            )
            vblora_params = int(vector_bank_params + topk_weight_params + topk_indices_params)
        else:
            vblora_params = vector_bank_params + logits_params
        return vblora_params, other_params

    def print_savable_parameters(self) -> None:
        r"""
        Prints the number of savable VB-LoRA parameters and total savable parameters.
        """
        vblora_params, other_params = self.get_nb_savable_parameters()
        print(
            f"VB-LoRA params to-be-saved (float32-equivalent): {vblora_params:,d} "
            f"|| total params to-be-saved: {(vblora_params + other_params):,d}"
        )
def __getattr__(name):
    """Resolve the optional bitsandbytes-backed VeRA layers on first attribute access.

    `Linear8bitLt` and `Linear4bit` are imported from `.bnb` only when they are requested and the matching
    availability check succeeds; every other lookup raises `AttributeError`.
    """
    wants_8bit = name == "Linear8bitLt"
    if wants_8bit and is_bnb_available():
        from .bnb import Linear8bitLt

        return Linear8bitLt

    wants_4bit = name == "Linear4bit"
    if wants_4bit and is_bnb_4bit_available():
        from .bnb import Linear4bit

        return Linear4bit

    # Unknown name, or the requested quantized layer is not available.
    raise AttributeError(f"module {__name__} has no attribute {name}")
+from __future__ import annotations + +import warnings +from typing import Optional + +import bitsandbytes as bnb +import torch + +from peft.import_utils import is_bnb_4bit_available, is_bnb_available +from peft.tuners.tuners_utils import check_adapters_to_merge +from peft.utils.integrations import dequantize_bnb_weight +from peft.utils.other import transpose + +from .layer import VeraLayer + + +if is_bnb_available(): + + class Linear8bitLt(torch.nn.Module, VeraLayer): + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + vera_A, + vera_B, + r: int = 0, + vera_dropout: float = 0.0, + fan_in_fan_out: bool = False, + init_weights: bool = True, + d_initial: float = 0.1, + **kwargs, + ) -> None: + super().__init__() + VeraLayer.__init__(self, base_layer) + self.fan_in_fan_out = fan_in_fan_out + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + vera_A, + vera_B, + r, + vera_dropout=vera_dropout, + init_weights=init_weights, + d_initial=d_initial, + ) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + if self.merged: + warnings.warn( + f"Already following adapters were merged {','.join(self.merged_adapters)}. " + f"You are now additionally merging {','.join(self.active_adapters)}." + ) + + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + return + + for active_adapter in adapter_names: + if active_adapter not in self.vera_lambda_d.keys(): + continue + + warnings.warn( + "Merge vera module to 8-bit linear may get different generations due to rounding errors." 
+ ) + vera_data = self.get_delta_weight(active_adapter) + + weight = self.get_base_layer().weight + state = self.get_base_layer().state + if state.SCB is None: + state.SCB = weight.SCB + + output = dequantize_bnb_weight(weight, state) + w_data = output.to(vera_data.dtype).to(vera_data.device) + vera_data + + if safe_merge and not torch.isfinite(w_data).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + self.get_base_layer().weight = bnb.nn.Int8Params( + w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights + ).to(weight.device) + state.reset_grads() + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + if not self.merged: + warnings.warn("Already unmerged. Nothing to do") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter not in self.vera_lambda_d.keys(): + continue + warnings.warn( + "Unmerge vera module to 8-bit linear may get different generations due to rounding errors." + ) + vera_data = self.get_delta_weight(active_adapter) + + weight = self.get_base_layer().weight + state = self.get_base_layer().state + if state.SCB is None: + state.SCB = weight.SCB + output = dequantize_bnb_weight(weight, state=state) + + w_data = output.to(vera_data.dtype).to(vera_data.device) - vera_data + + self.get_base_layer().weight = bnb.nn.Int8Params( + w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights + ).to(weight.device) + state.reset_grads() + + def get_delta_weight(self, adapter) -> torch.Tensor: + """ + Compute the delta weight for the given adapter. + + Args: + adapter (str): The name of the adapter for which the delta weight should be computed. + + Returns: + torch.Tensor: The computed delta weight for the VeRA adapter. + + Note: + This method implements the VeRA-specific weight update. 
def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
    """
    Perform the forward pass using the VeRA adapter.

    The base layer is always evaluated. When adapters are enabled and not merged, every active adapter that has
    VeRA parameters on this layer adds `lambda_b * linear(lambda_d * linear(dropout(x), A), B)` to the base output,
    where `A` and `B` are the shared projections sliced to this layer's `in_features` / `out_features`.

    Args:
        x (torch.Tensor): Input tensor.
        *args, **kwargs: Forwarded unchanged to the base layer.

    Returns:
        torch.Tensor: The layer output, cast to the dtype `x` had when it was passed in.
    """
    # Capture the caller's dtype before any casting. The adapter math runs in the dtype of the VeRA parameters,
    # and using the (possibly re-cast) input for the final cast would leak that dtype to the caller.
    input_dtype = x.dtype

    if self.disable_adapters:
        if self.merged:
            self.unmerge()
        result = self.base_layer(x, *args, **kwargs)
    elif self.merged:
        result = self.base_layer(x, *args, **kwargs)
    else:
        result = self.base_layer(x, *args, **kwargs)
        # Outside of autocast the adapter output must be cast back to the base output dtype explicitly.
        requires_conversion = not torch.is_autocast_enabled()

        for active_adapter in self.active_adapters:
            if active_adapter not in self.vera_lambda_d.keys():
                continue

            lambda_d = self.vera_lambda_d[active_adapter]
            lambda_b = self.vera_lambda_b[active_adapter]

            vera_A = self.vera_A[active_adapter]
            vera_B = self.vera_B[active_adapter]

            dropout = self.vera_dropout[active_adapter]

            if requires_conversion:
                expected_dtype = result.dtype

            # The shared projections are at least as large as this layer needs; slice out the part that is used.
            sliced_A = vera_A[:, : self.in_features].to(x.device)
            sliced_B = vera_B[: self.out_features, :].to(x.device)

            # Cast a copy of the input for this adapter only, leaving `x` itself untouched.
            adapter_input = dropout(x.to(lambda_d.dtype))

            adapter_output = lambda_b * torch.nn.functional.linear(
                lambda_d * torch.nn.functional.linear(adapter_input, sliced_A), sliced_B
            )

            if requires_conversion:
                adapter_output = adapter_output.to(expected_dtype)

            result = result + adapter_output

    # Ensure the output tensor has the same dtype as the input tensor
    return result.to(input_dtype)
if is_bnb_4bit_available():

    class Linear4bit(torch.nn.Module, VeraLayer):
        """
        VeRA adapter layer whose base layer stores its weight as a bitsandbytes `Params4bit`.

        Merging and unmerging dequantize the base weight with `bnb.functional.dequantize_4bit`, add or subtract
        the adapter's delta weight, and build a new `Params4bit` from the result.
        """

        def __init__(
            self,
            base_layer: torch.nn.Module,
            adapter_name: str,
            vera_A,
            vera_B,
            r: int = 0,
            vera_dropout: float = 0.0,
            fan_in_fan_out: bool = False,
            init_weights: bool = True,
            d_initial: float = 0.1,
            **kwargs,
        ) -> None:
            super().__init__()
            VeraLayer.__init__(self, base_layer)
            self.fan_in_fan_out = fan_in_fan_out

            self._active_adapter = adapter_name
            self.update_layer(
                adapter_name,
                vera_A,
                vera_B,
                r,
                vera_dropout=vera_dropout,
                init_weights=init_weights,
                d_initial=d_initial,
            )

        @staticmethod
        def _public_weight_attrs(weight) -> dict:
            """Return the attributes of `weight` that are passed on when a replacement `Params4bit` is built."""
            # torch.compile can introduce attributes preceded by '_', remove them
            return {k: v for k, v in weight.__dict__.items() if not k.startswith("_")}

        def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
            """
            Merge the given (or all active) adapters into the 4-bit base weight.

            Args:
                safe_merge (`bool`, *optional*):
                    If True, raise a `ValueError` instead of merging when the merged weight contains non-finite
                    values. Defaults to `False`.
                adapter_names (`list[str]`, *optional*):
                    Names of the adapters to merge; passed through `check_adapters_to_merge`. Defaults to `None`.
            """
            if self.merged:
                warnings.warn(
                    f"Already following adapters were merged {','.join(self.merged_adapters)}. "
                    f"You are now additionally merging {','.join(self.active_adapters)}."
                )

            adapter_names = check_adapters_to_merge(self, adapter_names)
            if not adapter_names:
                return

            for active_adapter in adapter_names:
                if active_adapter not in self.vera_lambda_d.keys():
                    continue

                warnings.warn(
                    "Merge vera module to 4-bit linear may get different generations due to rounding errors."
                )
                vera_data = self.get_delta_weight(active_adapter)

                weight = self.get_base_layer().weight
                kwargs = self._public_weight_attrs(weight)
                w_data = bnb.functional.dequantize_4bit(weight.data, weight.quant_state) + vera_data

                if safe_merge and not torch.isfinite(w_data).all():
                    raise ValueError(
                        f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
                    )

                self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), requires_grad=False, **kwargs).to(
                    weight.device
                )
                self.merged_adapters.append(active_adapter)

        def unmerge(self) -> None:
            """Subtract every merged adapter from the 4-bit base weight, most recently merged first."""
            if not self.merged:
                warnings.warn("Already unmerged. Nothing to do")
                return

            while len(self.merged_adapters) > 0:
                active_adapter = self.merged_adapters.pop()
                if active_adapter not in self.vera_lambda_d.keys():
                    continue
                warnings.warn(
                    "Unmerge vera module to 4-bit linear may get different generations due to rounding errors."
                )
                vera_data = self.get_delta_weight(active_adapter)

                weight = self.get_base_layer().weight
                # Use the same attribute filtering as `merge`, so that private attributes on the weight are not
                # forwarded to the `Params4bit` constructor here either.
                kwargs = self._public_weight_attrs(weight)
                w_data = bnb.functional.dequantize_4bit(weight.data, weight.quant_state) - vera_data

                self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), requires_grad=False, **kwargs).to(
                    weight.device
                )

        def get_delta_weight(self, adapter) -> torch.Tensor:
            """
            Compute the delta weight for the given adapter.

            Args:
                adapter (str): The name of the adapter for which the delta weight should be computed.

            Returns:
                torch.Tensor: `(lambda_b * B) @ (lambda_d * A)` on the sliced shared projections, passed through
                `transpose` with `self.fan_in_fan_out`.
            """
            vera_A = self.vera_A[adapter]
            vera_B = self.vera_B[adapter]

            device = vera_B.device
            dtype = vera_B.dtype

            # (b)float16 operands on CPU are computed in float32 and the result is cast back afterwards.
            cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16)

            lambda_d = self.vera_lambda_d[adapter]
            lambda_b = self.vera_lambda_b[adapter]

            if cast_to_fp32:
                vera_A = vera_A.float()
                vera_B = vera_B.float()
                lambda_d = lambda_d.float()
                lambda_b = lambda_b.float()

            sliced_A = vera_A[:, : self.in_features].to(lambda_d.device)
            sliced_B = vera_B[: self.out_features, :].to(lambda_d.device)
            # Turn the scaling vectors into columns so they broadcast over the rows of B and A respectively.
            lambda_b = lambda_b.unsqueeze(-1)
            lambda_d = lambda_d.unsqueeze(-1)

            output_tensor = transpose((lambda_b * sliced_B) @ (lambda_d * sliced_A), self.fan_in_fan_out)

            if cast_to_fp32:
                output_tensor = output_tensor.to(dtype=dtype)

            return output_tensor

        def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
            """
            Run the base layer and, when adapters are enabled and unmerged, add each active adapter's output.

            Args:
                x (torch.Tensor): Input tensor.
                *args, **kwargs: Forwarded unchanged to the base layer.

            Returns:
                torch.Tensor: The layer output, cast to the dtype `x` had when it was passed in.
            """
            # Capture the caller's dtype before any casting; the final cast must not depend on the dtype the
            # adapter math happens to run in.
            input_dtype = x.dtype

            if self.disable_adapters:
                if self.merged:
                    self.unmerge()
                result = self.base_layer(x, *args, **kwargs)
            elif self.merged:
                result = self.base_layer(x, *args, **kwargs)
            else:
                result = self.base_layer(x, *args, **kwargs)
                result = result.clone()
                # Outside of autocast the adapter output must be cast back to the base output dtype explicitly.
                requires_conversion = not torch.is_autocast_enabled()

                for active_adapter in self.active_adapters:
                    if active_adapter not in self.vera_lambda_d.keys():
                        continue

                    lambda_d = self.vera_lambda_d[active_adapter]
                    lambda_b = self.vera_lambda_b[active_adapter]

                    vera_A = self.vera_A[active_adapter]
                    vera_B = self.vera_B[active_adapter]

                    dropout = self.vera_dropout[active_adapter]

                    if requires_conversion:
                        expected_dtype = result.dtype

                    sliced_A = vera_A[:, : self.in_features].to(x.device)
                    sliced_B = vera_B[: self.out_features, :].to(x.device)

                    # Cast a copy of the input for this adapter only, leaving `x` itself untouched.
                    adapter_input = dropout(x.to(lambda_d.dtype))

                    adapter_output = lambda_b * torch.nn.functional.linear(
                        lambda_d * torch.nn.functional.linear(adapter_input, sliced_A), sliced_B
                    )

                    if requires_conversion:
                        adapter_output = adapter_output.to(expected_dtype)

                    result = result + adapter_output

            # Ensure the output tensor has the same dtype as the input tensor
            return result.to(input_dtype)

        def __repr__(self) -> str:
            rep = super().__repr__()
            return "vera." + rep
@dataclass
class VeraConfig(PeftConfig):
    """
    This is the configuration class to store the configuration of a [`VeraModel`].

    Paper: https://huggingface.co/papers/2310.11454.

    Args:
        r (`int`, *optional*, defaults to `256`):
            VeRA parameter dimension ("rank"). Choose higher values than LoRA ranks here, since VeRA uses far fewer
            parameters than LoRA (see Table 1).
        target_modules (`Union[List[str], str]`):
            The names of the modules to apply Vera to. Only linear layers are supported.
        projection_prng_key (`int`):
            Vera PRNG init key. Used for initialising vera_A and vera_B for new models or when loading a checkpoint
            that did not include these projections. Defaults to `0`.
        save_projection (`bool`):
            Whether to save the vera_A / vera_B projections in the state dict alongside per layer lambda_b / lambda_d
            weights. This will increase the size of the checkpoint, but guarantee that we can reload the checkpoint on
            all system configurations. Defaults to `True`.
        vera_dropout (`float`):
            The dropout probability for Vera layers.
        d_initial (`float`, *optional*, defaults to `0.1`):
            Initial init value for `vera_lambda_d` vector used when initializing the VeRA parameters. Small values
            (<=0.1) are recommended (see Table 6c in the paper).
        fan_in_fan_out (`bool`):
            Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses
            `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`.
        bias (`str`):
            Bias type for Vera. Can be 'none', 'all' or 'vera_only'. If 'all' or 'vera_only', the corresponding biases
            will be updated during training. Be aware that this means that, even when disabling the adapters, the model
            will not produce the same output as the base model would have without adaptation.
        modules_to_save (`List[str]`):
            List of modules apart from Vera layers to be set as trainable and saved in the final checkpoint.
        init_weights (`bool`):
            Whether to initialize the weights of the Vera layers with their default initialization. Don't change this
            setting, except if you know exactly what you're doing.
        layers_to_transform (`Union[List[int],int]`):
            The layer indexes to transform, if this argument is specified, it will apply the Vera transformations on
            the layer indexes that are specified in this list. If a single integer is passed, it will apply the Vera
            transformations on the layer at this index.
        layers_pattern (`Optional[Union[List[str], str]]`):
            The layer pattern name, used only if `layers_to_transform` is different from `None`. This should target the
            `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`.

    Raises:
        ValueError: If `layers_pattern` is given while `layers_to_transform` is `None` or an empty list.
    """

    r: int = field(default=256, metadata={"help": "Vera attention dimension"})

    target_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": (
                "List of module names or regex expression of the module names to replace with Vera. "
                "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. "
                "Only linear layers are supported."
            )
        },
    )
    projection_prng_key: int = field(
        default=0,
        metadata={
            "help": (
                "Vera PRNG init key. Used for initialising vera_A and vera_B for new models or when loading a "
                "checkpoint that did not include these projections."
            )
        },
    )
    save_projection: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether to save the vera_A / vera_B projections in the state dict alongside per layer lambda_b / "
                "lambda_d weights. This will increase the size of the checkpoint, but guarantee that we can reload "
                "the checkpoint on all system configurations."
            )
        },
    )
    vera_dropout: float = field(default=0.0, metadata={"help": "Vera dropout"})
    d_initial: float = field(default=0.1, metadata={"help": "Initial init value for d vector."})
    fan_in_fan_out: bool = field(
        default=False,
        metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
    )
    bias: str = field(default="none", metadata={"help": "Bias type for Vera. Can be 'none', 'all' or 'vera_only'"})
    modules_to_save: Optional[list[str]] = field(
        default=None,
        metadata={
            "help": (
                "List of modules apart from Vera layers to be set as trainable and saved in the final checkpoint. For"
                " example, in Sequence Classification or Token Classification tasks, the final layer"
                " `classifier/score` are randomly initialized and as such need to be trainable and saved."
            )
        },
    )
    init_weights: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether to initialize the weights of the Vera layers with their default initialization. Don't change "
                "this setting, except if you know exactly what you're doing."
            ),
        },
    )
    layers_to_transform: Optional[Union[list[int], int]] = field(
        default=None,
        metadata={
            "help": (
                "The layer indexes to transform, if this argument is specified, PEFT will transform only the layers"
                " indexes that are specified inside this list. If a single integer is passed, PEFT will transform only"
                " the layer at this index."
            )
        },
    )
    layers_pattern: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": (
                "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer "
                "pattern is not in the common layers pattern. This should target the `nn.ModuleList` of the "
                "model, which is often called `'layers'` or `'h'`."
            )
        },
    )

    def __post_init__(self):
        super().__post_init__()
        self.peft_type = PeftType.VERA
        # A list of module names is stored as a set; a string (regex) or None is kept as given.
        self.target_modules = (
            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
        )
        # check for layers_to_transform and layers_pattern
        # A plain truthiness test would also reject the integer index 0, which is a valid single layer index, so
        # only `None` and an empty list count as "not specified".
        layers_unspecified = self.layers_to_transform is None or (
            isinstance(self.layers_to_transform, list) and not self.layers_to_transform
        )
        if self.layers_pattern and layers_unspecified:
            raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ")
        if not self.save_projection:
            warnings.warn(
                "Specified to not save vera_A and vera_B within the state dictionary, instead they will be restored "
                "using the PRNG key stored in `config.projection_prng_key`. Consider setting `config.save_projection` "
                "to `True` to guarantee restoring the checkpoint correctly on all system configurations."
            )
class VeraLayer(BaseTunerLayer):
    """
    Base class holding the per-layer VeRA state: rank, dropout modules, the trainable scaling vectors
    (`vera_lambda_b`, `vera_lambda_d`) and references to the `vera_A` / `vera_B` buffer dicts.
    """

    # List all names of layers that may contain adapter weights
    adapter_layer_names = ("vera_lambda_b", "vera_lambda_d")
    other_param_names = ("vera_A", "vera_B")

    def __init__(self, base_layer: nn.Module, **kwargs):
        """
        Args:
            base_layer (`nn.Module`): The layer being adapted. Its unwrapped base layer must be an `nn.Linear` or
                a `Conv1D`.
            **kwargs: Stored unchanged on `self.kwargs`.

        Raises:
            ValueError: If the unwrapped base layer is neither `nn.Linear` nor `Conv1D`.
        """
        self.base_layer = base_layer
        self.r = {}
        self.vera_dropout = nn.ModuleDict({})

        # For storing vector scale
        self.vera_lambda_b = nn.ParameterDict({})
        self.vera_lambda_d = nn.ParameterDict({})

        # Stores a reference to the vera_A/B BufferDict.
        # Set to `None` otherwise to avoid computation with random weights
        self.vera_A: Optional[BufferDict] = None
        self.vera_B: Optional[BufferDict] = None

        # Mark the weight as unmerged
        self._disable_adapters = False
        self.merged_adapters = []

        base_layer = self.get_base_layer()
        if isinstance(base_layer, nn.Linear):
            in_features, out_features = base_layer.in_features, base_layer.out_features
        elif isinstance(base_layer, Conv1D):
            in_features, out_features = (
                base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape
            )
        else:
            # Without this branch the assignments below fail with an UnboundLocalError that does not say what
            # is actually wrong.
            raise ValueError(
                f"Unsupported base layer type {type(base_layer)} for VeRA; only `torch.nn.Linear` and "
                "`transformers.pytorch_utils.Conv1D` are supported."
            )

        self.in_features = in_features
        self.out_features = out_features
        self.kwargs = kwargs

    @property
    def merged(self) -> bool:
        """Whether at least one adapter is currently recorded as merged."""
        return bool(self.merged_adapters)

    def update_layer(
        self,
        adapter_name,
        vera_A: BufferDict,
        vera_B: BufferDict,
        r,
        vera_dropout,
        init_weights,
        d_initial: float = 0.1,
        inference_mode: bool = False,
        **kwargs,
    ):
        """
        Register (or re-register) the adapter `adapter_name` on this layer.

        Creates the dropout module and the `vera_lambda_b` / `vera_lambda_d` parameters, stores references to the
        shared `vera_A` / `vera_B` buffer dicts and, if the adapter has no entry in them yet, reuses the first
        existing entry after checking that it is large enough for this layer and rank.

        Raises:
            ValueError: If `r` is not positive, if the shared buffers are empty, or if the existing projections
                are too small for this layer's input size, output size or rank.
        """
        if r <= 0:
            raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")
        self.r[adapter_name] = r
        if vera_dropout > 0.0:
            vera_dropout_layer = nn.Dropout(p=vera_dropout)
        else:
            vera_dropout_layer = nn.Identity()

        self.vera_dropout.update(nn.ModuleDict({adapter_name: vera_dropout_layer}))
        # Actual trainable parameters
        self.vera_lambda_b[adapter_name] = nn.Parameter(torch.ones(self.out_features), requires_grad=True)
        self.vera_lambda_d[adapter_name] = nn.Parameter(torch.randn(r), requires_grad=True)

        # non trainable references to vera_A/B buffers
        self.vera_A = vera_A
        self.vera_B = vera_B
        if adapter_name not in vera_A:
            # This means that this is not the first VeRA adapter. We have to add an entry in the dict for this adapter.
            if len(self.vera_A) < 1:
                raise ValueError(
                    "The `vera_A` and `vera_B` buffers are empty. This should not happen. Please report this issue."
                )
            # we can take any of the existing adapter's parameters, as they should all be identical
            vera_A_param = list(self.vera_A.values())[0]
            vera_B_param = list(self.vera_B.values())[0]

            error_tmpl = (
                "{} has a size of {} but {} or greater is required; this probably happened because an additional VeRA "
                "adapter was added after the first one with incompatible shapes."
            )
            # check input size
            if vera_A_param.shape[1] < self.in_features:
                raise ValueError(error_tmpl.format("vera_A", vera_A_param.shape[1], self.in_features))
            # check output size
            if vera_B_param.shape[0] < self.out_features:
                raise ValueError(error_tmpl.format("vera_B", vera_B_param.shape[0], self.out_features))
            # check r
            error_tmpl = (
                "{} has a size of {} but {} or greater is required; this probably happened because an additional VeRA "
                "adapter with a lower rank was added after the first one; loading the adapters "
                "in reverse order may solve this."
            )
            if vera_A_param.shape[0] < self.r[adapter_name]:
                raise ValueError(error_tmpl.format("vera_A", vera_A_param.shape[0], self.r[adapter_name]))
            if vera_B_param.shape[1] < self.r[adapter_name]:
                raise ValueError(error_tmpl.format("vera_B", vera_B_param.shape[1], self.r[adapter_name]))

            self.vera_A[adapter_name] = vera_A_param
            self.vera_B[adapter_name] = vera_B_param

        if init_weights:
            self.reset_vera_parameters(adapter_name, d_initial=d_initial)

        self._move_adapter_to_device_of_base_layer(adapter_name)
        self.set_adapter(self.active_adapters, inference_mode=inference_mode)

    def reset_vera_parameters(self, adapter_name, d_initial: float = 0.1):
        """Set `vera_lambda_d` of `adapter_name` to `d_initial` and `vera_lambda_b` to zero, if the adapter exists."""
        if adapter_name in self.vera_lambda_d.keys():
            with torch.no_grad():
                nn.init.constant_(self.vera_lambda_d[adapter_name], d_initial)
                nn.init.zeros_(self.vera_lambda_b[adapter_name])
def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
    """
    Merge the active adapter weights into the base weights

    Args:
        safe_merge (`bool`, *optional*):
            If True, the merge operation will be performed in a copy of the original weights and check for NaNs
            before merging the weights. This is useful if you want to check if the merge operation will produce
            NaNs. Defaults to `False`.
        adapter_names (`List[str]`, *optional*):
            The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults
            to `None`.
    """
    to_merge = check_adapters_to_merge(self, adapter_names)
    if not to_merge:
        # no adapter to merge
        return

    for name in to_merge:
        if name not in self.vera_lambda_d.keys():
            continue

        target = self.get_base_layer()
        if not safe_merge:
            target.weight.data += self.get_delta_weight(name)
        else:
            # Note that safe_merge will be slower than the normal merge
            # because of the copy operation.
            candidate = target.weight.data.clone()
            candidate += self.get_delta_weight(name)

            all_finite = torch.isfinite(candidate).all()
            if not all_finite:
                raise ValueError(
                    f"NaNs detected in the merged weights. The adapter {name} seems to be broken"
                )

            target.weight.data = candidate
        self.merged_adapters.append(name)
def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
    """
    Run the base layer and, when adapters are enabled and unmerged, add the output of each active VeRA adapter.

    The result is cast back to the dtype that `x` had on entry.
    """
    incoming_dtype = x.dtype

    # Merged weights must be removed before the base layer runs when adapters are disabled.
    if self.disable_adapters and self.merged:
        self.unmerge()

    result = self.base_layer(x, *args, **kwargs)

    if not self.disable_adapters and not self.merged:
        for name in self.active_adapters:
            if name not in self.vera_lambda_d.keys():
                continue

            scale_d = self.vera_lambda_d[name]
            scale_b = self.vera_lambda_b[name]
            drop = self.vera_dropout[name]

            # As adapted layers may have different shapes and VeRA contains a single shared pair of A and B matrices,
            # we initialize these matrices with the largest required size for each dimension.
            # During the forward pass, required submatrices are sliced out from the shared vera_A and vera_B.
            proj_in = self.vera_A[name][:, : self.in_features].to(x.device)
            proj_out = self.vera_B[name][: self.out_features, :].to(x.device)

            x = x.to(scale_d.dtype)
            hidden = scale_d * F.linear(drop(x), proj_in)
            result = result + scale_b * F.linear(hidden, proj_out)

    return result.to(incoming_dtype)
+ +from __future__ import annotations + +import math +import warnings +from typing import Union + +import torch +import torch.nn as nn +from torch.nn.init import _calculate_correct_fan +from transformers.pytorch_utils import Conv1D + +from peft.import_utils import is_bnb_4bit_available, is_bnb_available +from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer +from peft.utils import ( + TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING, +) + +from .._buffer_dict import BufferDict +from ..tuners_utils import _maybe_include_all_linear_layers +from .config import VeraConfig +from .layer import Linear, VeraLayer + + +def _kaiming_init( + tensor_or_shape: Union[torch.Tensor, tuple[int, ...]], + generator: torch.Generator, +) -> torch.Tensor: + """ + Kaiming Uniform Initialisation adapted to accept a `torch.Generator` object for PRNG. + + Args: + tensor_or_shape (`Union[torch.Tensor, tuple[int, ...]]`): + Tensor to initialise, or shape of new tensor to create and then initialise. + generator: (`torch.Generator`): + Generator object that manages the state of the PRNG algorithm in use. + + Returns: + `torch.Tensor`: The initialised tensor. + """ + if isinstance(tensor_or_shape, tuple): + tensor = torch.empty(tensor_or_shape) + else: + tensor = tensor_or_shape + fan = _calculate_correct_fan(tensor, "fan_in") + gain = math.sqrt(2) + std = gain / math.sqrt(fan) + bound = math.sqrt(3.0) * std + + with torch.no_grad(): + return tensor.uniform_(-bound, bound, generator=generator) + + +class VeraModel(BaseTuner): + """ + Creates Vector-based Random Matrix Adaptation (Vera) model from a pretrained transformers model. + + Args: + model ([`~transformers.PreTrainedModel`]): The model to be adapted. + config ([`VeraConfig`]): The configuration of the Vera model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. 
def _find_dim(self, config) -> tuple[int, int]:
    """
    Return the element-wise maximum `(out_features, in_features)` over all targeted `nn.Linear` / `Conv1D` modules.

    The result is used to size the shared vera_A and vera_B matrices so that every wrapped layer can slice what it
    needs out of them.

    Raises:
        ValueError: If no targeted module is an `nn.Linear` or `Conv1D`.
    """
    base_model_config = self.get_model_config(self.model)

    resolved_config = self._prepare_adapter_config(config, base_model_config)
    resolved_config = _maybe_include_all_linear_layers(resolved_config, self.model)

    running_max = None
    for module_name, candidate in self.model.named_modules():
        if not self._check_target_module_exists(resolved_config, module_name):
            continue

        if isinstance(candidate, nn.Linear):
            shape = candidate.out_features, candidate.in_features
        elif isinstance(candidate, Conv1D):
            weight = candidate.weight
            shape = weight.ds_shape if hasattr(weight, "ds_shape") else weight.shape
            # Reverse the weight shape so that it is ordered like the nn.Linear case above.
            shape = shape[::-1]
        else:
            continue

        if running_max is None:
            running_max = shape
        elif shape != running_max:
            running_max = tuple(max(current, new) for current, new in zip(running_max, shape))

    if running_max is None:
        msg = "No layers types compatible with VeRA were found. Please check `peft_config.target_modules`."
        raise ValueError(msg)

    return running_max
+ ) + + save_project_unique_values = sorted({config.save_projection for config in self.peft_config.values()}) + if len(save_project_unique_values) > 1: + raise ValueError( + "VeRA projection weights must be saved for all adapters or none, but got multiple different values: " + f"{save_project_unique_values}" + ) + + def _create_and_replace( + self, + vera_config, + adapter_name, + target, + target_name, + parent, + current_key, + **optional_kwargs, + ): + if current_key is None: + raise ValueError("Current Key shouldn't be `None`") + + r = vera_config.r + bias = hasattr(target, "bias") and target.bias is not None + kwargs = { + "r": r, + "vera_dropout": vera_config.vera_dropout, + "fan_in_fan_out": vera_config.fan_in_fan_out, + "init_weights": vera_config.init_weights, + "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False), + "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False), + } + kwargs["bias"] = bias + + if isinstance(target, Linear): + target.update_layer( + adapter_name, + self.vera_A, + self.vera_B, + r, + vera_config.vera_dropout, + vera_config.init_weights, + d_initial=vera_config.d_initial, + ) + else: + new_module = self._create_new_module(vera_config, self.vera_A, self.vera_B, adapter_name, target, **kwargs) + if adapter_name not in self.active_adapter: + # adding an additional adapter: it is not automatically trainable + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + + @staticmethod + def _create_new_module(vera_config, vera_A, vera_B, adapter_name, target, **kwargs): + # avoid eager bnb import + if is_bnb_available(): + import bitsandbytes as bnb + + from .bnb import Linear8bitLt + + if is_bnb_4bit_available(): + from .bnb import Linear4bit + + bias = kwargs.pop("bias", False) + loaded_in_8bit = kwargs.get("loaded_in_8bit", False) + loaded_in_4bit = kwargs.get("loaded_in_4bit", False) + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() 
+ else: + target_base_layer = target + + if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): + eightbit_kwargs = kwargs.copy() + eightbit_kwargs.update( + { + "has_fp16_weights": target_base_layer.state.has_fp16_weights, + "threshold": target_base_layer.state.threshold, + "index": target_base_layer.index, + } + ) + return Linear8bitLt(target, adapter_name, vera_A, vera_B, **eightbit_kwargs) + elif loaded_in_4bit and isinstance(target_base_layer, bnb.nn.Linear4bit): + fourbit_kwargs = kwargs.copy() + fourbit_kwargs.update( + { + "compute_dtype": target_base_layer.compute_dtype, + "compress_statistics": target_base_layer.weight.compress_statistics, + "quant_type": target_base_layer.weight.quant_type, + } + ) + return Linear4bit(target, adapter_name, vera_A, vera_B, **fourbit_kwargs) + elif isinstance(target_base_layer, torch.nn.Linear): + if kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." + ) + kwargs["fan_in_fan_out"] = vera_config.fan_in_fan_out = False + elif isinstance(target_base_layer, Conv1D): + kwargs["is_target_conv_1d_layer"] = True + if not kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. Setting fan_in_fan_out to True." + ) + kwargs["fan_in_fan_out"] = vera_config.fan_in_fan_out = True + else: + raise ValueError( + f"Target module {target} is not supported. Currently, only the following modules are supported: " + "`torch.nn.Linear`, `transformers.pytorch_utils.Conv1D`." 
@dataclass
class WaveFTConfig(PeftConfig):
    """
    This is the configuration class to store the configuration of a [`WaveFTModel`]. It is used to define the
    parameters for Wavelet-based Fine-Tuning (WaveFT), an approach that leverages the sparsity of wavelet transforms
    for parameter-efficient fine-tuning of pretrained models.

    Args:
        n_frequency (`int`):
            Number of learnable wavelet coefficients for the Discrete Wavelet Transform (DWT). 'n_frequency' is an
            integer that is greater than 0 and less than or equal to the total number of elements in the original
            weight matrix (d_out * d_in). This parameter directly controls the number of trainable parameters for each
            adapted layer. A higher 'n_frequency' generally leads to better performance but also increases GPU memory
            usage, with a minor impact on training speed.
        scaling (`float`):
            The scaling factor applied to the reconstructed delta W matrix. This is a crucial hyperparameter, analogous
            to `lora_alpha` in LoRA. It can be tuned during hyperparameter search. Our default value for SDXL
            personalization is 25.
        wavelet_family (`str`):
            The wavelet family (e.g., 'db1', 'sym2', 'coif1') to use for the DWT and Inverse DWT (IDWT). Defaults to
            'db1' (Haar wavelet). Different wavelet families have varying filter lengths which affect the training
            time substantially. Must be one of the keys of `WAVELET_REDUCTIONS`.
        use_idwt (`bool`):
            Whether to use the Inverse Discrete Wavelet Transform (IDWT) to reconstruct the delta weights from the
            learned wavelet coefficients. If `True` (default), the IDWT is applied. If `False`, the learned
            coefficients are directly used to form a sparse delta weight matrix, which is faster but performs worse
            for the SDXL personalization task. Set to `False` for efficient adaptation.
        random_loc_seed (`int`):
            Seed for determining the random locations of the `n_frequency` learnable wavelet coefficients within the
            full wavelet coefficient matrix.
        target_modules (`Union[list[str],str]`):
            List of module names or a regex expression identifying the modules to be adapted with WaveFT. For example,
            `['q_proj', 'v_proj']` or `'.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'`. Currently, only linear
            layers (`torch.nn.Linear`) are supported.
        exclude_modules (`Optional[Union[List[str], str]]`):
            List of module names or a regex expression for modules to exclude from WaveFT adaptation.
        fan_in_fan_out (`bool`):
            Set to `True` if the weights of the layer to be replaced are stored in `(fan_in, fan_out)` format. Default
            is `False`.
        bias (`str`):
            Bias type for WaveFT. Can be 'none', 'all', or 'waveft_only'. If 'waveft_only', biases are added only to
            the WaveFT components. If 'all', biases are added to both base and WaveFT components. If 'none', no new
            biases are added.
        modules_to_save (`list[str]`):
            List of modules, in addition to WaveFT layers, that should be marked as trainable and saved in the final
            checkpoint. Useful for layers like classifiers in sequence or token classification tasks that are randomly
            initialized and need training.
        layers_to_transform (`Union[list[int],int]`):
            Specific layer indices to transform. If provided, PEFT will only adapt layers at these indices. If a single
            integer is given, only that layer is transformed.
        layers_pattern (`Optional[Union[List[str], str]]`):
            Pattern for layer names, used if `layers_to_transform` is specified and the layer pattern is not standard
            (e.g., not 'layers' or 'h'). This should target the `nn.ModuleList` attribute in the model.
        n_frequency_pattern (`dict`):
            A dictionary mapping layer names (or regex) to specific `n_frequency` values, overriding the global
            `n_frequency`. Example: `{"model.decoder.layers.0.encoder_attn.k_proj": 1000}`. Passing `None` is
            treated the same as an empty dictionary.
        init_weights (`bool`):
            Initialization strategy for the learnable wavelet coefficients (spectrum). If `True` (default),
            coefficients are initialized to zeros. If `False`, coefficients are initialized from a standard normal
            distribution scaled by a small factor.
        proportional_parameters (`bool`):
            If `True`, `n_frequency` is allocated proportionally to each layer's `input_dim * output_dim`. Default is
            `False`. Note: This option is included for experimental thoroughness to allow researchers to reproduce
            paper results, rather than for practical utility, as no beneficial scenarios have been identified.

    Raises:
        ValueError: If `layers_to_transform` or `layers_pattern` is combined with a string (regex)
            `target_modules`, if `layers_pattern` is given without `layers_to_transform`, or if `wavelet_family` is
            not a key of `WAVELET_REDUCTIONS`.
    """

    n_frequency: int = field(
        default=2592,
        metadata={
            "help": (
                "Number of learnable wavelet coefficients for the Discrete Wavelet Transform (DWT). "
                "'n_frequency' is an integer that is greater than 0 and less than or equal to the "
                "total number of elements in the original weight matrix (d_out * d_in). "
                "This parameter directly controls the number of trainable parameters for each adapted layer. "
                "A higher 'n_frequency' generally leads to better performance but also increases "
                "GPU memory usage, with a minor impact on training speed."
            )
        },
    )
    scaling: float = field(
        default=25.0,  # the value documented for SDXL personalization
        metadata={
            "help": (
                "The scaling factor applied to the reconstructed delta W matrix. This is a crucial "
                "hyperparameter, analogous to 'lora_alpha' in LoRA. It can be tuned during hyperparameter "
                "search. Default value for SDXL personalization is 25. "
            )
        },
    )
    wavelet_family: str = field(
        default="db1",
        metadata={
            "help": (
                "The wavelet family (e.g., 'db1', 'sym2', 'coif1') to use for the DWT and Inverse DWT (IDWT). "
                "Defaults to 'db1' (Haar wavelet). Different wavelet families have varying filter lengths "
                "which affect the training time substantially. Size differences are handled automatically "
                "if use_idwt is True."
            )
        },
    )
    use_idwt: bool = field(
        default=True,
        metadata={
            "help": (
                "Set to False for efficient adaptation. "
                "Whether to use the Inverse Discrete Wavelet Transform (IDWT) to reconstruct the delta "
                "weights from the learned wavelet coefficients. If True (default), the IDWT is applied. "
                "If False, the learned coefficients are directly used to form a sparse delta weight matrix, "
                "which is faster but performs worse for the SDXL personalization task."
            )
        },
    )
    random_loc_seed: int = field(
        default=777,
        metadata={
            "help": (
                "Seed for determining the random locations of the 'n_frequency' learnable wavelet "
                "coefficients within the full wavelet coefficient matrix."
            )
        },
    )
    fan_in_fan_out: bool = field(
        default=False,
        metadata={
            "help": (
                "Set to True if the weights of the layer to be replaced are stored in (fan_in, fan_out) "
                "format. Default is False."
            )
        },
    )
    target_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": (
                "List of module names or a regex expression identifying the modules to be adapted with WaveFT. "
                "For example, ['q_proj', 'v_proj'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'. "
                "Currently, only linear layers (torch.nn.Linear) are supported."
            )
        },
    )
    exclude_modules: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={"help": "List of module names or regex for modules to exclude from WaveFT adaptation."},
    )
    bias: str = field(
        default="none",
        metadata={
            "help": (
                "Bias type for WaveFT. Can be 'none', 'all', or 'waveft_only'. "
                "If 'waveft_only', biases are added only to the WaveFT components. "
                "If 'all', biases are added to both base and WaveFT components. "
                "If 'none', no new biases are added."
            )
        },
    )
    modules_to_save: Optional[list[str]] = field(
        default=None,
        metadata={
            "help": (
                "List of modules, in addition to WaveFT layers, that should be marked as trainable "
                "and saved in the final checkpoint. Useful for layers like classifiers in sequence "
                "or token classification tasks that are randomly initialized and need training."
            )
        },
    )
    layers_to_transform: Optional[Union[list[int], int]] = field(
        default=None,
        metadata={
            "help": (
                "Specific layer indices to transform. If provided, PEFT will only adapt layers at these "
                "indices. If a single integer is given, only that layer is transformed."
            )
        },
    )
    layers_pattern: Optional[Union[list[str], str]] = field(
        default=None,
        metadata={
            "help": (
                "Pattern for layer names, used if `layers_to_transform` is specified and the layer "
                "pattern is not standard (e.g., not 'layers' or 'h'). This should target the "
                "`nn.ModuleList` attribute in the model."
            )
        },
    )
    n_frequency_pattern: Optional[dict] = field(
        default_factory=dict,
        metadata={
            "help": (
                "A dictionary mapping layer names (or regex) to specific `n_frequency` values, "
                'overriding the global `n_frequency`. Example: {"model.decoder.layers.0.encoder_attn.k_proj": 1000}.'
            )
        },
    )
    proportional_parameters: bool = field(
        default=False,
        metadata={
            "help": (
                "If True, 'n_frequency' is allocated proportionally to each layer's "
                "input_dim * output_dim. Default is False. Note: This option is included "
                "for experimental thoroughness to allow researchers to reproduce paper results, "
                "rather than for practical utility, as no beneficial scenarios have been identified."
            )
        },
    )
    init_weights: bool = field(
        default=True,
        metadata={
            "help": (
                "Initialization strategy for the learnable wavelet coefficients (spectrum). "
                "If True (default), coefficients are initialized to zeros. "
                "If False, coefficients are initialized from a standard normal distribution scaled by a small factor."
            )
        },
    )

    def __post_init__(self):
        """Normalise list-valued fields and validate mutually dependent options."""
        super().__post_init__()
        self.peft_type = PeftType.WAVEFT

        # Lists become sets; strings (regex) and None are kept untouched.
        if isinstance(self.target_modules, list):
            self.target_modules = set(self.target_modules)
        if isinstance(self.exclude_modules, list):
            self.exclude_modules = set(self.exclude_modules)

        # The field is typed Optional, but consumers call `.keys()` / `.get()` on it, so an explicit
        # `None` is mapped to an empty dictionary instead of failing later with an AttributeError.
        if self.n_frequency_pattern is None:
            self.n_frequency_pattern = {}

        # if target_modules is a regex expression, then layers_to_transform should be None
        if isinstance(self.target_modules, str) and self.layers_to_transform is not None:
            raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.")

        # if target_modules is a regex expression, then layers_pattern should be None
        if isinstance(self.target_modules, str) and self.layers_pattern is not None:
            raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.")

        # check for layers_to_transform and layers_pattern
        if self.layers_pattern and not self.layers_to_transform:
            raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ")

        if self.wavelet_family not in WAVELET_REDUCTIONS:
            raise ValueError(
                f"Wavelet family {self.wavelet_family} not supported. "
                f"Supported wavelet families are: {list(WAVELET_REDUCTIONS.keys())}"
            )
# Per-family (rows, cols) reduction table, generated instead of spelled out entry by entry.
# The reduction is the same along both axes and grows linearly with the wavelet order N:
#   * "dbN"   (N = 1..38): 2 * (N - 1)
#   * "symN"  (N = 2..20): 2 * (N - 1)
#   * "coifN" (N = 1..17): 6 * N - 2
# Insertion order is db*, then sym*, then coif*, each in ascending order.
WAVELET_REDUCTIONS = {
    **{f"db{order}": (2 * (order - 1),) * 2 for order in range(1, 39)},
    **{f"sym{order}": (2 * (order - 1),) * 2 for order in range(2, 21)},
    **{f"coif{order}": (6 * order - 2,) * 2 for order in range(1, 18)},
}
class WaveFTLayer(BaseTunerLayer):
    """Mixin holding the per-adapter WaveFT state for a wrapped `nn.Linear` or `Conv1D` base layer.

    For every adapter the layer stores a 1-D trainable `waveft_spectrum` of `n_frequency` coefficients together
    with the fixed (row, col) positions of those coefficients (`waveft_indices`), the scaling factor, the wavelet
    family, the location seed and whether the inverse wavelet transform is applied.
    """

    # All names of layers that may contain (trainable) adapter weights
    adapter_layer_names = ("waveft_spectrum",)
    # All names of other parameters that may contain adapter-related parameters
    other_param_names = (
        "waveft_n_frequency",
        "waveft_scaling",
        "waveft_random_loc_seed",
        "waveft_wavelet_family",
        "waveft_indices",
        "waveft_use_idwt",
    )

    def __init__(self, base_layer: nn.Module, **kwargs) -> None:
        """Set up empty per-adapter containers and read the feature sizes from the base layer.

        Raises:
            ValueError: If the (unwrapped) base layer is neither `nn.Linear` nor `Conv1D`.
        """
        self.base_layer = base_layer
        self.waveft_n_frequency = {}
        self.waveft_scaling = {}
        self.waveft_spectrum = nn.ParameterDict({})
        self.waveft_wavelet_family = {}
        self.waveft_indices = {}
        self.waveft_random_loc_seed = {}
        self.waveft_use_idwt = {}
        # Mark the weight as unmerged
        self._disable_adapters = False
        self.merged_adapters = []
        self.kwargs = kwargs

        base_layer = self.get_base_layer()
        if isinstance(base_layer, nn.Linear):
            self.in_features, self.out_features = base_layer.in_features, base_layer.out_features
        elif isinstance(base_layer, Conv1D):
            # Prefer `ds_shape` when the weight exposes it, otherwise fall back to the regular shape.
            self.in_features, self.out_features = (
                base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape
            )
        else:
            raise ValueError(f"Unsupported layer type {type(base_layer)}")

    def update_layer(
        self, adapter_name, n_frequency, scaling, init_weights, random_loc_seed, wavelet_family="db1", use_idwt=True
    ):
        """Create (or overwrite) the WaveFT state of adapter `adapter_name`.

        Args:
            adapter_name: Name of the adapter to register.
            n_frequency: Number of trainable coefficients; must satisfy
                `0 < n_frequency <= in_features * out_features`.
            scaling: Factor the delta weight is multiplied with.
            init_weights: If truthy the spectrum starts at zero, otherwise it is drawn from a normal
                distribution with a standard deviation of 0.01.
            random_loc_seed: Seed of the generator that picks the coefficient locations.
            wavelet_family: Key of `WAVELET_REDUCTIONS` naming the wavelet to use.
            use_idwt: Whether `get_delta_weight` reconstructs the delta through the inverse wavelet transform.

        Raises:
            ValueError: If `n_frequency` is out of range.
            KeyError: If `wavelet_family` is not a key of `WAVELET_REDUCTIONS`.
        """
        if n_frequency <= 0:
            raise ValueError(f"`n_frequency` should be a positive integer value but the value passed is {n_frequency}")
        if n_frequency > self.in_features * self.out_features:
            raise ValueError(
                f"`n_frequency` should be less than or equal to the product of the input and output dimensions "
                f"but the value passed is {n_frequency} and the product is {self.in_features * self.out_features}"
            )
        # Fail fast on an unknown family, before any per-adapter state has been written. The reduction
        # values themselves are only needed in `get_delta_weight`.
        if wavelet_family not in WAVELET_REDUCTIONS:
            raise KeyError(wavelet_family)

        self.waveft_n_frequency[adapter_name] = n_frequency
        self.waveft_random_loc_seed[adapter_name] = random_loc_seed
        self.waveft_wavelet_family[adapter_name] = wavelet_family
        self.waveft_use_idwt[adapter_name] = use_idwt
        self.waveft_scaling[adapter_name] = scaling

        # Pick `n_frequency` distinct flat positions inside the original (out_features, in_features)
        # matrix; padding for the wavelet transform is handled separately in `get_delta_weight`.
        generator = torch.Generator().manual_seed(random_loc_seed)
        flat_positions = torch.randperm(self.out_features * self.in_features, generator=generator)[:n_frequency]

        # Convert to (row, col) format for the original dimensions.
        self.waveft_indices[adapter_name] = torch.stack(
            [flat_positions // self.in_features, flat_positions % self.in_features], dim=0
        )

        # Actual trainable parameters
        if init_weights:
            self.waveft_spectrum[adapter_name] = nn.Parameter(torch.empty(n_frequency), requires_grad=True)
            self.reset_wave_parameters(adapter_name)  # zero-initialise
        else:
            # Small standard deviation for the initial random weights to prevent explosion.
            std_dev = 0.01
            self.waveft_spectrum[adapter_name] = nn.Parameter(torch.randn(n_frequency) * std_dev, requires_grad=True)

        self._move_adapter_to_device_of_base_layer(adapter_name)
        self.set_adapter(self.active_adapters)

    @torch.no_grad()
    def reset_wave_parameters(self, adapter_name):
        """Zero the spectrum of `adapter_name`; does nothing if the adapter has no spectrum."""
        if adapter_name in self.waveft_spectrum.keys():
            nn.init.zeros_(self.waveft_spectrum[adapter_name])

    def get_delta_weight(self, adapter) -> torch.Tensor:
        """Build the dense `(out_features, in_features)` delta weight of `adapter`.

        Without IDWT the spectrum values are scattered directly into a zero matrix. With IDWT they are scattered
        into a zero matrix enlarged by the family's entry in `WAVELET_REDUCTIONS` (and rounded up to even sizes),
        the matrix is split into four quadrants that are passed to `waverec2d` as `(cA, (cH, cV, cD))`, and the
        result is centre-cropped to the target shape. In both cases the result is multiplied by the adapter's
        scaling.
        """
        spectrum = self.waveft_spectrum[adapter]
        indices = self.waveft_indices[adapter].to(spectrum.device)
        wavelet_family = self.waveft_wavelet_family[adapter]
        scaling = self.waveft_scaling[adapter]

        if not self.waveft_use_idwt[adapter]:
            # Simple direct use of spectrum without IDWT
            dense_spectrum = torch.zeros(
                self.out_features, self.in_features, device=spectrum.device, dtype=spectrum.dtype
            )
            dense_spectrum[indices[0], indices[1]] = spectrum
            return dense_spectrum * scaling

        reduction_rows, reduction_cols = WAVELET_REDUCTIONS[wavelet_family]

        # Enlarge the matrix by the reduction of the wavelet family, then round each side up to an even
        # number so that it can be split into equally sized quadrants.
        padded_out_features = self.out_features + reduction_rows
        padded_in_features = self.in_features + reduction_cols
        padded_out_features += padded_out_features % 2
        padded_in_features += padded_in_features % 2

        dense_spectrum = torch.zeros(
            padded_out_features, padded_in_features, device=spectrum.device, dtype=spectrum.dtype
        )

        # Centre the original index range inside the padded matrix. Each offset is at most the total
        # padding of its axis, so a shifted index (< original size + offset) is always in bounds and no
        # validity mask is required.
        row_offset = (padded_out_features - self.out_features) // 2
        col_offset = (padded_in_features - self.in_features) // 2
        dense_spectrum[indices[0] + row_offset, indices[1] + col_offset] = spectrum

        # Split into four sub-bands
        half_rows, half_cols = padded_out_features // 2, padded_in_features // 2
        cA = dense_spectrum[:half_rows, :half_cols]  # top-left
        cH = dense_spectrum[:half_rows, half_cols:]  # top-right
        cV = dense_spectrum[half_rows:, :half_cols]  # bottom-left
        cD = dense_spectrum[half_rows:, half_cols:]  # bottom-right

        # Reconstruct with the specified wavelet family
        delta_weight = waverec2d((cA, (cH, cV, cD)), wavelet_family) * scaling

        # Ensure the delta weight has exactly the correct dimensions
        if delta_weight.shape[0] != self.out_features or delta_weight.shape[1] != self.in_features:
            # Centred crop down to the exact output size needed
            start_row = (delta_weight.shape[0] - self.out_features) // 2
            start_col = (delta_weight.shape[1] - self.in_features) // 2
            delta_weight = delta_weight[
                start_row : start_row + self.out_features, start_col : start_col + self.in_features
            ]

        return delta_weight
def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None:
    """
    Fold the delta weights of the selected adapters into the base layer's weight.

    Args:
        safe_merge (`bool`, *optional*):
            When `True`, the sum is first computed on a copy of the base weight and checked for non-finite
            values; the base weight is only replaced if the check passes. Slower because of the copy. Defaults
            to `False`.
        adapter_names (`List[str]`, *optional*):
            Names of the adapters to merge. `None` is resolved by `check_adapters_to_merge`. Defaults to `None`.

    Raises:
        ValueError: With `safe_merge=True`, if the merged weight contains non-finite values.
    """
    names_to_merge = check_adapters_to_merge(self, adapter_names)
    if not names_to_merge:
        # no adapter to merge
        return

    for name in names_to_merge:
        if name not in self.waveft_spectrum.keys():
            continue

        wrapped = self.get_base_layer()
        if not safe_merge:
            wrapped.weight.data += self.get_delta_weight(name)
        else:
            # Work on a copy so that a broken adapter never touches the real weight.
            candidate = wrapped.weight.data.clone()
            candidate += self.get_delta_weight(name)

            if not torch.isfinite(candidate).all():
                raise ValueError(f"NaNs detected in the merged weights. The adapter {name} seems to be broken")

            wrapped.weight.data = candidate
        self.merged_adapters.append(name)
def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
    """Run the base layer and, unless adapters are disabled or merged, add every active adapter's output.

    The result is cast back to the dtype `x` had on entry.
    """
    input_dtype = x.dtype
    adapters_disabled = self.disable_adapters

    # Disabled adapters must not leak through merged weights.
    if adapters_disabled and self.merged:
        self.unmerge()

    output = self.base_layer(x, *args, **kwargs)

    # With adapters disabled, or already folded into the base weight, the base output is final.
    if adapters_disabled or self.merged:
        return output.to(input_dtype)

    for name in self.active_adapters:
        if name in self.waveft_spectrum.keys():
            delta_w = self.get_delta_weight(name)
            x = self._cast_input_dtype(x, delta_w.dtype)
            output = output + F.linear(x, delta_w)

    return output.to(input_dtype)
def _calculate_proportional_parameters(self, model: torch.nn.Module, waveft_config):
    """Calculate proportional parameter allocation for all target modules.

    The overall budget is `waveft_config.n_frequency * <number of target layers>`; each target layer receives
    a share proportional to its `input_dim * output_dim`, rounded to the nearest integer and never less
    than 1.

    Args:
        model: Model whose `named_modules()` are scanned for targets.
        waveft_config: Config used both for target matching and for the per-layer `n_frequency` budget.

    Returns:
        dict mapping the module name of every supported target layer to its `n_frequency`.

    Raises:
        ValueError: If no supported target module is found.
    """
    layer_sizes = []  # (module name, number of weight elements)
    for name, module in model.named_modules():
        if not check_target_module_exists(waveft_config, name):
            continue

        # A module that is already wrapped with WaveFT is measured through its base layer.
        layer = module.base_layer if isinstance(module, WaveFTLayer) else module
        if isinstance(layer, torch.nn.Linear):
            num_weights = layer.in_features * layer.out_features
        elif isinstance(layer, Conv1D):
            num_weights = layer.weight.shape[0] * layer.weight.shape[1]
        else:
            continue
        layer_sizes.append((name, num_weights))

    if not layer_sizes:
        raise ValueError("No target modules found for proportional parameter allocation.")

    total_sum = sum(num_weights for _, num_weights in layer_sizes)
    total_budget = waveft_config.n_frequency * len(layer_sizes)

    # Rounding can push the share of a comparatively small layer down to 0, which the layer rejects
    # as an invalid `n_frequency`; give every layer at least one coefficient.
    return {name: max(1, round(num_weights / total_sum * total_budget)) for name, num_weights in layer_sizes}
From default n_frequency in config + n_frequency = None + if ( + waveft_config.proportional_parameters + and hasattr(self, "_proportional_params_cache") + and adapter_name in self._proportional_params_cache + ): + n_frequency = self._proportional_params_cache[adapter_name].get(current_key) + + if n_frequency is None and "n_frequency" in optional_kwargs: + n_frequency = optional_kwargs["n_frequency"] + + if n_frequency is None: + pattern_keys = list(waveft_config.n_frequency_pattern.keys()) + target_name_key = get_pattern_key(pattern_keys, current_key) + n_frequency = waveft_config.n_frequency_pattern.get(target_name_key, waveft_config.n_frequency) + + # Determine wavelet_family + wavelet_family = None + if "wavelet_family" in optional_kwargs: + wavelet_family = optional_kwargs["wavelet_family"] + if wavelet_family is None: + wavelet_family = waveft_config.wavelet_family + + scaling = waveft_config.scaling + random_loc_seed = waveft_config.random_loc_seed + bias = hasattr(target, "bias") and target.bias is not None + # Prepare kwargs for module creation/update + kwargs = { + "n_frequency": n_frequency, + "scaling": scaling, + "fan_in_fan_out": waveft_config.fan_in_fan_out, + "init_weights": waveft_config.init_weights, + "random_loc_seed": waveft_config.random_loc_seed, + "wavelet_family": wavelet_family, # Use determined wavelet family + } + kwargs["bias"] = bias + + if isinstance(target, WaveFTLayer): + target.update_layer( + adapter_name, + n_frequency, + scaling, + waveft_config.init_weights, + random_loc_seed, + wavelet_family=wavelet_family, # Pass determined wavelet family + use_idwt=waveft_config.use_idwt, + ) + else: + new_module = self._create_new_module(waveft_config, adapter_name, target, **kwargs) + if adapter_name != self.active_adapter: + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + + @staticmethod + def _create_new_module(waveft_config, adapter_name, target, **kwargs): + if isinstance(target, 
BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if isinstance(target_base_layer, torch.nn.Linear): + if kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." + ) + kwargs["fan_in_fan_out"] = waveft_config.fan_in_fan_out = False + elif isinstance(target_base_layer, Conv1D): + kwargs["is_target_conv_1d_layer"] = True + if not kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. Setting fan_in_fan_out to True." + ) + kwargs["fan_in_fan_out"] = waveft_config.fan_in_fan_out = True + else: + raise ValueError( + f"Target module {target} is not supported. Currently, only the following modules are supported: " + "`torch.nn.Linear`." + ) + + kwargs["wavelet_family"] = waveft_config.wavelet_family + kwargs["use_idwt"] = waveft_config.use_idwt + new_module = WaveFTLinear(target, adapter_name, **kwargs) + + return new_module + + def delete_adapter(self, adapter_name: str) -> None: + """ + Deletes an existing adapter. + + Args: + adapter_name (str): Name of the adapter to be deleted. + """ + super().delete_adapter(adapter_name) + # Clean up proportional parameters cache + if hasattr(self, "_proportional_params_cache") and adapter_name in self._proportional_params_cache: + del self._proportional_params_cache[adapter_name] diff --git a/peft/src/peft/tuners/waveft/wavelet.py b/peft/src/peft/tuners/waveft/wavelet.py new file mode 100644 index 0000000000000000000000000000000000000000..c66acd85f65570652f18d47d5bed12eb3a174ea5 --- /dev/null +++ b/peft/src/peft/tuners/waveft/wavelet.py @@ -0,0 +1,513 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Minimal wavelet implementation extracted from PyWavelets

This code contains portions derived from PyWavelets: Copyright (c) 2006-2012 Filip Wasilewski
Copyright (c) 2012- The PyWavelets Developers

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Original source: https://github.com/PyWavelets/pywt
"""

import math
from collections.abc import Sequence


class Wavelet:
    """
    Minimal wavelet class that implements the most commonly used wavelets.

    Supports:
    - Daubechies wavelets: db1-db10, haar
    - Symlets: sym2-sym10
    - Coiflets: coif1-coif5
    """

    def __init__(self, name: str):
        """
        Initialize a wavelet by name.

        Args:
            name: Wavelet name (e.g., 'db4', 'haar', 'sym5', 'coif2'). Matching is case-insensitive.

        Raises:
            ValueError: If the name is not one of the names returned by `wavelist()`.
        """
        self.name = name.lower()
        self._compute_filters()

    def _compute_filters(self):
        """Compute the four filter banks from the base coefficients."""
        # Haar is the same as db1
        table_key = "db1" if self.name == "haar" else self.name
        try:
            base_coeffs = _WAVELET_COEFFS[table_key]
        except KeyError:
            # List every accepted name, including the "haar" alias that is not a key of the table.
            raise ValueError(f"Unknown wavelet name '{self.name}'. Available wavelets: {wavelist()}") from None

        # Coiflet tables stored with unit sum need sqrt(2) scaling; tables listed in
        # _PRENORMALIZED_COEFFS already carry that factor.
        needs_scaling = self.name.startswith("coif") and self.name not in _PRENORMALIZED_COEFFS
        scale_factor = math.sqrt(2) if needs_scaling else 1.0

        # rec_lo = scaled base coefficients
        rec_lo = [c * scale_factor for c in base_coeffs]
        n_taps = len(rec_lo)

        # Filter banks following the PyWavelets convention
        self._rec_lo = rec_lo
        # dec_lo = rec_lo reversed
        self._dec_lo = rec_lo[::-1]
        # rec_hi = dec_lo with alternating signs
        self._rec_hi = [(-1) ** i * c for i, c in enumerate(reversed(rec_lo))]
        # dec_hi = rec_lo with alternating signs
        self._dec_hi = [(-1) ** (n_taps - 1 - i) * c for i, c in enumerate(rec_lo)]

    @property
    def dec_lo(self) -> Sequence[float]:
        """Lowpass decomposition filter."""
        return self._dec_lo

    @property
    def dec_hi(self) -> Sequence[float]:
        """Highpass decomposition filter."""
        return self._dec_hi

    @property
    def rec_lo(self) -> Sequence[float]:
        """Lowpass reconstruction filter."""
        return self._rec_lo

    @property
    def rec_hi(self) -> Sequence[float]:
        """Highpass reconstruction filter."""
        return self._rec_hi

    @property
    def dec_len(self) -> int:
        """Decomposition filters length."""
        return len(self._dec_lo)

    @property
    def rec_len(self) -> int:
        """Reconstruction filters length."""
        return len(self._rec_lo)

    @property
    def filter_bank(self) -> tuple[Sequence[float], Sequence[float], Sequence[float], Sequence[float]]:
        """Tuple of all four filter banks (dec_lo, dec_hi, rec_lo, rec_hi)."""
        return (self.dec_lo, self.dec_hi, self.rec_lo, self.rec_hi)

    def __len__(self) -> int:
        """Return the length of the decomposition filters."""
        return self.dec_len

    def __repr__(self) -> str:
        return f"Wavelet(name='{self.name}')"


# Names whose table below already holds the final (sqrt(2)-normalised) reconstruction lowpass filter and
# therefore must not be scaled again in `Wavelet._compute_filters`.
_PRENORMALIZED_COEFFS = frozenset({"coif4", "coif5"})

# Wavelet coefficients extracted from PyWavelets
# These are the reconstruction lowpass filter coefficients
_WAVELET_COEFFS = {
    # Daubechies wavelets
    "db1": [
        0.7071067811865475244008443621048490392848359376884740365883398,
        0.7071067811865475244008443621048490392848359376884740365883398,
    ],
    "db2": [
        0.4829629131445341433748715998644486838169524195042022752011715,
        0.8365163037378079055752937809168732034593703883484392934953414,
        0.2241438680420133810259727622404003554678835181842717613871683,
        -0.1294095225512603811744494188120241641745344506599652569070016,
    ],
    "db3": [
        0.3326705529500826159985115891390056300129233992450683597084705,
        0.80689150931333875,
        0.45987750211933132,
        -0.13501102001039084,
        -0.085441273882241486,
        0.035226291882100656,
    ],
    "db4": [
        0.2303778133088965008632911830440708500016152482483092977910968,
        0.7148465705529156470899219552739926037076084010993081758450110,
        0.6308807679298589078817163383006152202032229226771951174057473,
        -0.02798376941685985421141374718007538541198732022449175284003358,
        -0.1870348117190930840795706727890814195845441743745800912057770,
        0.03084138183556076362721936253495905017031482172003403341821219,
        0.03288301166688519973540751354924438866454194113754971259727278,
        -0.01059740178506903210488320852402722918109996490637641983484974,
    ],
    "db5": [
        0.1601023979741929144807237480204207336505441246250578327725699,
        0.6038292697971896705401193065250621075074221631016986987969283,
        0.7243085284377729277280712441022186407687562182320073725767335,
        0.1384281459013207315053971463390246973141057911739561022694652,
        -0.2422948870663820318625713794746163619914908080626185983913726,
        -0.03224486958463837464847975506213492831356498416379847225434268,
        0.07757149384004571352313048938860181980623099452012527983210146,
        -0.006241490212798274274190519112920192970763557165687607323417435,
        -0.01258075199908199946850973993177579294920459162609785020169232,
        0.003335725285473771277998183415817355747636524742305315099706428,
    ],
    "db6": [
        0.1115407433501094636213239172409234390425395919844216759082360,
        0.4946238903984530856772041768778555886377863828962743623531834,
        0.7511339080210953506789344984397316855802547833382612009730420,
        0.3152503517091976290859896548109263966495199235172945244404163,
        -0.2262646939654398200763145006609034656705401539728969940143487,
        -0.1297668675672619355622896058765854608452337492235814701599310,
        0.09750160558732304910234355253812534233983074749525514279893193,
        0.02752286553030572862554083950419321365738758783043454321494202,
        -0.03158203931748602956507908069984866905747953237314842337511464,
        0.0005538422011614961392519183980465012206110262773864964295476524,
        0.004777257510945510639635975246820707050230501216581434297593254,
        -0.001077301085308479564852621609587200035235233609334419689818580,
    ],
    "db7": [
        0.07785205408500917901996352195789374837918305292795568438702937,
        0.3965393194819173065390003909368428563587151149333287401110499,
        0.7291320908462351199169430703392820517179660611901363782697715,
        0.4697822874051931224715911609744517386817913056787359532392529,
        -0.1439060039285649754050683622130460017952735705499084834401753,
        -0.2240361849938749826381404202332509644757830896773246552665095,
        0.07130921926683026475087657050112904822711327451412314659575113,
        0.08061260915108307191292248035938190585823820965629489058139218,
        -0.03802993693501441357959206160185803585446196938467869898283122,
        -0.01657454163066688065410767489170265479204504394820713705239272,
        0.01255099855609984061298988603418777957289474046048710038411818,
        0.0004295779729213665211321291228197322228235350396942409742946366,
        -0.001801640704047490915268262912739550962585651469641090625323864,
        0.0003537137999745202484462958363064254310959060059520040012524275,
    ],
    "db8": [
        0.05441584224310400995500940520299935503599554294733050397729280,
        0.3128715909142999706591623755057177219497319740370229185698712,
        0.6756307362972898068078007670471831499869115906336364227766759,
        0.5853546836542067127712655200450981944303266678053369055707175,
        -0.01582910525634930566738054787646630415774471154502826559735335,
        -0.2840155429615469265162031323741647324684350124871451793599204,
        0.0004724845739132827703605900098258949861948011288770074644084096,
        0.1287474266204784588570292875097083843022601575556488795577000,
        -0.01736930100180754616961614886809598311413086529488394316977315,
        -0.04408825393079475150676372323896350189751839190110996472750391,
        0.01398102791739828164872293057263345144239559532934347169146368,
        0.008746094047405776716382743246475640180402147081140676742686747,
        -0.004870352993451574310422181557109824016634978512157003764736208,
        -0.0003917403733769470462980803573237762675229350073890493724492694,
        0.0006754494064505693663695475738792991218489630013558432103617077,
        -0.0001174767841247695337306282316988909444086693950311503927620013,
    ],
    "db9": [
        0.03807794736387834658869765887955118448771714496278417476647192,
        0.2438346746125903537320415816492844155263611085609231361429088,
        0.6048231236901111119030768674342361708959562711896117565333713,
        0.6572880780513005380782126390451732140305858669245918854436034,
        0.1331973858250075761909549458997955536921780768433661136154346,
        -0.2932737832791749088064031952421987310438961628589906825725112,
        -0.09684078322297646051350813353769660224825458104599099679471267,
        0.1485407493381063801350727175060423024791258577280603060771649,
        0.03072568147933337921231740072037882714105805024670744781503060,
        -0.06763282906132997367564227482971901592578790871353739900748331,
        0.0002509471148314519575871897499885543315176271993709633321834164,
        0.02236166212367909720537378270269095241855646688308853754721816,
        -0.004723204757751397277925707848242465405729514912627938018758526,
        -0.004281503682463429834496795002314531876481181811463288374860455,
        0.001847646883056226476619129491125677051121081359600318160732515,
        0.0002303857635231959672052163928245421692940662052463711972260006,
        -0.0002519631889427101369749886842878606607282181543478028214134265,
        0.00003934732031627159948068988306589150707782477055517013507359938,
    ],
    "db10": [
        0.02667005790055555358661744877130858277192498290851289932779975,
        0.1881768000776914890208929736790939942702546758640393484348595,
        0.5272011889317255864817448279595081924981402680840223445318549,
        0.6884590394536035657418717825492358539771364042407339537279681,
        0.2811723436605774607487269984455892876243888859026150413831543,
        -0.2498464243273153794161018979207791000564669737132073715013121,
        -0.1959462743773770435042992543190981318766776476382778474396781,
        0.1273693403357932600826772332014009770786177480422245995563097,
        0.09305736460357235116035228983545273226942917998946925868063974,
        -0.07139414716639708714533609307605064767292611983702150917523756,
        -0.02945753682187581285828323760141839199388200516064948779769654,
        0.03321267405934100173976365318215912897978337413267096043323351,
        0.003606553566956169655423291417133403299517350518618994762730612,
        -0.01073317548333057504431811410651364448111548781143923213370333,
        0.001395351747052901165789318447957707567660542855688552426721117,
        0.001992405295185056117158742242640643211762555365514105280067936,
        -0.0006858566949597116265613709819265714196625043336786920516211903,
        -0.0001164668551292854509514809710258991891527461854347597362819235,
        0.00009358867032006959133405013034222854399688456215297276443521873,
        -0.00001326420289452124481243667531226683305749240960605829756400674,
    ],
    # Symlets
    "sym2": [0.48296291314469025, 0.83651630373746899, 0.22414386804185735, -0.12940952255092145],
    "sym3": [
        0.33267055295095688,
        0.80689150931333875,
        0.45987750211933132,
        -0.13501102001039084,
        -0.085441273882241486,
        0.035226291882100656,
    ],
    "sym4": [
        0.032223100604042702,
        -0.012603967262037833,
        -0.099219543576847216,
        0.29785779560527736,
        0.80373875180591614,
        0.49761866763201545,
        -0.02963552764599851,
        -0.075765714789273325,
    ],
    "sym5": [
        0.019538882735286728,
        -0.021101834024758855,
        -0.17532808990845047,
        0.016602105764522319,
        0.63397896345821192,
        0.72340769040242059,
        0.1993975339773936,
        -0.039134249302383094,
        0.029519490925774643,
        0.027333068345077982,
    ],
    "sym6": [
        -0.007800708325034148,
        0.0017677118642428036,
        0.044724901770665779,
        -0.021060292512300564,
        -0.072637522786462516,
        0.3379294217276218,
        0.787641141030194,
        0.49105594192674662,
        -0.048311742585632998,
        -0.11799011114819057,
        0.0034907120842174702,
        0.015404109327027373,
    ],
    "sym7": [
        0.010268176708511255,
        0.0040102448715336634,
        -0.10780823770381774,
        -0.14004724044296152,
        0.28862963175151463,
        0.76776431700316405,
        0.5361019170917628,
        0.017441255086855827,
        -0.049552834937127255,
        0.067892693501372697,
        0.03051551316596357,
        -0.01263630340325193,
        -0.0010473848886829163,
        0.0026818145682578781,
    ],
    "sym8": [
        0.0018899503327594609,
        -0.0003029205147213668,
        -0.014952258337048231,
        0.0038087520138906151,
        0.049137179673607506,
        -0.027219029917056003,
        -0.051945838107709037,
        0.3644418948353314,
        0.77718575170052351,
        0.48135965125837221,
        -0.061273359067658524,
        -0.14329423835080971,
        0.0076074873249176054,
        0.031695087811492981,
        -0.00054213233179114812,
        -0.0033824159510061256,
    ],
    "sym9": [
        0.0010694900329086053,
        -0.00047315449868008311,
        -0.010264064027633142,
        0.0088592674934004842,
        0.06207778930288603,
        -0.018233770779395985,
        -0.19155083129728512,
        0.035272488035271894,
        0.61733844914093583,
        0.717897082764412,
        0.238760914607303,
        -0.054568958430834071,
        0.00058346274612580684,
        0.03022487885827568,
        -0.01152821020767923,
        -0.013271967781817119,
        0.00061978088898558676,
        0.0014009155259146807,
    ],
    "sym10": [
        -0.00045932942100465878,
        0.000057036083618494284,
        0.0045931735853118284,
        -0.00080435893201654491,
        -0.02035493981231129,
        0.0057649120335819086,
        0.049994972077376687,
        -0.0319900568824278,
        -0.035536740473817552,
        0.38382676106708546,
        0.7695100370211071,
        0.47169066693843925,
        -0.070880535783243853,
        -0.15949427888491757,
        0.011609893903711381,
        0.045927239231092203,
        -0.0014653825813050513,
        -0.0086412992770224222,
        0.000095632670722894754,
        0.00077015980911449011,
    ],
    # Coiflets (note: coif1-coif3 will be multiplied by sqrt(2) in the class)
    "coif1": [
        -0.05142972847076845595317549230122688830344559947132656813651045,
        0.2389297284707684559531754923012268883034455994713265681365104,
        0.6028594569415369119063509846024537766068911989426531362730209,
        0.2721405430584630880936490153975462233931088010573468637269790,
        -0.05142972847076845595317549230122688830344559947132656813651045,
        -0.01107027152923154404682450769877311169655440052867343186348954,
    ],
    "coif2": [
        0.01158759673871686817889714882853120395708315073355502818875931,
        -0.02932013798346856448679594524397843054053420947418409889774786,
        -0.04763959031100813225872995081511549408622753909592460525840745,
        0.2730210465347666137982239328923516270034828327990699588033501,
        0.5746823938568638472459483149751499367740786490481481391460366,
        0.2948671936956191896750637208703777973914107635455611537640778,
        -0.05408560709171142997443672832006888537570221990444706777525838,
        -0.04202648046077160694657530752545884878978719268926222513485613,
        0.01674441016327950635146257083249391698866289538037299820224006,
        0.003967883612962012109043447090269950094081810916481648252817197,
        -0.001289203356140659543141355500990678257894936161704492503370186,
        -0.0005095053991076441489598480835620951586540050976664367876412655,
    ],
    "coif3": [
        -0.002682418670922068664584689955153722375535836177157637134187840,
        0.005503126707831385107969640263617469178794666057252906037981936,
        0.01658356047917034608134280439996549525220639437145367606178002,
        -0.04650776447872697640390293095170192691113917841041002855534619,
        -0.04322076356021191118175840907244577856782537221435748296465882,
        0.2865033352736474630249006862976158896891076238443844211133873,
        0.5612852568703300445990941995240077241406247774064453800050914,
        0.3029835717728241602862575774374668529867757043461413348549577,
        -0.05077014075488886159516471867138370972545857441670871832472707,
        -0.05819625076158553022607041679522801089624825903982541419721721,
        0.02443409432116695639462954438418928805487699080947974989338820,
        0.01122924096203786563399489540091488781245346096838814728167341,
        -0.006369601011048822977293753932627342482077585617391852852955559,
        -0.001820458915566242322836631665832145136570132777862391313328351,
        0.0007902051009575939937150950543290226440287715441826917281929124,
        0.0003296651737931830308416338897758022998655744276957481989605186,
        -0.00005019277455327664998007173088097694083956570594580641192332170,
        -0.00002446573425530813115445387662881902303945941576472342106918209,
    ],
    # The coif4 / coif5 tables previously stored here did not sum to 1 (coif5 also had 48 instead of 30
    # taps), so the resulting filters were not orthonormal. The values below are the double-precision
    # reconstruction lowpass filters including the sqrt(2) factor (see _PRENORMALIZED_COEFFS).
    # NOTE(review): believed to equal pywt.Wavelet("coif4").rec_lo / pywt.Wavelet("coif5").rec_lo -
    # confirm against PyWavelets before release.
    "coif4": [
        0.0008923136685823146,
        -0.0016294920126017326,
        -0.0073461663276420935,
        0.016068943964776348,
        0.026682300156053072,
        -0.08126669968087875,
        -0.05607731331675481,
        0.41530840703043026,
        0.782238930920499,
        0.4343860564914685,
        -0.06662747426342504,
        -0.09622044203398798,
        0.03933442712333749,
        0.025082261844864097,
        -0.015211731527946259,
        -0.00565828668661072,
        0.003751436157278457,
        0.0012665619292989445,
        -0.0005890207562443383,
        -0.00025997455248771324,
        6.233903446100713e-05,
        3.1229875865345646e-05,
        -3.2596802368833675e-06,
        -1.7849850030882614e-06,
    ],
    "coif5": [
        -0.00021208083980379827,
        0.00035858968789573785,
        0.0021782363581090178,
        -0.004159358781386048,
        -0.010131117519849788,
        0.023408156785839195,
        0.02816802897093635,
        -0.09192001055969624,
        -0.05204316317624377,
        0.4215662066908515,
        0.7742896036529562,
        0.4379916261718371,
        -0.06203596396290357,
        -0.10557420870333893,
        0.0412892087501817,
        0.03268357426711183,
        -0.01976177894257264,
        -0.009164231162481846,
        0.006764185448053083,
        0.0024333732126576722,
        -0.0016628637020130838,
        -0.0006381313430451114,
        0.00030225958181306315,
        0.00014054114970203437,
        -4.134043227251251e-05,
        -2.1315026809955787e-05,
        3.7346551751414047e-06,
        2.0637618513646814e-06,
        -1.6744288576823017e-07,
        -9.517657273819165e-08,
    ],
}


def wavelist() -> list[str]:
    """Return a list of available wavelet names."""
    return list(_WAVELET_COEFFS.keys()) + ["haar"]


# diff --git a/peft/src/peft/tuners/waveft/waverec2d.py b/peft/src/peft/tuners/waveft/waverec2d.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..e05f5dc5471d07bb2a7b3c98272d0f30965fb0c3
# --- /dev/null
# +++ b/peft/src/peft/tuners/waveft/waverec2d.py
# @@ -0,0 +1,316 @@
# Copyright 2021 Moritz Wolter
# Copyright 2025-present the HuggingFace Inc. team.
#
# Licensed under the EUPL v1.2
#
# This file contains code derived from PyTorch-Wavelet-Toolbox:
# https://github.com/v0lta/PyTorch-Wavelet-Toolbox
#
# Original work by Moritz Wolter, licensed under EUPL v1.2
# Modifications and integration by HuggingFace Inc.
# team

from collections.abc import Callable, Sequence
from functools import partial
from typing import Any, NamedTuple, Protocol, Union, cast, overload

import numpy as np
import torch
from typing_extensions import TypeAlias, Unpack

from .wavelet import Wavelet as minimal_wavelet


class WaveletDetailTuple2d(NamedTuple):
    """The three detail-coefficient tensors of one 2d decomposition level."""

    horizontal: torch.Tensor
    vertical: torch.Tensor
    diagonal: torch.Tensor


# Coefficient containers: an approximation tensor followed by per-level detail entries.
WaveletCoeff2d: TypeAlias = tuple[torch.Tensor, Unpack[tuple[WaveletDetailTuple2d, ...]]]
WaveletDetailDict: TypeAlias = dict[str, torch.Tensor]
WaveletCoeffNd: TypeAlias = tuple[torch.Tensor, Unpack[tuple[WaveletDetailDict, ...]]]


class Wavelet(Protocol):
    """Structural type of a wavelet object: a name plus its four filters and their lengths."""

    name: str
    dec_lo: Sequence[float]
    dec_hi: Sequence[float]
    rec_lo: Sequence[float]
    rec_hi: Sequence[float]
    dec_len: int
    rec_len: int
    filter_bank: tuple[Sequence[float], Sequence[float], Sequence[float], Sequence[float]]

    def __len__(self) -> int:
        return len(self.dec_lo)


class WaveletTensorTuple(NamedTuple):
    """The four filters of a wavelet, each held as a tensor."""

    dec_lo: torch.Tensor
    dec_hi: torch.Tensor
    rec_lo: torch.Tensor
    rec_hi: torch.Tensor

    @classmethod
    def from_wavelet(cls, wavelet: Wavelet, dtype: torch.dtype) -> "WaveletTensorTuple":
        """Convert the filters of ``wavelet`` to tensors of the given ``dtype``."""
        filters = (wavelet.dec_lo, wavelet.dec_hi, wavelet.rec_lo, wavelet.rec_hi)
        return cls(*(torch.tensor(taps, dtype=dtype) for taps in filters))


def _as_wavelet(wavelet: Union[Wavelet, str]) -> Wavelet:
    """Return ``wavelet`` unchanged, or build the minimal wavelet object when a name is given."""
    return minimal_wavelet(wavelet) if isinstance(wavelet, str) else wavelet


def _is_dtype_supported(dtype: torch.dtype) -> bool:
    """Tell whether ``dtype`` is one of the floating point types accepted by the transforms."""
    supported = (torch.float16, torch.bfloat16, torch.float32, torch.float64)
    return dtype in supported


def _outer(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """Outer product of the flattened inputs; the result has shape ``(a.numel(), b.numel())``."""
    column = torch.reshape(a, [-1]).unsqueeze(-1)
    row = torch.reshape(b, [-1]).unsqueeze(0)
    return column * row


def _check_if_tensor(array: Any) -> torch.Tensor:
    """Return ``array`` if it is a tensor, otherwise raise ``ValueError``."""
    if isinstance(array, torch.Tensor):
        return array
    raise ValueError("First element of coeffs must be the approximation coefficient tensor.")


def _check_axes_argument(axes: Sequence[int]) -> None:
    """Raise ``ValueError`` when ``axes`` contains a duplicate entry."""
    has_duplicates = len(axes) != len(set(axes))
    if has_duplicates:
        raise ValueError("Cant transform the same axis twice.")


def _check_same_device(tensor: torch.Tensor, torch_device: torch.device) -> torch.Tensor:
    """Return ``tensor`` if it lives on ``torch_device``, otherwise raise ``ValueError``."""
    if tensor.device == torch_device:
        return tensor
    raise ValueError("coefficients must be on the same device")


def _check_same_dtype(tensor: torch.Tensor, torch_dtype: torch.dtype) -> torch.Tensor:
    """Return ``tensor`` if its dtype is ``torch_dtype``, otherwise raise ``ValueError``."""
    if tensor.dtype == torch_dtype:
        return tensor
    raise ValueError("coefficients must have the same dtype")


@overload
def _coeff_tree_map(
    coeffs: list[torch.Tensor], function: Callable[[torch.Tensor], torch.Tensor]
) -> list[torch.Tensor]: ...
@overload
def _coeff_tree_map(coeffs: WaveletCoeff2d, function: Callable[[torch.Tensor], torch.Tensor]) -> WaveletCoeff2d: ...
@overload
def _coeff_tree_map(coeffs: WaveletCoeffNd, function: Callable[[torch.Tensor], torch.Tensor]) -> WaveletCoeffNd: ...
+def _coeff_tree_map(coeffs, function): + approx = function(coeffs[0]) + result_lst: list[Any] = [] + for element in coeffs[1:]: + if isinstance(element, tuple): + result_lst.append(WaveletDetailTuple2d(function(element[0]), function(element[1]), function(element[2]))) + elif isinstance(element, dict): + new_dict = {key: function(value) for key, value in element.items()} + result_lst.append(new_dict) + elif isinstance(element, torch.Tensor): + result_lst.append(function(element)) + else: + raise ValueError(f"Unexpected input type {type(element)}") + if not result_lst: + return [approx] if isinstance(coeffs, list) else (approx,) + elif isinstance(result_lst[0], torch.Tensor): + return [approx] + cast(list[torch.Tensor], result_lst) + else: + cast_result_lst = cast(Union[list[WaveletDetailDict], list[WaveletDetailTuple2d]], result_lst) + return (approx, *cast_result_lst) + + +def _check_same_device_dtype( + coeffs: Union[list[torch.Tensor], WaveletCoeff2d, WaveletCoeffNd], +) -> tuple[torch.device, torch.dtype]: + c = _check_if_tensor(coeffs[0]) + torch_device, torch_dtype = c.device, c.dtype + _coeff_tree_map(coeffs, partial(_check_same_device, torch_device=torch_device)) + _coeff_tree_map(coeffs, partial(_check_same_dtype, torch_dtype=torch_dtype)) + return torch_device, torch_dtype + + +def _get_transpose_order(axes: Sequence[int], data_shape: Sequence[int]) -> tuple[list[int], list[int]]: + axes = [a + len(data_shape) if a < 0 else a for a in axes] + all_axes = list(range(len(data_shape))) + remove_transformed = list(filter(lambda a: a not in axes, all_axes)) + return remove_transformed, axes + + +def _swap_axes(data: torch.Tensor, axes: Sequence[int]) -> torch.Tensor: + _check_axes_argument(axes) + front, back = _get_transpose_order(axes, list(data.shape)) + return torch.permute(data, front + back) + + +def _undo_swap_axes(data: torch.Tensor, axes: Sequence[int]) -> torch.Tensor: + _check_axes_argument(axes) + front, back = _get_transpose_order(axes, 
def _fold_axes(data: torch.Tensor, keep_no: int) -> tuple[torch.Tensor, list[int]]:
    """Merge all leading axes of ``data`` into one, keeping the last ``keep_no`` axes.

    Returns:
        A pair ``(folded, original_shape)``: the reshaped tensor, whose first axis has
        the product of the merged axis sizes, and the shape of ``data`` before folding
        as a list.
    """
    original_shape = list(data.shape)
    leading_sizes = original_shape[:-keep_no]
    trailing_sizes = original_shape[-keep_no:]
    folded_size = int(np.prod(leading_sizes))
    folded = torch.reshape(data, [folded_size, *trailing_sizes])
    return folded, original_shape
def _adjust_padding_at_reconstruction(tensor_len: int, coeff_len: int, padr: int, padl: int) -> tuple[int, int]:
    """Correct the right-hand padding for a reconstruction step.

    Compares twice the coefficient length with the tensor length. A difference of zero
    leaves the padding untouched, a difference of one adds one to ``padr``.

    Returns:
        The pair ``(padr, padl)`` after the adjustment.

    Raises:
        ValueError: If the difference is neither 0 nor 1.
    """
    mismatch = 2 * coeff_len - tensor_len
    if mismatch == 0:
        return padr, padl
    if mismatch == 1:
        return padr + 1, padl
    raise ValueError("incorrect padding")
_get_filter_tensors(wavelet, flip=False, device=torch_device, dtype=torch_dtype) + filt_len = rec_lo.shape[-1] + rec_filt = _construct_2d_filt(lo=rec_lo, hi=rec_hi) + + res_ll = coeffs[0] + for c_pos, coeff_tuple in enumerate(coeffs[1:]): + if not isinstance(coeff_tuple, tuple) or len(coeff_tuple) != 3: + raise ValueError(f"Unexpected detail coefficient type: {type(coeff_tuple)}. Must be a 3-tuple.") + + curr_shape = res_ll.shape + for coeff in coeff_tuple: + if coeff.shape != curr_shape: + raise ValueError("All coefficients on each level must have the same shape") + + res_lh, res_hl, res_hh = coeff_tuple + res_ll = torch.stack([res_ll, res_lh, res_hl, res_hh], 1) + res_ll = torch.nn.functional.conv_transpose2d(res_ll, rec_filt, stride=2).squeeze(1) + + padl = (2 * filt_len - 3) // 2 + padr = (2 * filt_len - 3) // 2 + padt = (2 * filt_len - 3) // 2 + padb = (2 * filt_len - 3) // 2 + if c_pos < len(coeffs) - 2: + padr, padl = _adjust_padding_at_reconstruction( + res_ll.shape[-1], coeffs[c_pos + 2][0].shape[-1], padr, padl + ) + padb, padt = _adjust_padding_at_reconstruction( + res_ll.shape[-2], coeffs[c_pos + 2][0].shape[-2], padb, padt + ) + + if padt > 0: + res_ll = res_ll[..., padt:, :] + if padb > 0: + res_ll = res_ll[..., :-padb, :] + if padl > 0: + res_ll = res_ll[..., padl:] + if padr > 0: + res_ll = res_ll[..., :-padr] + + res_ll = _postprocess_tensor(res_ll, ndim=2, ds=ds, axes=axes) + return res_ll diff --git a/peft/src/peft/tuners/xlora/__init__.py b/peft/src/peft/tuners/xlora/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6eae1f779b81e883f1dd64e3a4fca859391836c5 --- /dev/null +++ b/peft/src/peft/tuners/xlora/__init__.py @@ -0,0 +1,23 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from peft.utils import register_peft_method + +from .config import XLoraConfig +from .model import XLoraModel + + +__all__ = ["XLoraConfig", "XLoraModel"] + +register_peft_method(name="xlora", config_cls=XLoraConfig, model_cls=XLoraModel) diff --git a/peft/src/peft/tuners/xlora/classifier.py b/peft/src/peft/tuners/xlora/classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..1ccf9edf9db3178ba9dc4dd3e07989251a676cb8 --- /dev/null +++ b/peft/src/peft/tuners/xlora/classifier.py @@ -0,0 +1,195 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class XLoraClassifier(nn.Module):
    """
    A classifier to select LoRA layers for XLora.

    The classifier is a stack of `nn.Linear` layers (with ReLU and optional dropout between them) that maps the last
    hidden state of the model to one logit per LoRA adapter, or one logit per (adapter layer, adapter) pair when
    `config.layerwise_scalings` is set.
    """

    def __init__(
        self,
        model: nn.Module,  # PeftModel
        config: XLoraConfig,
        n_classes: int,
        n_layers: int,
        device: torch.device,
    ):
        """
        Construct an X-LoRA classifier from a model, config and some metadata. Note that n_layers is the number of LoRA
        adapter layers, not the number of model layers.

        Args:
            model: Module whose first parameter determines the dtype of the classifier.
            config: X-LoRA configuration; `xlora_depth`, `xlora_size`, `xlora_dropout_p`, `hidden_size`,
                `layerwise_scalings`, `softmax_temperature` and `scaling_pass_value` are read here.
            n_classes: Number of scalings predicted per position (one per adapter).
            n_layers: Number of LoRA adapter layers.
            device: Device the linear layers are moved to.

        Raises:
            ValueError: If `config.xlora_depth` is not strictly positive.
        """
        super().__init__()

        self.n_classes = n_classes
        self.n_layers = n_layers
        self.config = config
        self.log_scalings = []
        self.softmax = TemperatureScaledSoftmax(temperature=self.config.softmax_temperature)
        self.override_scaling_pass_value: Number = config.scaling_pass_value

        self.scalings_logging = False

        self.dtype = next(model.parameters()).dtype

        if config.xlora_depth <= 0:
            raise ValueError("X-LoRA depth must be strictly positive.")

        def make_linear(in_features: int, out_features: int) -> nn.Linear:
            # Every linear layer of the classifier uses a bias and lives on `device` with the model's dtype.
            return nn.Linear(in_features, out_features, bias=True).to(device).to(self.dtype)

        add_dropout = config.xlora_dropout_p > 0.0
        # With layer-wise scalings the head predicts a separate set of scalings for every adapter layer.
        out_features = n_classes * n_layers if config.layerwise_scalings else n_classes

        layers = []
        in_features = config.hidden_size
        # A depth of 1 means "output layer only"; each extra unit of depth adds one hidden block in front of it.
        for _ in range(config.xlora_depth - 1):
            layers.append(make_linear(in_features, config.xlora_size))
            layers.append(nn.ReLU())
            if add_dropout:
                layers.append(nn.Dropout(p=config.xlora_dropout_p))
            in_features = config.xlora_size

        last = make_linear(in_features, out_features)
        self.layers = nn.Sequential(*layers, last)

    @staticmethod
    def _reference_input(
        input_ids: Optional[torch.LongTensor], inputs_embeds: Optional[torch.FloatTensor]
    ) -> torch.Tensor:
        """
        Return the tensor that defines batch size, sequence length and device: `input_ids` if given, otherwise
        `inputs_embeds`.

        Raises:
            ValueError: If both arguments are `None`.
        """
        if input_ids is not None:
            return input_ids
        if inputs_embeds is not None:
            return inputs_embeds
        raise ValueError("Either `input_ids` or `inputs_embeds` must be provided.")

    def make_dummy_scalings(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        *args,
        **kwargs,
    ) -> torch.Tensor:
        """
        Make some dummy scalings for the scalings pass (the one to get the logits for the X-LoRA classifier). These are
        of shape (batch_size, seq_len, n_layers, n_classes) and filled with the override scalings pass value. Note that
        n_layers is the number of LoRA adapter layers, not the number of model layers.

        Raises:
            ValueError: If neither `input_ids` nor `inputs_embeds` is provided.
        """
        reference = self._reference_input(input_ids, inputs_embeds)
        batch_size, seq_len = reference.shape[0], reference.shape[1]

        return torch.full(  # type: ignore
            (batch_size, seq_len, self.n_layers, self.n_classes),
            self.override_scaling_pass_value,
        ).to(device=reference.device, dtype=self.dtype)

    def forward(
        self,
        result,
        input_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        *args,
        **kwargs,
    ) -> torch.Tensor:
        """
        Using the hidden states of the model, predict `n_classes` LoRA alpha values. Returns the scalings.

        Args:
            result: Object with a `hidden_states` attribute; only its last entry is used.
            input_ids: Used for batch size and sequence length when given.
            inputs_embeds: Used for batch size and sequence length when `input_ids` is `None`.

        Returns:
            Scalings of shape (batch_size, seq_len, n_layers, n_classes). They are also appended to `log_scalings`
            when `scalings_logging` is enabled.

        Raises:
            ValueError: If neither `input_ids` nor `inputs_embeds` is provided.
        """
        reference = self._reference_input(input_ids, inputs_embeds)
        batch_size, seq_len = reference.shape[0], reference.shape[1]

        hidden_states = result.hidden_states  # type: ignore

        hidden_state = hidden_states[-1]  # Get the last hidden state

        ### Classifier run
        # hidden_state=[batch_size, seq_len, hidden_size]
        # Call the module itself rather than `.forward` so that module hooks are honoured.
        logits = self.layers(hidden_state)

        ### Repeat to make layerwise scalings
        ### If layerwise_scalings=False, then the classifier only outputs logits which are not layer-wise.
        ### So, we expand them to the correct shape.
        if not self.config.layerwise_scalings:
            logits = logits.unsqueeze(2)
            logits = logits.expand(-1, -1, self.n_layers, -1)

        scalings = logits.reshape(batch_size, seq_len, self.n_layers, self.n_classes)
        # scalings = [batch_size, seq_len, n_layers, n_classes]

        if self.config.enable_softmax:
            scalings = self.softmax(scalings)

        if self.scalings_logging:
            self.log_scalings.append(scalings)

        return scalings

    def _get_bucketed_scalings(self) -> dict[int, tuple[list[int], list[torch.Tensor]]]:
        """
        Returns bucketed scalings, bucketed by seq_len. Each value consists of the positions (the first) and the
        associated tensors. The positions are paired with the associated tensors and give the position in the scaling
        log. Each scaling is a tensor of shape (batch_size, seq_len, n_layers, n_classes)).
        """
        seqlens_map: dict[int, tuple[list[int], list[torch.Tensor]]] = {}
        for i, scaling in enumerate(self.log_scalings):
            positions, tensors = seqlens_map.setdefault(scaling.shape[1], ([], []))
            positions.append(i)
            tensors.append(scaling)

        return seqlens_map

    def _set_override_scaling_pass_value(self, value: Union[Number, None]):
        """
        Set the fill value used by `make_dummy_scalings` and mirror it into `config.scaling_pass_value`. Passing `None`
        selects `1 / n_classes`.
        """
        if value is None:
            self.override_scaling_pass_value = 1 / self.n_classes
        else:
            self.override_scaling_pass_value = value
        self.config.scaling_pass_value = self.override_scaling_pass_value
@dataclass
class XLoraConfig(PeftConfig):
    r"""
    This is the configuration class to store the configuration of a `XLoraModel`. When the config is reloaded, the
    paths of the `adapters` field are disregarded in favor of the saved adapters. As such, only the keys matter during
    loading.

    Invalid or missing values are reported through warnings in `__post_init__`; no exception is raised there.

    Args:
        hidden_size (`int`):
            Hidden size of the base model. If not provided, a warning is emitted and 4096 is used.
        adapters (`dict`):
            Mapping of adapter names to the LoRA adapter id, as per PeftModel.load_adapter. *They will be automatically
            loaded*, to use as LoRA experts. When using from_pretrained, pass the new adapters dict as a keyword
            argument. If not provided, a warning is emitted and an empty dict is used.
        enable_softmax (`bool`, *optional*, defaults to `True`):
            Enable softmax application for the X-LoRA classifier.
        enable_softmax_topk (`bool`, *optional*, defaults to `False`):
            Enable softmax application for the top-k LoRA adapters. Mutually exclusive to `enable_softmax` and must
            only be set if `top_k_lora` is.
        softmax_temperature (`float`, *optional*, defaults to 1.0):
            Softmax temperature, lower yields sharper predictions
        layerwise_scalings (`bool`, *optional*, defaults to `False`):
            If True, generate scalings for each LoRA adapter (each layer). If this is False, then scalings will be
            broadcasted, the same, to each layer.
        top_k_lora (`int`, *optional*, defaults to None):
            Sparsely select the top_k LoRA experts instead of the default dense method.
        xlora_depth (`int`, *optional*, defaults to 1):
            Depth of the X-LoRA classifier.
        xlora_size (`int`, *optional*, defaults to 2048):
            Hidden size of the X-LoRA classifier, irrelevant if `xlora_depth=1`.
        xlora_dropout_p (`float`, *optional*, defaults to 0.2):
            Dropout probability of the X-LoRA classifier, irrelevant if `xlora_depth=1`.
        use_trainable_adapters (`bool`, *optional*, defaults to False):
            Make the adapters trainable.
        scaling_pass_value (`float`, *optional*, defaults to 0):
            Scaling pass value.
        global_scaling_weight (`float`, *optional*, defaults to 1):
            Weight to multiply output of each LoRA adapter by.
    """

    hidden_size: int = None  # type: ignore
    adapters: dict[str, str] = None  # type: ignore
    enable_softmax: bool = True
    enable_softmax_topk: bool = False
    layerwise_scalings: bool = False
    xlora_depth: int = 1
    xlora_size: int = 2048
    xlora_dropout_p: float = 0.2
    use_trainable_adapters: bool = False
    softmax_temperature: float = 1.0
    top_k_lora: Optional[int] = None
    scaling_pass_value: float = 0.0
    global_scaling_weight: float = 1.0

    def __post_init__(self):
        """Set the PEFT type, fill in defaults for missing required fields and warn about questionable settings."""
        super().__post_init__()
        self.peft_type = PeftType.XLORA

        if self.hidden_size is None:
            warnings.warn(
                "No value was provided for `hidden_size`. This will be set to 4096 by default, please ensure that this is correct."
            )
            self.hidden_size = 4096
        if self.adapters is None:
            warnings.warn(
                "No value was provided for `adapters`. This will be set to empty, please ensure that this is correct."
            )
            self.adapters = {}

        # The remaining checks only warn; the configuration is still accepted as given.
        if self.enable_softmax_topk and self.top_k_lora is None:
            warnings.warn("`enable_softmax_topk` is enabled but `top_k_lora` is not set")

        if self.enable_softmax_topk and self.enable_softmax:
            warnings.warn(
                "`enable_softmax_topk` and `enable_softmax` are both enabled. This will result in worse performance."
            )

        if self.top_k_lora is not None and self.top_k_lora < 1:
            warnings.warn("`top_k_lora` value must be at least 1.")
class XLoraLayer:
    """
    A XLoraLayer wraps any LoraLayer and performs the XLora operation on the LoRA adaptors specified. Its primary API
    is the forward method, which uses the scalings to execute the XLora algorithm.
    """

    def __init__(
        self,
        model: nn.Module,  # XLoraModel
        target: lora.LoraLayer,
        target_forward: Callable[..., Any],
        layer_number: int,
        config: XLoraConfig,
    ) -> None:
        """
        Store the references needed by the forward replacement.

        Args:
            model: The owning X-LoRA model.
            target: The wrapped LoRA layer.
            target_forward: The original `forward` of `target`.
            layer_number: Index used to select this layer's slice of the scalings.
            config: X-LoRA configuration; `top_k_lora` and `enable_softmax_topk` are read by
                `get_maybe_topk_scalings`.
        """
        self.model = model
        self.target_forward = target_forward
        self.target = target
        self.layer_number = layer_number
        self.config = config

    @staticmethod
    def apply_scalings_to_x(x: torch.Tensor, scalings_layer: torch.Tensor, adapter: int) -> torch.Tensor:
        """
        Apply the scalings for the adapter.

        Selects the entry for `adapter` along the last axis of `scalings_layer`, adds a trailing axis of size one and
        multiplies `x` by it.
        """
        # scalings_layer = [batch_size, seq_len, n_classes]
        scalings = scalings_layer[:, :, adapter].unsqueeze(-1)
        # scalings = [batch_size, seq_len, 1]
        return x * scalings

    def get_maybe_topk_scalings(self, scalings) -> torch.Tensor:
        """
        Get the scalings for this layer, potentially applying topk and topk+softmax. This is called before
        `apply_scalings_to_x`.

        When `config.top_k_lora` is set, every entry outside the top-k along the last axis is zeroed. When
        `config.enable_softmax_topk` is set, a softmax is then taken over the non-zero entries only; entries that are
        exactly zero stay zero.
        """
        # xlora_scalings = [batch_size, seq_len, n_classes]
        xlora_scalings: Tensor = scalings[:, :, self.layer_number, :]  # type: ignore

        if self.config.top_k_lora is not None:
            _, topk_indices = torch.topk(xlora_scalings, k=self.config.top_k_lora, dim=-1)

            # Mask the topk to True, the rest to False
            mask = torch.zeros_like(xlora_scalings, dtype=torch.bool)
            mask.scatter_(-1, topk_indices, True)

            xlora_scalings = xlora_scalings * mask.to(xlora_scalings.dtype)

        # Apply per-token normalization to the xLoRA scaling factors using a softmax
        if self.config.enable_softmax_topk:
            nonzero_mask = xlora_scalings != 0
            # Zero entries are excluded from the softmax by setting them to -inf first ...
            full = xlora_scalings.masked_fill(~nonzero_mask, float("-inf"))
            new_scalings = torch.softmax(full, dim=-1)
            # ... and are written back as zero afterwards.
            xlora_scalings = new_scalings.masked_fill(~nonzero_mask, 0.0)

        return xlora_scalings
class XLoraEmbeddingLayer(XLoraLayer):
    """X-LoRA forward replacement for LoRA embedding layers."""

    def __init__(
        self,
        model: nn.Module,
        target: lora.Embedding,
        target_forward: Callable[..., Any],
        layer_number: int,
        config: XLoraConfig,
    ) -> None:
        super().__init__(model, target, target_forward, layer_number, config)

    def forward(self, x: Tensor, *args: Any, scalings: Optional[Tensor] = None, **kwargs: Any) -> Tensor:
        """
        This method is designed to be a drop-in-replacement for the LoRA layers' .forward method. To use it, a bound
        method must be created (bound to an instance of the XLoraLayer class).

        The base layer output is computed first; unless the target is merged, the contribution of every active adapter
        that has embedding weights is added to it in place. When `scalings` is given, each adapter's intermediate
        result is multiplied by its scaling and the contribution by `config.global_scaling_weight`.
        """
        use_xlora = scalings is not None
        if use_xlora:
            xlora_scalings = self.get_maybe_topk_scalings(scalings)

        target = self.target
        result = target.base_layer(x, *args, **kwargs)

        # Ignore if disabled. We want to make sure this is always run.
        if target.merged:
            return result

        for adapter_n, active_adapter in enumerate(target.active_adapters):
            if active_adapter not in target.lora_embedding_A:
                continue
            # TODO: implement X-LoRA with Lora+Dora layers
            if target.use_dora.get(active_adapter, False):
                raise ValueError("X-LoRA currently does not support LoRA layers with DoRA")

            embedding_A = target.lora_embedding_A[active_adapter].T
            embedding_B = target.lora_embedding_B[active_adapter].T
            scaling = target.scaling[active_adapter]

            after_A = target._embed(x, embedding_A)  # type: ignore
            scaling_weight = 1
            if use_xlora:
                after_A = self.apply_scalings_to_x(after_A, xlora_scalings, adapter_n)
                scaling_weight = self.config.global_scaling_weight

            result += (after_A @ embedding_B) * scaling * scaling_weight

        return result
+ if not self.target.merged: + for adapter_n, active_adapter in enumerate(self.target.active_adapters): + if active_adapter not in self.target.lora_A.keys(): + continue + # TODO: implement X-LoRA with Lora+Dora layers + if self.target.use_dora[active_adapter]: + raise ValueError("X-LoRA currently does not support LoRA layers with DoRA") + lora_A = self.target.lora_A[active_adapter] + lora_B = self.target.lora_B[active_adapter] + dropout = self.target.lora_dropout[active_adapter] + scaling = self.target.scaling[active_adapter] + x = x.to(lora_A.weight.dtype) # type: ignore + if scalings is not None: + x_mod = self.apply_scalings_to_x(x, xlora_scalings, adapter_n) + scaling_weight = self.config.global_scaling_weight + else: + x_mod = x + scaling_weight = 1 + result += lora_B(lora_A(dropout(x_mod))) * scaling * scaling_weight + + result = result.to(previous_dtype) + return result diff --git a/peft/src/peft/tuners/xlora/model.py b/peft/src/peft/tuners/xlora/model.py new file mode 100644 index 0000000000000000000000000000000000000000..25e0902bfd0ea2580ee790116f2e5ffb1c1abd32 --- /dev/null +++ b/peft/src/peft/tuners/xlora/model.py @@ -0,0 +1,524 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import copy +from contextlib import contextmanager +from functools import partial +from typing import Optional, Union + +import torch +import torch.nn as nn + +from peft.tuners.lora.layer import LoraLayer +from peft.tuners.lora.model import LoraModel +from peft.tuners.tuners_utils import BaseTuner +from peft.utils.constants import DUMMY_TARGET_MODULES +from peft.utils.save_and_load import set_peft_model_state_dict + +from .. import lora +from .classifier import XLoraClassifier +from .config import XLoraConfig +from .layer import XLoraConv2dLayer, XLoraEmbeddingLayer, XLoraLinearLayer + + +def convert_layers_to_xlora( + base: nn.Module, # PeftModel + xloramodel: nn.Module, # XLoraModel + config: XLoraConfig, +) -> tuple[int, torch.device | None]: + """ + Returns the number of swapped layers. + """ + total_swapped = 0 + all_layers = [] + + device = None + for module in base.modules(): + # Check the exact type because classes like OPTLearnedPositionalEmbedding inherit from nn.Embedding + if isinstance(module, lora.Linear): + device = module.lora_A[next(iter(module.lora_A))].weight.device + new_layer = XLoraLinearLayer( + model=xloramodel, + target=module, + target_forward=module.forward, + layer_number=total_swapped, + config=config, + ) + all_layers.append(new_layer) + module.forward = new_layer.forward # type: ignore[method-assign] + total_swapped += 1 + elif isinstance(module, lora.Embedding): + device = module.lora_embedding_A[next(iter(module.lora_embedding_A))].device + new_layer = XLoraEmbeddingLayer( + model=xloramodel, + target=module, + target_forward=module.forward, + layer_number=total_swapped, + config=config, + ) + all_layers.append(new_layer) + module.forward = new_layer.forward # type: ignore[method-assign] + total_swapped += 1 + elif isinstance(module, lora.Conv2d): + device = module.lora_A[next(iter(module.lora_A))].weight.device + new_layer = XLoraConv2dLayer( + model=xloramodel, + target=module, + 
def _load_adapter_into_lora_model(
    lora_model: LoraModel,
    adapter_name: str,
    model_id: str,
    torch_device: Optional[str] = None,
    ephemeral_gpu_offload: bool = False,
    autocast_adapter_dtype: bool = True,
    subfolder: Optional[str] = None,
    **kwargs,
):
    """
    This method emulates the behavior of `PeftModel.from_pretrained`. Updates to `PeftModel.from_pretrained` may need
    to be reflected here.

    Args:
        lora_model: The LoRA model the adapter is injected into.
        adapter_name: Name under which the adapter is registered in `lora_model.peft_config`.
        model_id: Identifier passed to `LoraConfig.from_pretrained` and `load_peft_weights`.
        torch_device: Device to load the weights on; inferred with `infer_device()` when `None`.
        ephemeral_gpu_offload: Forwarded to `LoraConfig.from_pretrained`.
        autocast_adapter_dtype: Forwarded to `lora_model._cast_adapter_dtype` when that method exists.
        subfolder: Forwarded to `LoraConfig.from_pretrained` and `load_peft_weights`.
        **kwargs: Split by `PeftModel._split_kwargs`; the hub-download part is forwarded to the loaders and
            `ignore_mismatched_sizes` is read from the remainder.

    Raises:
        ValueError: If a weight key contains no `model.` segment to anchor on, or if loading reports unexpected keys.
    """
    from peft.peft_model import PeftModel
    from peft.tuners.lora.config import LoraConfig
    from peft.utils.other import infer_device
    from peft.utils.save_and_load import load_peft_weights

    hf_hub_download_kwargs, kwargs = PeftModel._split_kwargs(kwargs)
    if torch_device is None:
        torch_device = infer_device()

    if adapter_name not in lora_model.peft_config:
        # load the config
        lora_peft_config = LoraConfig.from_pretrained(
            model_id,
            ephemeral_gpu_offload=ephemeral_gpu_offload,
            subfolder=subfolder,
            **hf_hub_download_kwargs,
        )
        lora_peft_config.inference_mode = False
        lora_model.peft_config[adapter_name] = lora_peft_config
        lora_model.inject_adapter(lora_model.model, adapter_name)

    adapter_weights = load_peft_weights(model_id, device=torch_device, subfolder=subfolder, **hf_hub_download_kwargs)
    new_adapter_weights = {}
    # Rework the keys so that each one starts with exactly "model.model."
    for old_key, weight in adapter_weights.items():
        key: str = old_key
        # Remove all the prefixes until we have model.<...>
        while not (key.startswith("model.") and not key.startswith("model.model.")):
            _, separator, remainder = key.partition(".")
            if not separator:
                # No dot left to strip: without this check the loop would never terminate.
                raise ValueError(
                    f"Cannot normalize adapter weight key {old_key!r}: it contains no `model.` segment."
                )
            key = remainder
        # We always want model.model
        new_adapter_weights["model." + key] = weight

    # load the weights into the model
    ignore_mismatched_sizes = kwargs.get("ignore_mismatched_sizes", False)
    load_result = set_peft_model_state_dict(
        lora_model,
        new_adapter_weights,
        adapter_name=adapter_name,
        ignore_mismatched_sizes=ignore_mismatched_sizes,
    )
    if len(load_result.unexpected_keys) > 0:
        raise ValueError(
            f"Got unexpected keys! Please raise an issue and tag @EricLBuehler.\n\nunexpected_keys={load_result.unexpected_keys}"
        )

    if hasattr(lora_model, "_cast_adapter_dtype"):
        lora_model._cast_adapter_dtype(adapter_name=adapter_name, autocast_adapter_dtype=autocast_adapter_dtype)
"mistralai/Mistral-7B-Instruct-v0.1", + ... trust_remote_code=True, + ... attn_implementation="flash_attention_2", + ... device_map="cuda:0", + ... torch_dtype=torch.bfloat16, + ... quantization_config=int8_config, + ... ) + >>> model = prepare_model_for_kbit_training(4) + >>> xlora_model = get_peft_model(model, config) + ``` + """ + + def __init__( + self, + model: nn.Module, + config: Union[dict[str, XLoraConfig], XLoraConfig], + adapter_name: str, + torch_device: Optional[str] = None, + ephemeral_gpu_offload: bool = False, + autocast_adapter_dtype: bool = True, + **kwargs, + ) -> None: + """ + Create a new X-LoRA model + + Args: + model (`nn.Module`): + Base model to apply X-LoRA to. + config: ([`XLoraConfig`]): + X-LoRA configuration object. + adapter_name: (`str`): + Adapter name for the X-LoRA adapter. + torch_device (`str`, *optional*, defaults to None): + (For loading the LoRA adapters) The device to load the adapter on. If `None`, the device will be + inferred. + ephemeral_gpu_offload (`bool`, *optional*, defaults to `False`): + (For loading the LoRA adapters) Whether to use ephemeral GPU offloading for partially loaded modules. + Defaults to `False`. + autocast_adapter_dtype (`bool`, *optional*, defaults to `True`): + (For loading the LoRA adapters) Whether to autocast the adapter dtype. Defaults to `True`. Right now, + this will only cast adapter weights using float16 and bfloat16 to float32, as this is typically + required for stable training, and only affect select PEFT tuners. + kwargs: (`optional`): + (For loading the LoRA adapters) Additional arguments to modify the way the adapter is loaded, e.g. the + token for Hugging Face Hub. 
+ """ + + nn.Module.__init__(self) + + if isinstance(config, dict): + conf = config[adapter_name] + else: + conf = config + + # Create an empty LoraModel + base_lora_config = copy.copy(conf) + base_lora_config.target_modules = DUMMY_TARGET_MODULES + # Imitate a LoraConfig, fields might need to be updated if LoraConfig is updated + base_lora_config.layer_replication = None + base_lora_config.bias = "none" + lora_model = LoraModel(model, base_lora_config, adapter_name) + + self.xlora_config = conf + self.lora_model = lora_model + + peft_config = conf + + if hasattr(model.config, "use_cache") and model.config.use_cache: + raise ValueError("`use_cache` must be False") + + adapters_items = peft_config.adapters.items() + if hasattr(self.xlora_config, "_subfolders"): + adapters_items = zip(peft_config.adapters.items(), self.xlora_config._subfolders) + else: + adapters_items = peft_config.adapters.items() + + if hasattr(self.xlora_config, "_subfolders"): + for i, (_adapter_name, model_id), subfolder in enumerate(adapters_items): + _load_adapter_into_lora_model( + lora_model=self.lora_model, + adapter_name=str(i), + model_id=model_id, + torch_device=torch_device, + ephemeral_gpu_offload=ephemeral_gpu_offload, + autocast_adapter_dtype=autocast_adapter_dtype, + subfolder=subfolder, + **kwargs, + ) + else: + for i, (_adapter_name, model_id) in enumerate(adapters_items): + _load_adapter_into_lora_model( + lora_model=self.lora_model, + adapter_name=str(i), + model_id=model_id, + torch_device=torch_device, + ephemeral_gpu_offload=ephemeral_gpu_offload, + autocast_adapter_dtype=autocast_adapter_dtype, + subfolder=None, + **kwargs, + ) + + self.lora_model.set_adapter(list(peft_config.adapters.keys())) + + self._maybe_freeze_all_adapters() + + total_swapped, device = convert_layers_to_xlora( + model, + self, + peft_config, + ) + + n_classes = len(peft_config.adapters) + xlora_classifier = XLoraClassifier(model, peft_config, n_classes, total_swapped, device) + + # Setup the model 
internal state + self.internal_xlora_classifier = xlora_classifier + self.internal_xlora_scalings = None # type: ignore + # Controlled by enable_adapter_layers or disable_adapter_layers + self.disabled = False + + def _maybe_freeze_all_adapters(self): + self.eval() + if not self.xlora_config.use_trainable_adapters: + for name, param in self.named_parameters(): + if "lora_" in name: + param.requires_grad = False + + def generate(self, *args, **kwargs): + kwargs["use_cache"] = False + res = self.lora_model.generate(*args, **kwargs) # type: ignore + # This is necessary because we use PeftModel.disable_adapter() which reenables the adapters + self._maybe_freeze_all_adapters() + return res + + @contextmanager + def _enable_peft_forward_hooks(self, *generate_args, **generate_kwargs): + def scalings_injection_hook(target, args, kwargs, scalings): + # pre-forward hook to inject the adapter_names argument when using mixed adapter batches inference + kwargs["scalings"] = scalings + return args, kwargs + + hook_handles = [] + + def _pre_forward(module, *args, **kwargs): + # =========================== Forward pass with "dummy" scalings ================== + nonlocal hook_handles + + args_real = args[0] + kwargs_real = args[1] + kwargs_real.update(kwargs) + + dummy_scalings = self.internal_xlora_classifier.make_dummy_scalings(*args_real, **kwargs_real) + + for module in self.modules(): + if isinstance(module, LoraLayer): + pre_forward = partial(scalings_injection_hook, scalings=dummy_scalings) + existing_hooks = getattr(module, "_forward_pre_hooks", {}) + if any(val is scalings_injection_hook for val in existing_hooks.values()): + # When calling generate, module.forward is called multiple times inside the forward hook + # context, resulting in multiple hooks being registered. Therefore, we check if the hooks is + # already present and skip it in that case. 
+ continue + handle = module.register_forward_pre_hook(pre_forward, with_kwargs=True) + hook_handles.append(handle) + + with torch.no_grad(): + self.lora_model.disable_adapter_layers() + + try: + scaling_pass_kwargs = kwargs_real.copy() + scaling_pass_kwargs["output_hidden_states"] = True + scaling_pass_kwargs["return_dict"] = True + try: + base_output = self.lora_model.model.forward(*args_real, **scaling_pass_kwargs) + finally: + # Clean everything up + for handle in hook_handles: + handle.remove() + finally: + self.lora_model.enable_adapter_layers() + + xlora_scalings = self.internal_xlora_classifier(result=base_output, *args_real, **kwargs_real) + # Store computed scalings to fix get_latest_scalings() returning None + self.internal_xlora_scalings = xlora_scalings + + # =========================== Real forward pass with calculated scalings ================== + + hook_handles = [] + for module in self.modules(): + if isinstance(module, LoraLayer): + pre_forward = partial(scalings_injection_hook, scalings=xlora_scalings) + handle = module.register_forward_pre_hook(pre_forward, with_kwargs=True) + hook_handles.append(handle) + + if not self.disabled: + forward_handle = self.lora_model.model.register_forward_pre_hook(_pre_forward, with_kwargs=True) + + # Run the forward pass: first the scaling pass in the hook, and then with the base model + try: + yield + finally: + if not self.disabled: + for handle in hook_handles: + handle.remove() + forward_handle.remove() + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + if name == "lora_model": # see #1892: prevent infinite recursion if class is not initialized + raise + return getattr(self.lora_model, name) + + @staticmethod + def _prepare_adapter_config(peft_config, _model_config): + # Handle X-LoRA case + return peft_config + + """ + Does nothing. X-LoRA needs adapters to be frozen. 
+ """ + + def _mark_only_adapters_as_trainable(self) -> None: ... + + """ + This enables the X-LoRA adapter. + """ + + def enable_adapter_layers(self) -> None: + self.disabled = False + + """ + This diasables the X-LoRA adapter. + """ + + def disable_adapter_layers(self) -> None: + self.disabled = True + + def _create_and_replace( + self, + lora_config, + adapter_name, + target, + target_name, + parent, + current_key, + ): + # Does nothing because XLoraModel has no target modules + pass + + @staticmethod + def _check_target_module_exists(lora_config, key): + # Does nothing because XLoraModel has no target modules + return False + + def forward(self, *args, **kwargs): + return self.lora_model.model(*args, **kwargs) + + def set_topk_lora(self, value: Optional[int]): + """ + Sparsely select the specified top_k LoRA experts instead of the default dense method. Set to None to use dense. + This is reflected in the config. + """ + classifier: XLoraClassifier = self.internal_xlora_classifier # type: ignore + classifier.config.top_k_lora = value + + def set_global_scaling_weight(self, weight: float): + """ + Set the global LoRA weight, a scalar to multiply the output of each LoRA adapter by. This is by default 1. This + is reflected in the config. + """ + classifier: XLoraClassifier = self.internal_xlora_classifier # type: ignore + classifier.config.global_scaling_weight = weight + + def set_scaling_pass_value(self, value: float | None): + """ + Set the scaling pass value, the value to set the scalings to during the scaling pass. If the value is None, the + scaling pass value will be 1/n where n is the number of adapters. + """ + classifier: XLoraClassifier = self.internal_xlora_classifier # type: ignore + classifier._set_override_scaling_pass_value(value) + + def get_global_scaling_weight(self) -> float: + """ + Get the global LoRA weight. 
+ """ + classifier: XLoraClassifier = self.internal_xlora_classifier # type: ignore + return classifier.config.global_scaling_weight + + def get_latest_scalings(self) -> Optional[torch.Tensor]: + """ + Returns the latest scalings prediction, or None if no scalings have been predicted. The tensor is of shape + (batch_size, seq_len, n_layers, n_classes). + """ + return self.internal_xlora_scalings + + def get_scalings_log(self) -> list[torch.Tensor]: + """ + Returns a shallow (only copying the list itself not the tensors) copy of the list containing the scalings log. + Editing the list does not change the underlying log. The tensors are of shape (batch_size, seq_len, n_layers, + n_classes). The seq_len dim may vary with input dimension. + """ + classifier: XLoraClassifier = self.internal_xlora_classifier # type: ignore + return classifier.log_scalings.copy() + + def enable_scalings_logging(self): + """ + Enable scalings logging. + """ + classifier: XLoraClassifier = self.internal_xlora_classifier # type: ignore + classifier.scalings_logging = True + + def disable_scalings_logging(self): + """ + Disable scalings logging, without clearing the log. + """ + classifier: XLoraClassifier = self.internal_xlora_classifier # type: ignore + classifier.scalings_logging = False + + def clear_scalings_log(self): + """ + Clear the scalings log. + """ + classifier: XLoraClassifier = self.internal_xlora_classifier # type: ignore + classifier.log_scalings.clear() + + def get_bucketed_scalings_log(self) -> dict[int, tuple[list[int], list[torch.Tensor]]]: + """ + Returns bucketed scalings, bucketed by seq_len. Each value consists of the positions (the first) and the + associated tensors. The positions are paired with the associated tensors and give the position in the scaling + log. 
+ """ + classifier: XLoraClassifier = self.internal_xlora_classifier # type: ignore + return classifier._get_bucketed_scalings() diff --git a/peft/src/peft/utils/__init__.py b/peft/src/peft/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..781495465ec4dda4aa2d20fe6d12c8af9f9d3198 --- /dev/null +++ b/peft/src/peft/utils/__init__.py @@ -0,0 +1,130 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .integrations import map_cache_to_layer_device_map +from .loftq_utils import replace_lora_weights_loftq +from .other import ( + CONFIG_NAME, + INCLUDE_LINEAR_LAYERS_SHORTHAND, + SAFETENSORS_WEIGHTS_NAME, + TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_BOFT_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_BONE_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_LOHA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_LOKR_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_MISS_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_OFT_TARGET_MODULES_MAPPING, + 
TRANSFORMERS_MODELS_TO_POLY_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, + TRANSFORMERS_MODELS_TO_RANDLORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_ROAD_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_SHIRA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_VBLORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_WAVEFT_TARGET_MODULES_MAPPING, + WEIGHTS_NAME, + AuxiliaryTrainingWrapper, + ModulesToSaveWrapper, + TrainableTokensWrapper, + _freeze_adapter, + _get_batch_size, + _get_input_embeddings_name, + _get_submodules, + _is_valid_match, + _prepare_prompt_learning_config, + _set_adapter, + _set_trainable, + bloom_model_postprocess_past_key_value, + cast_mixed_precision_params, + get_auto_gptq_quant_linear, + get_gptqmodel_quant_linear, + get_quantization_config, + id_tensor_storage, + infer_device, + prepare_model_for_kbit_training, + set_additional_trainable_modules, + shift_tokens_right, + transpose, +) +from .peft_types import PeftType, TaskType, register_peft_method +from .save_and_load import get_peft_model_state_dict, load_peft_weights, set_peft_model_state_dict +from .warning import PeftWarning + + +__all__ = [ + "CONFIG_NAME", + "INCLUDE_LINEAR_LAYERS_SHORTHAND", + "SAFETENSORS_WEIGHTS_NAME", + "TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_BOFT_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_BONE_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_LOHA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_LOKR_TARGET_MODULES_MAPPING", + 
"TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_MISS_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_OFT_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_POLY_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING", + "TRANSFORMERS_MODELS_TO_RANDLORA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_ROAD_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_SHIRA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_VBLORA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_WAVEFT_TARGET_MODULES_MAPPING", + "WEIGHTS_NAME", + "AuxiliaryTrainingWrapper", + "ModulesToSaveWrapper", + "PeftType", + "PeftWarning", + "TaskType", + "TrainableTokensWrapper", + "_freeze_adapter", + "_get_batch_size", + "_get_input_embeddings_name", + "_get_submodules", + "_is_valid_match", + "_prepare_prompt_learning_config", + "_set_adapter", + "_set_trainable", + "bloom_model_postprocess_past_key_value", + "cast_mixed_precision_params", + "get_auto_gptq_quant_linear", + "get_gptqmodel_quant_linear", + "get_peft_model_state_dict", + "get_quantization_config", + "id_tensor_storage", + "infer_device", + "load_peft_weights", + "map_cache_to_layer_device_map", + "prepare_model_for_kbit_training", + "register_peft_method", + "replace_lora_weights_loftq", + "set_additional_trainable_modules", + "set_peft_model_state_dict", + "shift_tokens_right", + "transpose", +] diff --git a/peft/src/peft/utils/constants.py b/peft/src/peft/utils/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..79c7d92b00f66cad731fa853e4abc182829f0f5a --- /dev/null +++ b/peft/src/peft/utils/constants.py @@ -0,0 +1,340 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# needed for prefix-tuning of bloom model
def bloom_model_postprocess_past_key_value(past_key_values):
    """
    Rearrange prefix-tuning past key values into per-layer `(key, value)` pairs for bloom.

    The inputs are concatenated along the first dimension into a 5-dimensional tensor. The first half of that
    dimension is treated as keys, the second half as values, and the second and third dimensions are merged in both.

    Returns:
        `tuple`: One `(key, value)` tuple per entry of the halved first dimension.
    """
    stacked = torch.cat(past_key_values)
    total_layers, batch_size, num_attention_heads, num_virtual_tokens, head_dim = stacked.shape
    half = total_layers // 2
    merged = batch_size * num_attention_heads

    key_part, value_part = stacked[:half], stacked[half:]
    # keys additionally get dims 2 and 3 swapped before being reshaped to end in (head_dim, num_virtual_tokens)
    keys = key_part.transpose(2, 3).reshape(half, merged, head_dim, num_virtual_tokens)
    values = value_part.reshape(half, merged, num_virtual_tokens, head_dim)

    return tuple(zip(keys, values))


# needed for prefix-tuning of StarCoder models
def starcoder_model_postprocess_past_key_value(past_key_values):
    """
    Rearrange prefix-tuning past key values for StarCoder models.

    For every entry, index 0 of the third dimension is selected, the remaining four dimensions are permuted to the
    order `(1, 2, 0, 3)` and the last two dimensions are flattened into one.

    Returns:
        `tuple`: One rearranged tensor per input entry.
    """

    def _rearrange(entry):
        entry = entry[:, :, 0].permute([1, 2, 0, 3])
        return entry.reshape(*entry.shape[:-2], -1)

    return tuple(_rearrange(entry) for entry in past_key_values)
#######################################
# DEFAULT MAPPINGS FOR TARGET_MODULES #
#######################################

# Default target modules per model type for LoRA. The mappings below are derived from this one.
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING = {
    "t5": ["q", "v"],
    "mt5": ["q", "v"],
    "bart": ["q_proj", "v_proj"],
    "gpt2": ["c_attn"],
    "bloom": ["query_key_value"],
    "blip-2": ["q", "v", "q_proj", "v_proj"],
    "opt": ["q_proj", "v_proj"],
    "gptj": ["q_proj", "v_proj"],
    "gpt_neox": ["query_key_value"],
    "gpt_neo": ["q_proj", "v_proj"],
    "bert": ["query", "value"],
    "roberta": ["query", "value"],
    "xlm-roberta": ["query", "value"],
    "electra": ["query", "value"],
    "deberta-v2": ["query_proj", "value_proj"],
    "deberta": ["in_proj"],
    "layoutlm": ["query", "value"],
    "llama": ["q_proj", "v_proj"],
    "llama4": ["q_proj", "v_proj"],
    "chatglm": ["query_key_value"],
    "gpt_bigcode": ["c_attn"],
    "mpt": ["Wqkv"],
    "RefinedWebModel": ["query_key_value"],
    "RefinedWeb": ["query_key_value"],
    "falcon": ["query_key_value"],
    "btlm": ["c_proj", "c_attn"],
    "codegen": ["qkv_proj"],
    "mistral": ["q_proj", "v_proj"],
    "mixtral": ["q_proj", "v_proj"],
    "stablelm": ["q_proj", "v_proj"],
    "phi": ["q_proj", "v_proj", "fc1", "fc2"],
    "gemma": ["q_proj", "v_proj"],
    "gemma2": ["q_proj", "v_proj"],
    "gemma3_text": ["q_proj", "v_proj"],
    "qwen2": ["q_proj", "v_proj"],
    "qwen3": ["q_proj", "v_proj"],
}

# target module mappings that are identical to LORA
# Note: `dict.copy()` is shallow, each mapping is its own dict but the value lists are shared with the LoRA mapping.
# Entries are therefore overridden by assigning a new list, never by mutating a list in place.
TRANSFORMERS_MODELS_TO_BOFT_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()
TRANSFORMERS_MODELS_TO_BONE_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()
TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()
TRANSFORMERS_MODELS_TO_LOHA_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()
TRANSFORMERS_MODELS_TO_LOKR_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()
TRANSFORMERS_MODELS_TO_MISS_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()
TRANSFORMERS_MODELS_TO_OFT_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()
TRANSFORMERS_MODELS_TO_POLY_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()
TRANSFORMERS_MODELS_TO_RANDLORA_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()
TRANSFORMERS_MODELS_TO_ROAD_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()

# mappings that are similar to LORA with small changes
TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()
TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING["gpt_bigcode"] = ["mlp.c_proj"]
TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING["gpt2"] = ["mlp.c_proj"]

TRANSFORMERS_MODELS_TO_SHIRA_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()
TRANSFORMERS_MODELS_TO_SHIRA_TARGET_MODULES_MAPPING["phi"] = ["q_proj", "v_proj"]

TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()
TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING["phi"] = ["q_proj", "v_proj"]

# C3A is defined only here: it differs from LoRA for gpt2 and gpt_bigcode, so it does not belong to the group of
# mappings that are identical to LoRA.
TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.copy()
TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING["gpt_bigcode"] = ["mlp.c_proj"]
TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING["gpt2"] = ["mlp.c_proj"]
that differ from LORA +TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING = { + "llama": ["input_layernorm", "post_attention_layernorm", "norm"], + "bloom": ["input_layernorm", "post_attention_layernorm", "ln_f"], + "llava": [ + "multi_modal_projector", + "input_layernorm", + "post_attention_layernorm", + "norm", + "embed_tokens", + "lm_head", + ], + "t5": ["layer_norm", "final_layer_norm"], + "mt5": ["layer_norm", "final_layer_norm"], + "bart": ["self_attn_layer_norm", "encoder_attn_layer_norm", "final_layer_norm"], + "gpt2": ["ln_1", "ln_2", "ln_f"], + "blip-2": ["layernorm", "LayerNorm", "final_layer_norm", "self_attn_layer_norm"], + "gptj": ["ln_1", "ln_f"], + "falcon": ["input_layernorm", "post_attention_layernorm", "ln_f"], + "mistral": ["input_layernorm", "post_attention_layernorm", "norm"], + "phi": ["input_layernorm", "final_layernorm"], + "gemma": ["input_layernorm", "post_attention_layernorm", "norm"], + "gemma2": [ + "input_layernorm", + "post_attention_layernorm", + "pre_feedforward_layernorm", + "post_feedforward_layernorm", + "norm", + ], + "gemma3_text": [ + "input_layernorm", + "post_attention_layernorm", + "pre_feedforward_layernorm", + "post_feedforward_layernorm", + "norm", + ], + "qwen2": ["post_attention_layernorm"], + "qwen3": ["post_attention_layernorm"], +} + +TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING = { + "t5": ["k", "v", "wo"], + "mt5": ["k", "v", "wi_1"], + "gpt2": ["c_attn", "mlp.c_proj"], + "bloom": ["query_key_value", "mlp.dense_4h_to_h"], + "roberta": ["key", "value", "output.dense"], + "opt": ["q_proj", "k_proj", "fc2"], + "gptj": ["q_proj", "v_proj", "fc_out"], + "gpt_neox": ["query_key_value", "dense_4h_to_h"], + "gpt_neo": ["q_proj", "v_proj", "c_proj"], + "bart": ["q_proj", "v_proj", "fc2"], + "gpt_bigcode": ["c_attn", "mlp.c_proj"], + "llama": ["k_proj", "v_proj", "down_proj"], + "llama4": ["q_proj", "v_proj", "down_proj"], + "mistral": ["k_proj", "v_proj", "down_proj"], + "mixtral": ["k_proj", "v_proj", "w2"], + 
"bert": ["key", "value", "output.dense"], + "deberta-v2": ["key_proj", "value_proj", "output.dense"], + "deberta": ["in_proj", "output.dense"], + "RefinedWebModel": ["query_key_value", "dense_4h_to_h"], + "RefinedWeb": ["query_key_value", "dense_4h_to_h"], + "falcon": ["query_key_value", "dense_4h_to_h"], + "phi": ["q_proj", "v_proj", "fc2"], + "gemma": ["q_proj", "v_proj", "down_proj"], + "gemma2": ["q_proj", "v_proj", "down_proj"], + "gemma3_text": ["q_proj", "v_proj", "down_proj"], + "qwen2": ["q_proj", "v_proj", "down_proj"], + "qwen3": ["q_proj", "v_proj", "down_proj"], +} + +TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING = { + "t5": ["wo"], + "mt5": [], + "gpt2": ["mlp.c_proj"], + "bloom": ["mlp.dense_4h_to_h"], + "roberta": ["output.dense"], + "opt": ["fc2"], + "gptj": ["fc_out"], + "gpt_neox": ["dense_4h_to_h"], + "gpt_neo": ["c_proj"], + "bart": ["fc2"], + "gpt_bigcode": ["mlp.c_proj"], + "llama": ["down_proj"], + "llama4": ["down_proj"], + "mistral": ["down_proj"], + "mixtral": ["w2"], + "bert": ["output.dense"], + "deberta-v2": ["output.dense"], + "deberta": ["output.dense"], + "RefinedWeb": ["dense_4h_to_h"], + "RefinedWebModel": ["dense_4h_to_h"], + "falcon": ["dense_4h_to_h"], + "phi": ["fc2"], + "gemma": ["down_proj"], + "gemma2": ["down_proj"], + "gemma3_text": ["down_proj"], + "qwen2": ["down_proj"], + "qwen3": ["down_proj"], +} + +TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING = { + "t5": ["q", "k", "v", "o", "wi", "wo"], + "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"], + "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], + "gpt2": ["c_attn"], + "bloom": ["query_key_value"], + "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], + "gptj": ["q_proj", "v_proj"], + "gpt_neox": ["query_key_value"], + "gpt_neo": ["q_proj", "v_proj"], + "llama": ["q_proj", "v_proj"], + "llama4": ["q_proj", "v_proj"], + "bert": ["query", "value"], + "roberta": ["query", "key", "value", "dense"], + # "xlm-roberta": ["query", 
"value"], + # "electra": ["query", "value"], + "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"], + "gpt_bigcode": ["c_attn"], + "deberta": ["in_proj"], + # "layoutlm": ["query", "value"], + "gemma": ["q_proj", "v_proj"], + "gemma2": ["q_proj", "v_proj"], + "gemma3_text": ["q_proj", "v_proj"], + "qwen2": ["q_proj", "v_proj"], + "qwen3": ["q_proj", "v_proj"], +} + +TRANSFORMERS_MODELS_TO_VBLORA_TARGET_MODULES_MAPPING = { + "t5": ["q", "k", "v", "o", "wi", "wo"], + "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"], + "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], + "gpt2": ["c_attn"], + "bloom": ["query_key_value"], + "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], + "gptj": ["q_proj", "v_proj"], + "gpt_neox": ["query_key_value"], + "gpt_neo": ["q_proj", "v_proj"], + "llama": ["q_proj", "v_proj"], + "llama4": ["q_proj", "v_proj"], + "bert": ["query", "value"], + "roberta": ["query", "value"], + "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"], + "gpt_bigcode": ["c_attn"], + "deberta": ["in_proj"], + "gemma": ["q_proj", "v_proj"], + "gemma2": ["q_proj", "v_proj"], + "gemma3_text": ["q_proj", "v_proj"], + "qwen2": ["q_proj", "v_proj"], + "qwen3": ["q_proj", "v_proj"], +} + +################## +# MISC CONSTANTS # +################## + +TRANSFORMERS_MODELS_TO_WAVEFT_TARGET_MODULES_MAPPING = { + "t5": ["q", "v"], + "mt5": ["q", "v"], + "bart": ["q_proj", "v_proj"], + "gpt2": ["mlp.c_proj"], + "bloom": ["query_key_value"], + "blip-2": ["q", "v", "q_proj", "v_proj"], + "opt": ["q_proj", "v_proj"], + "gptj": ["q_proj", "v_proj"], + "gpt_neox": ["query_key_value"], + "gpt_neo": ["q_proj", "v_proj"], + "bert": ["query", "value"], + "roberta": ["query", "value"], + "xlm-roberta": ["query", "value"], + "electra": ["query", "value"], + "deberta-v2": ["query_proj", "value_proj"], + "deberta": ["in_proj"], + "layoutlm": ["query", "value"], + "llama": ["q_proj", "v_proj"], + "llama4": ["q_proj", "v_proj"], + "chatglm": 
["query_key_value"], + "gpt_bigcode": ["mlp.c_proj"], + "mpt": ["Wqkv"], + "RefinedWebModel": ["query_key_value"], + "RefinedWeb": ["query_key_value"], + "falcon": ["query_key_value"], + "codegen": ["qkv_proj"], + "mistral": ["q_proj", "v_proj"], + "mixtral": ["q_proj", "v_proj"], + "stablelm": ["q_proj", "v_proj"], + "phi": ["q_proj", "v_proj", "fc1", "fc2"], + "gemma": ["q_proj", "v_proj"], + "gemma2": ["q_proj", "v_proj"], + "gemma3_text": ["q_proj", "v_proj"], + "qwen2": ["q_proj", "v_proj"], + "qwen3": ["q_proj", "v_proj"], +} + +WEIGHTS_NAME = "adapter_model.bin" +SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors" +CONFIG_NAME = "adapter_config.json" +EMBEDDING_LAYER_NAMES = ["embed_tokens", "lm_head"] +SEQ_CLS_HEAD_NAMES = ["score", "classifier"] +INCLUDE_LINEAR_LAYERS_SHORTHAND = "all-linear" +TOKENIZER_CONFIG_NAME = "tokenizer_config.json" +DUMMY_TARGET_MODULES = "dummy-target-modules" +DUMMY_MODEL_CONFIG = {"model_type": "custom"} + +# If users specify more than this number of target modules, we apply an optimization to try to reduce the target modules +# to a minimal set of suffixes, which makes loading faster. We only apply this when exceeding a certain size since +# otherwise there is no point in optimizing and there is a small chance of bugs in the optimization algorithm, so no +# point in taking unnecessary risks. See #2045 for more context. +MIN_TARGET_MODULES_FOR_OPTIMIZATION = 20 diff --git a/peft/src/peft/utils/hotswap.py b/peft/src/peft/utils/hotswap.py new file mode 100644 index 0000000000000000000000000000000000000000..2b08c372e132b1084c10df79ab35a42f2cdf19ef --- /dev/null +++ b/peft/src/peft/utils/hotswap.py @@ -0,0 +1,630 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def _update_scaling(lora_module, adapter_name, scaling=None):
    """
    Set the scaling of `adapter_name` on the given LoRA module to `scaling`.

    A scaling stored as a tensor (as produced by prepare_model_for_compiled_hotswap) is overwritten in place, a
    scaling stored as a Python number is replaced in the `scaling` dict. Nothing happens if the stored value already
    compares equal to `scaling`.

    Raises:
        ValueError: If the stored scaling is neither a tensor nor a float/int.
    """
    current = lora_module.scaling[adapter_name]
    if current == scaling:
        return

    if isinstance(current, torch.Tensor):
        # keep the tensor object itself, only its content changes
        current.fill_(scaling)
        return

    if isinstance(current, (float, int)):
        lora_module.scaling[adapter_name] = scaling
        return

    raise ValueError(
        "Something went wrong when trying to set the new scale value, expected to find the old value to be of type "
        f"float or torch.Tensor, got {type(lora_module.scaling[adapter_name])} instead."
    )
+ + Returns: + bool: + Returns `True` if an appropriate adapter was found, else `False`. + """ + found_adapter = False + for module in model.modules(): + if not isinstance(module, LoraLayer): + continue + + found_adapter = True + scaling = module.scaling + for key, val in scaling.items(): + if isinstance(val, float): + # no need to deal with dtype as scalars are coerced + scaling[key] = torch.tensor(val, device=module.weight.device) + elif not isinstance(val, torch.Tensor): + raise ValueError( + "Something went wrong while trying to convert the scalings, expected to find values of type float " + f"but found {type(val)} instead." + ) + return found_adapter + + +def _get_padded_linear(lora_module: torch.nn.Module, target_rank: int, is_lora_A: bool) -> torch.nn.Linear: + """ + Get a new Linear layer for LoRA with padded weights according to the target rank. + + Args: + lora_module (nn.Module): + The LoRA sub-module (e.g. module.lora_A[adapter_name]). + target_rank (int): + The desired rank to pad to. + is_lora_A (bool): + True if this is the LoRA A matrix, False if LoRA B. + + Returns: + nn.Linear: + A newly created and padded Linear layer. If the rank already fit, the original layer is returned. + """ + weight = lora_module.weight + # For LoRA A, the "rank dimension" is weight.size(0) (out_features). + # For LoRA B, it is weight.size(1) (in_features). + original_rank = weight.size(0) if is_lora_A else weight.size(1) + + # If no padding needed + if original_rank == target_rank: + return lora_module + + if original_rank > target_rank: + raise ValueError( + f"Trying to pad the adapter to the target rank {target_rank}, but the original rank is larger " + f"({original_rank}). This is not possible." 
+ ) + + out_features, in_features = weight.shape + + # lora_A and lora_B are always nn.Linear + if is_lora_A: + # LoRA A affects out_features + padded = torch.zeros(target_rank, in_features, device=weight.device, dtype=weight.dtype) + padded[:original_rank, :] = weight + new_layer = torch.nn.Linear(in_features, target_rank, bias=lora_module.bias is not None) + else: + # LoRA B affects in_features + padded = torch.zeros(out_features, target_rank, device=weight.device, dtype=weight.dtype) + padded[:, :original_rank] = weight + new_layer = torch.nn.Linear(target_rank, out_features, bias=lora_module.bias is not None) + + # Sanity check + if new_layer.weight.shape != padded.shape: + raise ValueError( + "Something went wrong when trying to pad the LoRA Linear weights, the new shape should be " + f"{padded.shape} but {new_layer.weight.shape} was found. Please open an issue on PEFT " + "(https://github.com/huggingface/peft/issues) and report this error." + ) + if (lora_module.bias is not None) and (new_layer.bias.shape != lora_module.bias.shape): + raise ValueError( + "Something went wrong when trying to pad the LoRA Linear bias, the new shape should be " + f"{lora_module.bias.shape} but {new_layer.bias.shape} was found. Please open an issue on PEFT " + "(https://github.com/huggingface/peft/issues) and report this error." + ) + + new_layer.weight.data = padded + # Copy bias if present + if lora_module.bias is not None: + new_layer.bias.data = lora_module.bias.data + + return new_layer + + +def _get_padded_conv2d(lora_module: torch.nn.Module, target_rank: int, is_lora_A: bool) -> torch.nn.Conv2d: + """ + Get a new Conv2d layer for LoRA with padded weights according to the target rank. + + Args: + lora_module (nn.Module): + The LoRA sub-module (e.g. module.lora_A[adapter_name]). + target_rank (int): + The desired rank to pad to. + is_lora_A (bool): + True if this is the LoRA A matrix, False if LoRA B. + + Returns: + nn.Conv2d: + A newly created and padded Conv2d layer. 
If the rank already fit, the original layer is returned. + """ + weight = lora_module.weight + # For Conv2d: [out_channels, in_channels, kernel_height, kernel_width] + out_channels, in_channels, kh, kw = weight.shape + original_rank = out_channels if is_lora_A else in_channels + + if original_rank == target_rank: + return lora_module + + if original_rank > target_rank: + raise ValueError( + f"Trying to pad the adapter to the target rank {target_rank}, but the original rank is larger " + f"({original_rank}). This is not possible." + ) + + # lora_A and lora_B are always nn.Conv2d + if is_lora_A: + # LoRA A affects out_channels + padded = torch.zeros(target_rank, in_channels, kh, kw, device=weight.device, dtype=weight.dtype) + padded[:out_channels, :, :, :] = weight + new_layer = torch.nn.Conv2d( + in_channels, + target_rank, + kernel_size=lora_module.kernel_size, + stride=lora_module.stride, + padding=lora_module.padding, + bias=lora_module.bias is not None, + groups=lora_module.groups, + ) + else: + # LoRA B affects in_channels + padded = torch.zeros(out_channels, target_rank, kh, kw, device=weight.device, dtype=weight.dtype) + padded[:, :in_channels, :, :] = weight + new_layer = torch.nn.Conv2d( + target_rank, + out_channels, + kernel_size=lora_module.kernel_size, + stride=lora_module.stride, + padding=lora_module.padding, + bias=lora_module.bias is not None, + groups=lora_module.groups, + ) + + # Sanity check + if new_layer.weight.shape != padded.shape: + raise ValueError( + "Something went wrong when trying to pad the LoRA weights, the new shape should be " + f"{padded.shape} but {new_layer.weight.shape} was found. Please open an issue on PEFT " + "(https://github.com/huggingface/peft/issues) and report this error." 
+ ) + if (lora_module.bias is not None) and (new_layer.bias.shape != lora_module.bias.shape): + raise ValueError( + "Something went wrong when trying to pad the LoRA Conv2d bias, the new shape should be " + f"{lora_module.bias.shape} but {new_layer.bias.shape} was found. Please open an issue on PEFT " + "(https://github.com/huggingface/peft/issues) and report this error." + ) + + new_layer.weight.data = padded + # Copy bias if present + if lora_module.bias is not None: + new_layer.bias.data = lora_module.bias.data + + return new_layer + + +def _pad_lora_weights(model: torch.nn.Module, target_rank: int) -> bool: + """ + Pad LoRA weights in a model to a target rank while preserving the original behavior. + + Args: + model (nn.Module): The model containing LoRA modules (with lora_A and lora_B). + target_rank (int): The target rank to pad to. + + Returns: + bool: + Returns `True` if an appropriate adapter was found, else `False`. + """ + found_adapter = False + + for module in model.modules(): + # Decide which pad function to call based on module type + if isinstance(module, Linear): + pad_fn = _get_padded_linear + elif isinstance(module, Conv2d): + pad_fn = _get_padded_conv2d + else: + # Skip any other module types + continue + + # Pad LoRA A + for adapter_name, lora_A_module in module.lora_A.items(): + new_layer = pad_fn(lora_A_module, target_rank=target_rank, is_lora_A=True) + module.lora_A[adapter_name] = new_layer + + # Pad LoRA B + for adapter_name, lora_B_module in module.lora_B.items(): + new_layer = pad_fn(lora_B_module, target_rank=target_rank, is_lora_A=False) + module.lora_B[adapter_name] = new_layer + + found_adapter = True + return found_adapter + + +def prepare_model_for_compiled_hotswap( + model: torch.nn.Module, + *, + target_rank: Optional[int] = None, + config: Optional[LoraConfig | dict[str, LoraConfig]] = None, + check_compiled: Literal["error", "warn", "ignore"] = "error", +) -> None: + """ + Helper function that prepares the model so that it can 
later be compiled and then used with hot-swapping. + + It is necessary to call this function on the model for hot-swapping to work if both of these are true: + + - the different LoRA adapters have different ranks and/or different alpha values (i.e. scalings) + - you plan to torch.compile the model and want to avoid re-compilation + + It is important to call this function *after* the first LoRA adapter has been loaded (i.e. the one that will be + swapped out) but *before* the model is compiled. + + Even with this function, hot-swapping LoRA adapters that target different layers is still not supported. + + Note: This function modifies the model in-place. If you want to restore the model to its initial state, you will + have to reload it. + + Args: + model (`nn.Module`): + The model with the loaded adapter, before compilation. + target_rank (`int`, *optional*): + The target rank to pad the LoRA weights to. Should be the maximum rank among all LoRA adapters that will be + hot-swapped. If not specified, the target ranks will not be changed. + config (`LoraConfig` or `dict[str, LoraConfig]`, *optional*): + Optionally pass the `LoraConfig`s of the LoRA adapters. If passed, the rank in the configs will be updated + to `target_rank`. + check_compiled (`str`, *optional*, defaults to `"error"`): + How to handle the case when the model is already compiled, which should generally be avoided. The options + are: + - "error" (default): raise an error + - "warn": issue a warning + - "ignore": do nothing + + Raises: + ValueError + If the model is already compiled or if no adpater layer was found, raise an error. + + Example: + + ```py + base_model = ... + model = PeftModel.from_pretrained(base_model, path_adapter_0) + # Prepare the model to allow hotswapping even if ranks/scalings of 2nd adapter differ. + # You can skip this step if all ranks and scalings are identical. 
+ prepare_model_for_compiled_hotswap(model, target_rank=highest_lora_rank) + model = torch.compile(model) + # do inference with adapter 0 + # replace the "default" lora adapter with the new one + hotswap_adapter(model, path_adapter_1, adapter_name="default", torch_device=device) + # do inference with adapter 1 + ``` + + """ + is_compiled = hasattr(model, "_orig_mod") or getattr(model, "_compiled_call_impl", False) + if is_compiled: + if check_compiled == "error": + raise ValueError("Call prepare_model_for_compiled_hotswap *before* compiling the model") + elif check_compiled == "warn": + warnings.warn( + "prepare_model_for_compiled_hotswap was called with a model that is already compiled. This will likely " + "result in re-compilation, hurting performance. Call the function before compiling the model." + ) + elif check_compiled != "ignore": + raise ValueError( + f"check_compiles should be one of 'error', 'warn', or 'ignore', got '{check_compiled}' instead." + ) + + conversion_found_adapter = _convert_scalings_to_tensor(model) + if target_rank is not None: + padding_found_adapter = _pad_lora_weights(model, target_rank=target_rank) + else: + padding_found_adapter = False + + if not (conversion_found_adapter or padding_found_adapter): + raise ValueError( + "No adapter layers found on the model, make sure call `prepare_model_for_compiled_hotswap` after loading " + "the first adapter and before loading the second adapter." 
+ ) + + if not config: + return + if target_rank is None: + return + + if not isinstance(config, dict): + # config can be either a PeftConfig, or a dict of PeftConfigs like PeftModel.peft_config + config = {"dummy": config} + + for lora_config in config.values(): + lora_config.r = target_rank + if lora_config.rank_pattern: + for key in lora_config.rank_pattern: + lora_config.rank_pattern[key] = target_rank + + +def hotswap_adapter_from_state_dict( + model: torch.nn.Module, + state_dict: dict[str, torch.Tensor], + adapter_name: str, + config: LoraConfig, + parameter_prefix: str = "lora_", +): + """ + Swap out the adapter weights from the model with the weights from state_dict. + + As of now, only LoRA is supported. + + This is a low-level function that assumes that the adapters have been checked for compatibility and that the + state_dict has been correctly mapped to work with PEFT. For a high level function that performs this work for you, + use `hotswap_adapter` instead. + + Args: + model (`nn.Module`): + The model with the loaded adapter. + state_dict (`dict[str, torch.Tensor]`): + The state dict of the new adapter, which needs to be compatible (targeting same modules etc.). + adapter_name (`str`): + The name of the adapter that should be hot-swapped, e.g. `"default"`. The name will remain the same after + swapping. + config (`LoraConfig`): + The config of the LoRA adapter. This is used to determine the scaling and rank of the adapter. + parameter_prefix (`str`, *optional*, defaults to `"lora_"`) + The prefix used to identify the adapter's keys in the state dict. For LoRA, this would be `"lora_"` (the + default). + + Raises: + RuntimeError + If the old and the new adapter are not compatible, a RuntimeError is raised. 
+ + """ + # Ensure that all the keys of the new adapter correspond exactly to the keys of the old adapter, otherwise + # hot-swapping is not possible + + # _orig_mod is for torch.compile(model) and _compiled_call_impl is for model.compile() (not wrapped) + is_compiled = hasattr(model, "_orig_mod") + is_compiled_inplace = bool(getattr(model, "_compiled_call_impl", None)) + # TODO: there is probably a more precise way to identify the adapter keys + missing_keys = {k for k in model.state_dict() if (parameter_prefix in k) and (adapter_name in k)} + unexpected_keys = [] + + # first: dry run, not swapping anything + for key, new_val in state_dict.items(): + try: + old_val = attrgetter(key)(model) + except AttributeError: + unexpected_keys.append(key) + continue + + if is_compiled: + missing_keys.remove("_orig_mod." + key) + else: + missing_keys.remove(key) + + # Right now, we don't deal with unexpected keys, i.e. if the adapter being swapped in targeting new layers. We could + # probably add LoRA to these layers ad hoc, but that would not work with compiled models. + if unexpected_keys: + msg = f"Hot swapping the adapter did not succeed, unexpected keys found: {', '.join(unexpected_keys)}." + raise RuntimeError(msg) + + # If the adapter that is being swapped in is missing some keys, this is fine. We just need to ensure that those LoRA + # weights from the previous adapter are set to 0 so that they don't influence the output. We don't need to worry + # about ranks are alphas. 
+ for key in missing_keys: + # in case it's a compiled model + key = key.removeprefix("_orig_mod.") + # get LoRA parent module name by removing the 'lora_*..weight' part + module_name = ".".join(key.split(".")[:-3]) + module = model.get_submodule(module_name) + old_val = attrgetter(key)(model) + old_val.data.fill_(0.0) + + # actual swapping + for key, new_val in state_dict.items(): + # get LoRA parent module name by removing the 'lora_*..weight' part + module_name = ".".join(key.split(".")[:-3]) + module = model.get_submodule(module_name) + + # swap alpha/scaling + r_key = get_pattern_key(config.rank_pattern.keys(), key) + alpha_key = get_pattern_key(config.alpha_pattern.keys(), key) + rank = config.rank_pattern.get(r_key, config.r) + alpha = config.alpha_pattern.get(alpha_key, config.lora_alpha) + if config.use_rslora: + scaling = alpha / math.sqrt(rank) + else: + scaling = alpha / rank + _update_scaling(module, adapter_name=adapter_name, scaling=scaling) + + # swap actual weights + # no need to account for potential _orig_mod in key here, as torch handles that + old_val = attrgetter(key)(model) + new_val = new_val.to(old_val.data.device) + + # We try to detect if the model is compiled but it does not always work, e.g. if hotswapping is called from + # within the model itself. In this case, swap_tensors raises RuntimeError and should continue without + # swap_tensors. + if not is_compiled and not is_compiled_inplace: + try: + torch.utils.swap_tensors(old_val, new_val) + continue + except RuntimeError: + is_compiled = True + + # Compiled models don't work with swap_tensors because there are weakrefs for the tensor. It is unclear if + # this workaround could not cause trouble but the tests indicate that it works. 
+ if old_val.shape == new_val.shape: + # either + # - adapters had the same rank + # - adapters were padded with prepare_model_for_compiled_hotswap and 2nd adapter was larger + old_val.data.copy_(new_val.data) + else: + # if 2nd adapter was smaller, ensure to fill up to adapter dimension and set the rest to zeros + if old_val.dim() not in (2, 4): + raise NotImplementedError( + f"Trying to hotswap an adapter whose weight has {old_val.dim()} dimensions, but only Conv2d and " + "Linear are supported" + ) + + # Linear or Conv2d: the check for dim 0 or 1 works for both of these layer types + if old_val.shape[0] > new_val.shape[0]: + old_val.data.fill_(0) + old_val.data[: new_val.shape[0]].copy_(new_val.data) + elif old_val.shape[1] > new_val.shape[1]: + old_val.data.fill_(0) + old_val.data[:, : new_val.shape[1]].copy_(new_val.data) + else: + raise ValueError( + f"Incompatible shapes found for LoRA weights {key}: {old_val.shape} vs {new_val.shape}. Please " + "ensure that all ranks are padded to the largest rank among all LoRA adapters by using " + "peft.utils.hotswap.prepare_model_for_compiled_hotswap." + ) + + +def check_hotswap_configs_compatible(config0: PeftConfig, config1: PeftConfig) -> None: + """ + Check if two configs are compatible for hot-swapping. + + Only LoRA parameters are checked for now. + + To hot-swap two adapters, their configs must be compatible. Otherwise, the results could be false. E.g. if they use + different alpha values, after hot-swapping, the alphas from the first adapter would still be used with the weights + from the 2nd adapter, which would result in incorrect behavior. There is probably a way to swap these values as + well, but that's not implemented yet, and we need to be careful not to trigger re-compilation if the model is + compiled (so no modification of the dict). 
+ + """ + + if config0.peft_type != config1.peft_type: + msg = f"Incompatible PEFT types found: {config0.peft_type.value} and {config1.peft_type.value}" + raise ValueError(msg) + + if config0.peft_type not in CONFIG_KEYS_TO_CHECK: + msg = ( + f"Hotswapping only supports {', '.join(CONFIG_KEYS_TO_CHECK.keys())} but " + f"{config0.peft_type.value} was passed." + ) + raise ValueError(msg) + config_keys_to_check = CONFIG_KEYS_TO_CHECK[config0.peft_type] + + # TODO: This is a very rough check only for LoRA at the moment. Also, there might be some options that don't + # necessarily require an error. + config0 = config0.to_dict() + config1 = config1.to_dict() + sentinel = object() + for key in config_keys_to_check: + val0 = config0.get(key, sentinel) + val1 = config1.get(key, sentinel) + if val0 != val1: + raise ValueError(f"Configs are incompatible: for {key}, {val0} != {val1}") + + +def hotswap_adapter(model, model_name_or_path, adapter_name, torch_device=None, **kwargs): + """Substitute old adapter data with new adapter data, keeping the rest the same. + + As of now, only LoRA is supported. + + This function is useful when you want to replace the loaded adapter with a new adapter. The adapter name will + remain the same, but the weights and other parameters will be swapped out. + + If the adapters are incomptabile, e.g. targeting different layers or having different alpha values, an error will + be raised. + + Example: + + ```py + >>> import torch + >>> from transformers import AutoModelForCausalLM + >>> from peft import PeftModel + >>> from peft.utils.hotswap import hotswap_adapter + + >>> model_id = ... + >>> inputs = ... + >>> device = ... + >>> model = AutoModelForCausalLM.from_pretrained(model_id).to(device) + + >>> # load lora 0 + >>> model = PeftModel.from_pretrained(model, "path-adapter-0") + >>> model = torch.compile(model) # optionally compile the model + >>> with torch.inference_mode(): + ... 
output_adapter_0 = model(inputs) + + >>> # replace the "default" lora adapter with the new one + >>> hotswap_adapter(model, "path-adapter-1", adapter_name="default", torch_device=device) + >>> with torch.inference_mode(): + ... output_adapter_1 = model(inputs).logits + ``` + + Args: + model ([`~PeftModel`]): + The PEFT model with the loaded adapter. + model_name_or_path (`str`): + The name or path of the model to load the new adapter from. + adapter_name (`str`): + The name of the adapter to swap, e.g. `"default"`. The name will stay the same after swapping. + torch_device: (`str`, *optional*, defaults to None): + The device to load the new adapter onto. + **kwargs (`optional`): + Additional keyword arguments used for loading the config and weights. + + """ + if torch_device is None: + torch_device = infer_device() + + ############################ + # LOAD CONFIG AND VALIDATE # + ############################ + hf_kwargs = { + "subfolder": kwargs.get("subfolder", None), + "revision": kwargs.get("revision", None), + "cache_dir": kwargs.get("cache_dir", None), + "token": kwargs.get("token", None), + } + if use_auth_token := kwargs.get("use_auth_token", None): + hf_kwargs["use_auth_token"] = use_auth_token + config_cls = PEFT_TYPE_TO_CONFIG_MAPPING[PeftConfig._get_peft_type(model_name_or_path, **hf_kwargs)] + config = config_cls.from_pretrained(model_name_or_path, **kwargs) + # config keys that could affect the model output besides what is determined by the state_dict + check_hotswap_configs_compatible(model.active_peft_config, config) + + state_dict = load_peft_weights(model_name_or_path, device=torch_device, **kwargs) + + ########################### + # LOAD & REMAP STATE_DICT # + ########################### + + parameter_prefix = PEFT_TYPE_TO_PREFIX_MAPPING[config.peft_type] + peft_model_state_dict = _insert_adapter_name_into_state_dict( + state_dict, adapter_name=adapter_name, parameter_prefix=parameter_prefix + ) + + hotswap_adapter_from_state_dict( + model=model, 
+ state_dict=peft_model_state_dict, + adapter_name=adapter_name, + parameter_prefix=parameter_prefix, + config=config, + ) diff --git a/peft/src/peft/utils/incremental_pca.py b/peft/src/peft/utils/incremental_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..de4a7c05174dc436f4c75965ef9585afb480183c --- /dev/null +++ b/peft/src/peft/utils/incremental_pca.py @@ -0,0 +1,338 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import torch + + +class IncrementalPCA: + """ + An implementation of Incremental Principal Components Analysis (IPCA) that leverages PyTorch for GPU acceleration. + Adapted from https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/decomposition/_incremental_pca.py + + This class provides methods to fit the model on data incrementally in batches, and to transform new data based on + the principal components learned during the fitting process. + + Args: + n_components (int, optional): Number of components to keep. If `None`, it's set to the minimum of the + number of samples and features. Defaults to None. + copy (bool): If False, input data will be overwritten. Defaults to True. + batch_size (int, optional): The number of samples to use for each batch. Only needed if self.fit is called. + If `None`, it's inferred from the data and set to `5 * n_features`. Defaults to None. 
+ svd_driver (str, optional): name of the cuSOLVER method to be used for torch.linalg.svd. This keyword + argument only works on CUDA inputs. Available options are: None, gesvd, gesvdj, and gesvda. Defaults to + None. + lowrank (bool, optional): Whether to use torch.svd_lowrank instead of torch.linalg.svd which can be faster. + Defaults to False. + lowrank_q (int, optional): For an adequate approximation of n_components, this parameter defaults to + n_components * 2. + lowrank_niter (int, optional): Number of subspace iterations to conduct for torch.svd_lowrank. + Defaults to 4. + lowrank_seed (int, optional): Seed for making results of torch.svd_lowrank reproducible. + """ + + def __init__( + self, + n_components: Optional[int] = None, + copy: Optional[bool] = True, + batch_size: Optional[int] = None, + svd_driver: Optional[str] = None, + lowrank: bool = False, + lowrank_q: Optional[int] = None, + lowrank_niter: int = 4, + lowrank_seed: Optional[int] = None, + ): + self.n_components = n_components + self.copy = copy + self.batch_size = batch_size + self.svd_driver = svd_driver + self.lowrank = lowrank + self.lowrank_q = lowrank_q + self.lowrank_niter = lowrank_niter + self.lowrank_seed = lowrank_seed + + self.n_features_ = None + + if self.lowrank: + self._validate_lowrank_params() + + def _validate_lowrank_params(self): + if self.lowrank_q is None: + if self.n_components is None: + raise ValueError("n_components must be specified when using lowrank mode with lowrank_q=None.") + self.lowrank_q = self.n_components * 2 + elif self.lowrank_q < self.n_components: + raise ValueError("lowrank_q must be greater than or equal to n_components.") + + def _svd_fn_full(self, X): + return torch.linalg.svd(X, full_matrices=False, driver=self.svd_driver) + + def _svd_fn_lowrank(self, X): + seed_enabled = self.lowrank_seed is not None + with torch.random.fork_rng(enabled=seed_enabled): + if seed_enabled: + torch.manual_seed(self.lowrank_seed) + U, S, V = torch.svd_lowrank(X, 
q=self.lowrank_q, niter=self.lowrank_niter) + return U, S, V.mH + + def _validate_data(self, X) -> torch.Tensor: + """ + Validates and converts the input data `X` to the appropriate tensor format. + + Args: + X (torch.Tensor): Input data. + + Returns: + torch.Tensor: Converted to appropriate format. + """ + valid_dtypes = [torch.float32, torch.float64] + + if not isinstance(X, torch.Tensor): + X = torch.tensor(X, dtype=torch.float32) + elif self.copy: + X = X.clone() + + n_samples, n_features = X.shape + if self.n_components is None: + pass + elif self.n_components > n_features: + raise ValueError( + f"n_components={self.n_components} invalid for n_features={n_features}, " + "need more rows than columns for IncrementalPCA processing." + ) + elif self.n_components > n_samples: + raise ValueError( + f"n_components={self.n_components} must be less or equal to the batch number of samples {n_samples}" + ) + + if X.dtype not in valid_dtypes: + X = X.to(torch.float32) + + return X + + @staticmethod + def _incremental_mean_and_var( + X, last_mean, last_variance, last_sample_count + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Computes the incremental mean and variance for the data `X`. + + Args: + X (torch.Tensor): The batch input data tensor with shape (n_samples, n_features). + last_mean (torch.Tensor): The previous mean tensor with shape (n_features,). + last_variance (torch.Tensor): The previous variance tensor with shape (n_features,). + last_sample_count (torch.Tensor): The count tensor of samples processed before the current batch. + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Updated mean, variance tensors, and total sample count. 
+ """ + if X.shape[0] == 0: + return last_mean, last_variance, last_sample_count + + if last_sample_count > 0: + if last_mean is None: + raise ValueError("last_mean should not be None if last_sample_count > 0.") + if last_variance is None: + raise ValueError("last_variance should not be None if last_sample_count > 0.") + + new_sample_count = torch.tensor([X.shape[0]], device=X.device) + updated_sample_count = last_sample_count + new_sample_count + + if last_mean is None: + last_sum = torch.zeros(X.shape[1], dtype=torch.float64, device=X.device) + else: + last_sum = last_mean * last_sample_count + + new_sum = X.sum(dim=0, dtype=torch.float64) + + updated_mean = (last_sum + new_sum) / updated_sample_count + + T = new_sum / new_sample_count + temp = X - T + correction = temp.sum(dim=0, dtype=torch.float64).square() + temp.square_() + new_unnormalized_variance = temp.sum(dim=0, dtype=torch.float64) + new_unnormalized_variance -= correction / new_sample_count + if last_variance is None: + updated_variance = new_unnormalized_variance / updated_sample_count + else: + last_unnormalized_variance = last_variance * last_sample_count + last_over_new_count = last_sample_count.double() / new_sample_count + updated_unnormalized_variance = ( + last_unnormalized_variance + + new_unnormalized_variance + + last_over_new_count / updated_sample_count * (last_sum / last_over_new_count - new_sum).square() + ) + updated_variance = updated_unnormalized_variance / updated_sample_count + + return updated_mean, updated_variance, updated_sample_count + + @staticmethod + def _svd_flip(u, v, u_based_decision=True) -> tuple[torch.Tensor, torch.Tensor]: + """ + Adjusts the signs of the singular vectors from the SVD decomposition for deterministic output. + + This method ensures that the output remains consistent across different runs. + + Args: + u (torch.Tensor): Left singular vectors tensor. + v (torch.Tensor): Right singular vectors tensor. 
+ u_based_decision (bool, optional): If True, uses the left singular vectors to determine the sign flipping. + Defaults to True. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Adjusted left and right singular vectors tensors. + """ + if u_based_decision: + max_abs_cols = torch.argmax(torch.abs(u), dim=0) + signs = torch.sign(u[max_abs_cols, range(u.shape[1])]) + else: + max_abs_rows = torch.argmax(torch.abs(v), dim=1) + signs = torch.sign(v[range(v.shape[0]), max_abs_rows]) + u *= signs[: u.shape[1]].view(1, -1) + v *= signs.view(-1, 1) + return u, v + + def fit(self, X, check_input=True): + """ + Fits the model with data `X` using minibatches of size `batch_size`. + + Args: + X (torch.Tensor): The input data tensor with shape (n_samples, n_features). + check_input (bool, optional): If True, validates the input. Defaults to True. + + Returns: + IncrementalPCA: The fitted IPCA model. + """ + if check_input: + X = self._validate_data(X) + n_samples, n_features = X.shape + if self.batch_size is None: + self.batch_size = 5 * n_features + + for batch in self.gen_batches(n_samples, self.batch_size, min_batch_size=self.n_components or 0): + self.partial_fit(X[batch], check_input=False) + + return self + + def partial_fit(self, X, check_input=True): + """ + Incrementally fits the model with batch data `X`. + + Args: + X (torch.Tensor): The batch input data tensor with shape (n_samples, n_features). + check_input (bool, optional): If True, validates the input. Defaults to True. + + Returns: + IncrementalPCA: The updated IPCA model after processing the batch. 
+ """ + first_pass = not hasattr(self, "components_") + + if check_input: + X = self._validate_data(X) + n_samples, n_features = X.shape + + # Initialize attributes to avoid errors during the first call to partial_fit + if first_pass: + self.mean_ = None # Will be initialized properly in _incremental_mean_and_var based on data dimensions + self.var_ = None # Will be initialized properly in _incremental_mean_and_var based on data dimensions + self.n_samples_seen_ = torch.tensor([0], device=X.device) + self.n_features_ = n_features + if not self.n_components: + self.n_components = min(n_samples, n_features) + + if n_features != self.n_features_: + raise ValueError( + "Number of features of the new batch does not match the number of features of the first batch." + ) + + col_mean, col_var, n_total_samples = self._incremental_mean_and_var( + X, self.mean_, self.var_, self.n_samples_seen_ + ) + + if first_pass: + X -= col_mean + else: + col_batch_mean = torch.mean(X, dim=0) + X -= col_batch_mean + mean_correction_factor = torch.sqrt((self.n_samples_seen_.double() / n_total_samples) * n_samples) + mean_correction = mean_correction_factor * (self.mean_ - col_batch_mean) + X = torch.vstack( + ( + self.singular_values_.view((-1, 1)) * self.components_, + X, + mean_correction, + ) + ) + + if self.lowrank: + U, S, Vt = self._svd_fn_lowrank(X) + else: + U, S, Vt = self._svd_fn_full(X) + U, Vt = self._svd_flip(U, Vt, u_based_decision=False) + explained_variance = S**2 / (n_total_samples - 1) + explained_variance_ratio = S**2 / torch.sum(col_var * n_total_samples) + + self.n_samples_seen_ = n_total_samples + self.components_ = Vt[: self.n_components] + self.singular_values_ = S[: self.n_components] + self.mean_ = col_mean + self.var_ = col_var + self.explained_variance_ = explained_variance[: self.n_components] + self.explained_variance_ratio_ = explained_variance_ratio[: self.n_components] + if self.n_components not in (n_samples, n_features): + self.noise_variance_ = 
explained_variance[self.n_components :].mean() + else: + self.noise_variance_ = torch.tensor(0.0, device=X.device) + return self + + def transform(self, X) -> torch.Tensor: + """ + Applies dimensionality reduction to `X`. + + The input data `X` is projected on the first principal components previously extracted from a training set. + + Args: + X (torch.Tensor): New data tensor with shape (n_samples, n_features) to be transformed. + + Returns: + torch.Tensor: Transformed data tensor with shape (n_samples, n_components). + """ + X = X - self.mean_ + return torch.mm(X.double(), self.components_.T).to(X.dtype) + + @staticmethod + def gen_batches(n: int, batch_size: int, min_batch_size: int = 0): + """Generator to create slices containing `batch_size` elements from 0 to `n`. + + The last slice may contain less than `batch_size` elements, when `batch_size` does not divide `n`. + + Args: + n (int): Size of the sequence. + batch_size (int): Number of elements in each batch. + min_batch_size (int, optional): Minimum number of elements in each batch. Defaults to 0. + + Yields: + slice: A slice of `batch_size` elements. + """ + start = 0 + for _ in range(int(n // batch_size)): + end = start + batch_size + if end + min_batch_size > n: + continue + yield slice(start, end) + start = end + if start < n: + yield slice(start, n) diff --git a/peft/src/peft/utils/integrations.py b/peft/src/peft/utils/integrations.py new file mode 100644 index 0000000000000000000000000000000000000000..dc5ae465dbe89afaadde6aa9a7a05a0db3132694 --- /dev/null +++ b/peft/src/peft/utils/integrations.py @@ -0,0 +1,281 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
@contextmanager
def gather_params_ctx(param, modifier_rank: Optional[int] = 0, fwd_module: torch.nn.Module = None):
    """Enter DeepSpeed's `GatheredParameters` context when ZeRO-3 is enabled, otherwise act as a no-op context.

    Args:
        param: Forwarded unchanged as the first argument of `deepspeed.zero.GatheredParameters`.
        modifier_rank (`Optional[int]`): Forwarded as `modifier_rank`. Defaults to 0.
        fwd_module (`torch.nn.Module`): Forwarded as `fwd_module`. Defaults to `None`.
    """
    if check_deepspeed_zero3_enabled():
        # deepspeed is imported lazily so it is only required when ZeRO-3 is actually in use
        import deepspeed

        gathered = deepspeed.zero.GatheredParameters(param, modifier_rank=modifier_rank, fwd_module=fwd_module)
        with gathered:
            yield
    else:
        yield
+ """ + if hasattr(module, "W_q"): # For handling HQQ quantized weight + weight = module.dequantize() + return weight + elif type(module.weight).__module__.startswith("torchao."): + # check for torchao without requiring any torchao imports + weight = module.weight.dequantize() + return weight + + weight = module.weight + if not isinstance(weight, torch.nn.Parameter): + if isinstance(weight, torch.Tensor): + # this is an FSDP-specific edge case + return weight # type: ignore + raise TypeError(f"Input weight should be of type nn.Parameter, got {type(weight)} instead") + + cls_name = weight.__class__.__name__ + if cls_name not in ("Params4bit", "Int8Params"): + return weight + + quant_state = getattr(module, "state", None) + device = weight.device + is_cpu = device.type == torch.device("cpu").type + weight = dequantize_bnb_weight(weight, state=quant_state) # no-op if not bnb + if is_cpu: + # dequantize_bnb_weight for 8bit moves the device in-place, thus we need to move it back to CPU if necessary + module.weight = module.weight.to(device) + return weight + + +def dequantize_bnb_weight(weight: torch.nn.Parameter, state=None): + """Helper function to dequantize 4bit or 8bit bnb weights.""" + import bitsandbytes as bnb + + if state.SCB is None: + state.SCB = weight.SCB + + device = weight.device + + cls_name = weight.__class__.__name__ + if cls_name == "Params4bit": + dequantized = bnb.functional.dequantize_4bit(weight.data, weight.quant_state) + return dequantized + + if hasattr(bnb.functional, "int8_vectorwise_dequant"): + # Use bitsandbytes API if available (requires v0.45.0+) + dequantized = bnb.functional.int8_vectorwise_dequant(weight.data, state.SCB) + else: + # Multiply by (scale/127) to dequantize. 
+ dequantized = weight.data * state.SCB.view(-1, 1) * 7.874015718698502e-3 + + return dequantized + + +def get_bnb_param_type(param: torch.nn.Parameter) -> Literal[False, "4bit", "8bit"]: + """Returns '4bit' or '8bit' if bitsandbytes parameter, else False""" + if param.__class__.__name__ == "Params4bit": + return "4bit" + if param.__class__.__name__ == "Int8Params": + return "8bit" + return False + + +# adapted from: +# https://github.com/huggingface/transformers/blob/eab6c491d439e83d5e31c660df6f7e36592eb0a2/src/transformers/generation/utils.py#L1617-L1643 +def get_layer_device_map(model): + """ + Derive the device map for the layers of the model. + """ + main_device = [d for d in model.hf_device_map.values() if d not in ["cpu", "disk"]][0] + + execution_device_map = { + name: main_device if device in ["cpu", "disk"] else device for name, device in model.hf_device_map.items() + } + + if execution_device_map is None: + return None + + if len(execution_device_map) == 1 and "" in execution_device_map: + return {idx: execution_device_map[""] for idx in range(model.config.num_hidden_layers)} + + layer_device_map = {} + for layer in execution_device_map: + for idx in range(model.config.num_hidden_layers): + if f".{idx}." in f"{layer}.": + layer_device_map[idx] = execution_device_map[layer] + break + for idx in range(model.config.num_hidden_layers): + if idx not in layer_device_map: + raise RuntimeError(f"layer {idx} has not been mapped to a device.") + return layer_device_map + + +# adapted from: +# https://github.com/huggingface/transformers/blob/eab6c491d439e83d5e31c660df6f7e36592eb0a2/src/transformers/cache_utils.py#L1159-L1179 +def map_cache_to_layer_device_map(model, cache) -> None: + """ + Ensure that the key and value cache of the model are on the same device as their corresponding layers. 
+ """ + if not (isinstance(cache, transformers.Cache) and hasattr(model, "hf_device_map")): + return + + if isinstance(cache, transformers.EncoderDecoderCache): + map_cache_to_layer_device_map(model, cache.self_attention_cache) + return + + layer_device_map = get_layer_device_map(model) + for idx in range(model.config.num_hidden_layers): + layer_device = layer_device_map[idx] + if hasattr(cache, "layers"): + # new transformers uses cache.layers (>v4.55) + layer = cache.layers[idx] + layer.keys = layer.keys.to(layer_device) + layer.values = layer.values.to(layer_device) + else: + # old transformers uses cache.{key,value}_cache (<=v4.55) + # TODO: remove if we drop support for transformers <= 4.55 + cache.key_cache[idx] = cache.key_cache[idx].to(layer_device) + cache.value_cache[idx] = cache.value_cache[idx].to(layer_device) + + +################################## +# START: ADAPTED FROM ACCELERATE # +################################## +# +# Modified to support explicitly skipping layer initialization for faster switching between layer states +# (necessary for supporting `nn.MultiHeadAttention` adapters) + + +@contextmanager +def init_empty_weights(include_buffers: bool = None): + # adapted from accelerate.big_modeling.py + with _init_on_device(torch.device("meta"), include_buffers=include_buffers) as f: + yield f + + +@contextmanager +def _init_on_device(device: torch.device, include_buffers: bool = None): + # adapted from accelerate.big_modeling.py + old_register_parameter = nn.Module.register_parameter + if include_buffers: + old_register_buffer = nn.Module.register_buffer + + def register_empty_parameter(module, name, param): + # This works because torch first initializes the parameters with torch.empty, thus not assigning any new memory. + # Then the parameter is moved to meta device before reset_parameters() is called, which then operates on the + # meta device, making any subsequent calls to initialization methods no-ops. 
+ old_register_parameter(module, name, param) + if (param is not None) and (getattr(_init_on_device, "_skip", False) is not True): + param_cls = type(module._parameters[name]) + kwargs = module._parameters[name].__dict__ + kwargs["requires_grad"] = param.requires_grad + module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs) + + def register_empty_buffer(module, name, buffer, persistent=True): + old_register_buffer(module, name, buffer, persistent=persistent) + if buffer is not None: + module._buffers[name] = module._buffers[name].to(device) + + # Patch tensor creation + if include_buffers: + tensor_constructors_to_patch = { + torch_function_name: getattr(torch, torch_function_name) + for torch_function_name in ["empty", "zeros", "ones", "full"] + } + else: + tensor_constructors_to_patch = {} + + def patch_tensor_constructor(fn): + def wrapper(*args, **kwargs): + kwargs["device"] = device + return fn(*args, **kwargs) + + return wrapper + + try: + nn.Module.register_parameter = register_empty_parameter + if include_buffers: + nn.Module.register_buffer = register_empty_buffer + for torch_function_name in tensor_constructors_to_patch.keys(): + setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name))) + yield + finally: + nn.Module.register_parameter = old_register_parameter + if include_buffers: + nn.Module.register_buffer = old_register_buffer + for torch_function_name, old_torch_function in tensor_constructors_to_patch.items(): + setattr(torch, torch_function_name, old_torch_function) + + +@contextmanager +def _skip_init_on_device(): + # context manager to skip the _init_on_device context manager + old_val = getattr(_init_on_device, "_skip", False) + try: + _init_on_device._skip = True + yield + finally: + _init_on_device._skip = old_val + + +def skip_init_on_device(func): + """ + Ignore the init_on_device context manager when calling the decorated function. 
+ + This is a narrow use decorator that allows us to avoid initializing on meta device even when we're inside the + init_empty_weights context. + + """ + + # The need for this functionality arose when working on MultiheadAttention, where we have to call _restore_weights + # repeatedly as parametes are overwritten and need to be re-registered. When using low_cpu_mem_usage=True, as + # register_parameter is patched inside of the init_empty_weights context, this would result in those parameters + # suddenly being moved to meta device. Using this decorator allows us to avoid this. + @functools.wraps(func) + def wrapper(*args, **kwargs): + with _skip_init_on_device(): + return func(*args, **kwargs) + + return wrapper + + +####### +# END # +####### diff --git a/peft/src/peft/utils/loftq_utils.py b/peft/src/peft/utils/loftq_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a5b19b44673c3d1bbf7cb97189e79a8bfc9806af --- /dev/null +++ b/peft/src/peft/utils/loftq_utils.py @@ -0,0 +1,409 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class NFQuantizer:
    """Lookup-table based k-bit quantizer.

    Values are normalized, mapped to the index of the closest entry of `norm_lookup_table`, and (for the
    block-wise API) packed into `uint8` storage.

    Args:
        num_bits (int): Number of bits per quantized value. Defaults to 2.
        device: Device on which the lookup table is stored. Defaults to "cuda".
        method (str): "normal" (table from `create_normal_map`) or "uniform" (table from `create_uniform_map`).
        block_size (int): Number of values that share one scale in `quantize_block`. Defaults to 64.

    Raises:
        NotImplementedError: If `method` is neither "normal" nor "uniform".
    """

    def __init__(self, num_bits=2, device="cuda", method="normal", block_size=64, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.num_bits = num_bits
        self.device = device
        self.method = method
        self.block_size = block_size
        if self.method == "normal":
            lookup_table = self.create_normal_map(num_bits=self.num_bits)
        elif self.method == "uniform":
            lookup_table = self.create_uniform_map(num_bits=self.num_bits)
        else:
            raise NotImplementedError("Other quantization methods not supported yet.")
        self.norm_lookup_table = lookup_table.to(device)

    @staticmethod
    def create_uniform_map(symmetric=False, num_bits=4):
        """Return an evenly spaced lookup table on [-1, 1].

        With `symmetric=True` the table is built from two halves that share the value 0 and therefore has
        `2**num_bits - 1` entries; otherwise it has `2**num_bits` entries.
        """
        if symmetric:
            negative = torch.linspace(-1, 0, 2 ** (num_bits - 1))
            positive = torch.linspace(0, 1, 2 ** (num_bits - 1))
            # drop the duplicated 0 of the positive half
            table = torch.cat([negative, positive[1:]])
        else:
            table = torch.linspace(-1, 1, 2**num_bits)
        return table

    @staticmethod
    def create_normal_map(offset=0.9677083, symmetric=False, num_bits=2):
        """Return a sorted lookup table built from normal-distribution quantiles, scaled so its maximum is 1.

        Raises:
            ImportError: If scipy is not installed.
        """
        try:
            from scipy.stats import norm
        except ImportError as exc:
            raise ImportError(
                "The required package 'scipy' is not installed. Please install it to continue."
            ) from exc

        variations = 2**num_bits
        if symmetric:
            v = norm.ppf(torch.linspace(1 - offset, offset, variations + 1)).tolist()
            # midpoints of neighbouring quantiles
            v = [0.5 * low + 0.5 * high for low, high in zip(v[:-1], v[1:])]
        else:
            # one more positive value, this is an asymmetric type
            v1 = norm.ppf(torch.linspace(offset, 0.5, variations // 2 + 1)[:-1]).tolist()
            v2 = [0]
            v3 = (-norm.ppf(torch.linspace(offset, 0.5, variations // 2)[:-1])).tolist()
            v = v1 + v2 + v3

        values = torch.Tensor(v)
        values = values.sort().values
        values /= values.max()
        return values

    def quantize_tensor(self, weight):
        """Quantize `weight` with a single scale for the whole tensor.

        Returns:
            Tuple of the lookup-table indices (same shape as `weight`) and the scale `max(|weight|)`.
        """
        max_abs = torch.abs(weight).max()
        weight_normed = weight / max_abs

        weight_normed_expanded = weight_normed.unsqueeze(-1)

        # The lookup table is already a tensor; reshape it directly (as quantize_block does) instead of
        # copy-constructing it with torch.tensor(), which copies and emits a UserWarning.
        L_reshaped = self.norm_lookup_table.reshape(1, -1)

        # Distance of every normalized value to every table entry
        abs_diff = torch.abs(weight_normed_expanded - L_reshaped)

        # Index of the closest table entry for each element
        qweight = torch.argmin(abs_diff, dim=-1)
        return qweight, max_abs

    def dequantize_tensor(self, qweight, max_abs):
        """Inverse of `quantize_tensor`: look up the table entries and multiply by the scale."""
        qweight_flatten = qweight.flatten()

        weight_normed = self.norm_lookup_table[qweight_flatten]
        weight = weight_normed * max_abs

        return weight.reshape(qweight.shape)

    def quantize_block(self, weight):
        """Quantize a 2D `weight` block-wise and pack the indices into `uint8`.

        Returns:
            Tuple `(qweight_pack, weight_max, weight.shape)`: the packed indices with shape
            `(M * N // 8 * num_bits, 1)`, the per-block scales with shape `(M * N // block_size, 1)`, and the
            original shape.

        Raises:
            ValueError: If `weight` is not 2D or its number of elements is not a multiple of `block_size`.
            NotImplementedError: If `self.method` is neither "normal" nor "uniform".
        """
        if len(weight.shape) != 2:
            raise ValueError(f"Only support 2D matrix, but your input has {len(weight.shape)} dimensions.")
        if weight.shape[0] * weight.shape[1] % self.block_size != 0:
            raise ValueError(
                f"Weight with shape ({weight.shape[0]} x {weight.shape[1]}) "
                f"is not dividable by block size {self.block_size}."
            )

        M, N = weight.shape
        device = weight.device

        # Quantization
        weight_flatten = weight.flatten()  # (M*N, )
        weight_block = weight_flatten.reshape(-1, self.block_size)  # (L, B), L = M * N / B
        if self.method == "normal":
            weight_max = weight_block.abs().max(dim=-1)[0]  # (L, )
        elif self.method == "uniform":
            weight_max = weight_block.mean(dim=-1) + 2.5 * weight_block.std(dim=-1)
        else:
            raise NotImplementedError("Method not supported yet.")
        weight_max = weight_max.unsqueeze(-1)  # (L, 1)
        weight_divabs = weight_block / weight_max  # (L, B)
        weight_divabs = weight_divabs.unsqueeze(-1)  # (L, B, 1)
        L_reshaped = self.norm_lookup_table.reshape(1, -1)  # (1, 2**K)

        abs_diff = torch.abs(weight_divabs - L_reshaped)  # (L, B, 2**K)
        qweight = torch.argmin(abs_diff, dim=-1)  # (L, B)

        # Pack multiple k-bit into uint8
        values_per_byte = 8 // self.num_bits
        qweight = qweight.reshape(-1, values_per_byte)
        qweight_pack = torch.zeros((M * N // 8 * self.num_bits, 1), dtype=torch.uint8, device=device)

        # data format example:
        # [1, 0, 3, 2] or [01, 00, 11, 10] -> [10110001], LIFO
        for i in range(values_per_byte):
            qweight[:, i] = qweight[:, i] << (i * self.num_bits)
            qweight_pack[:, 0] |= qweight[:, i]

        return qweight_pack, weight_max, weight.shape

    def dequantize_block(self, qweight, weight_max, weight_shape):
        """Inverse of `quantize_block`: unpack the indices, look them up and rescale each block.

        Returns:
            A float32 tensor of shape `weight_shape`.
        """
        # unpack weight
        device = qweight.device
        values_per_byte = 8 // self.num_bits
        weight = torch.zeros((qweight.shape[0], values_per_byte), dtype=torch.float32, device=device)
        for i in range(values_per_byte):
            # the lowest `num_bits` bits hold the next index
            lookup_table_idx = qweight.to(torch.long) % 2**self.num_bits
            weight[:, i] = self.norm_lookup_table[lookup_table_idx].squeeze()
            # shift the consumed bits out
            qweight = qweight >> self.num_bits

        weight_block = weight.reshape(-1, self.block_size)
        weight = weight_block * weight_max
        return weight.reshape(weight_shape)
""" + :param weight: The matrix to decompose, of shape (H, W) :param reduced_rank: the final rank :return: + """ + matrix_dimension = len(weight.size()) + if matrix_dimension != 2: + raise ValueError(f"Only support 2D matrix, but your input has {matrix_dimension} dimensions.") + + # Use SVD to decompose a matrix, default full_matrices is False to save parameters + U, S, Vh = torch.linalg.svd(weight, full_matrices=False) + + L = U @ (torch.sqrt(torch.diag(S)[:, 0:reduced_rank])) + R = torch.sqrt(torch.diag(S)[0:reduced_rank, :]) @ Vh + + return {"L": L, "R": R, "U": U, "S": S, "Vh": Vh, "reduced_rank": reduced_rank} + + +@torch.no_grad() +def loftq_init(weight: Union[torch.Tensor, torch.nn.Parameter], num_bits: int, reduced_rank: int, num_iter=1): + if is_bnb_available(): + import bitsandbytes as bnb + else: + raise ValueError("bitsandbytes is not available, please install it to use LoftQ.") + + if num_bits not in [2, 4, 8]: + raise ValueError("Only support 2, 4, 8 bits quantization") + if num_iter <= 0: + raise ValueError("Number of iterations must be greater than 0") + + out_feature, in_feature = weight.size() + device = weight.device + dtype = weight.dtype + logging.info( + f"Weight: ({out_feature}, {in_feature}) | Rank: {reduced_rank} | Num Iter: {num_iter} | Num Bits: {num_bits}" + ) + if not is_bnb_4bit_available() or num_bits in [2, 8]: + quantizer = NFQuantizer(num_bits=num_bits, device=device, method="normal", block_size=64) + compute_device = device + else: + compute_device = "xpu" if is_xpu_available() else "cuda" + + weight = weight.to(device=compute_device, dtype=torch.float32) + res = weight.clone() + for i in range(num_iter): + clear_device_cache() + # Quantization + if num_bits == 4 and is_bnb_4bit_available(): + qweight = bnb.nn.Params4bit( + res.to("cpu"), requires_grad=False, compress_statistics=False, quant_type="nf4" + ).to(compute_device) + dequantized_weight = bnb.functional.dequantize_4bit(qweight.data, qweight.quant_state) + else: + 
quantized_weight, max_abs, shape = quantizer.quantize_block(res) + dequantized_weight = quantizer.dequantize_block(quantized_weight, max_abs, shape) + + res = weight - dequantized_weight + + # Decompose the residual by SVD + output = _low_rank_decomposition(res, reduced_rank=reduced_rank) + L, R, reduced_rank = output["L"], output["R"], output["reduced_rank"] + res = weight - torch.mm(L, R) + + lora_A, lora_B = R, L + + return dequantized_weight.to(device=device, dtype=dtype), lora_A, lora_B + + +@torch.no_grad() +def _loftq_init_new(qweight, weight, num_bits: int, reduced_rank: int): + import bitsandbytes as bnb + + if num_bits != 4: + raise ValueError("Only 4 bit quantization supported at the moment.") + if not is_bnb_4bit_available(): + raise ValueError("bitsandbytes 4bit quantization is not available.") + + compute_device = "xpu" if is_xpu_available() else "cuda" + dequantized_weight = bnb.functional.dequantize_4bit(qweight.data, qweight.quant_state) + + weight = weight.to(device=compute_device, dtype=torch.float32) + residual = weight - dequantized_weight + clear_device_cache() + # Decompose the residualidual by SVD + output = _low_rank_decomposition(residual, reduced_rank=reduced_rank) + L, R, reduced_rank = output["L"], output["R"], output["reduced_rank"] + return R, L + + +class _SafetensorLoader: + """ + Simple utility class that loads tensors with safetensors from a single file or sharded files. + + Takes care of file name normalization etc. + + """ + + def __init__(self, peft_model, model_path): + if model_path is None: + try: + model_path = snapshot_download(peft_model.base_model.config._name_or_path, local_files_only=True) + except (AttributeError, HFValidationError) as exc: + raise ValueError( + "The provided model does not appear to be a transformers model or is a local model. In this case, " + "you must pass the model_path argument that points to the safetensors file." 
+ ) from exc + except LocalEntryNotFoundError as exc: + raise ValueError( + "The model.safetensors file must be present on disk, but it could not be found." + ) from exc + + suffix = "model.safetensors" + if not model_path.endswith(suffix): + model_path = os.path.join(model_path, suffix) + + self.model_path = model_path + self.base_model_prefix = getattr(peft_model.get_base_model(), "base_model_prefix", None) + self.prefix = "base_model.model." + self.is_sharded = False + self.weight_map = None + + if not os.path.exists(model_path): + # check if the file is sharded + par_dir = model_path.rpartition(os.path.sep)[0] + try: + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + par_dir, cached_file(par_dir, "model.safetensors.index.json") + ) + except OSError as exc: + raise FileNotFoundError( + f"Could not find file for {model_path}, ensure that there is a (sharded) safetensors file of the model." + ) from exc + + self.is_sharded = True + # maps from 'model-X-of-Y.safetensors' to full file path + file_map = {k.rpartition(os.path.sep)[-1]: k for k in resolved_archive_file} + self.weight_map = {k: file_map[v] for k, v in sharded_metadata["weight_map"].items()} + + def get_tensor(self, name): + if not self.is_sharded: + file_path = self.model_path + else: + file_path = self.weight_map[name] + + with safe_open(file_path, framework="pt", device="cpu") as f: + try: + tensor = f.get_tensor(name) + except SafetensorError as exc: + # no matching key found, we probably need to remove the base model prefix + if self.base_model_prefix: + # remove 1 extra character for "." 
+ name = name[len(self.base_model_prefix) + 1 :] + tensor = f.get_tensor(name) + else: + raise exc + return tensor + + +@torch.no_grad() +def replace_lora_weights_loftq( + peft_model, + model_path: Optional[str] = None, + adapter_name: str = "default", + callback: Optional[Callable[[torch.nn.Module, str], bool]] = None, +): + """ + Replace the LoRA weights of a model quantized with bitsandbytes, using the LoftQ technique. + + The replacement is done on the fly by loading in the non-quantized weights from a locally stored safetensors model + file and initializing the LoRA weights such that the quantization error between the original and quantized weights + is minimized. + + As lazy loading is not possible with pickle, normal PyTorch checkpoint files cannot be supported. + + Depending on the model size, calling this function may take some time to finish. + + Args: + peft_model (`PeftModel`): + The model to replace the weights of. Must be a quantized PEFT model with LoRA layers. + model_path (`Optional[str]`): + The path to the model safetensors file. If the model is a Hugging Face model, this will be inferred from + the model's config. Otherwise, it must be provided. + adapter_name (`str`): + The name of the adapter to replace the weights of. The default adapter name is "default". + callback (`Optional[Callable[[PeftModel, str], bool]]`): + A callback function that will be called after each module is replaced. The callback function should take + the model and the name of the current module as input and return a boolean indicating whether the + replacement should be kept. If the callback returns False, the replacement will be rolled back. This can be + very useful to confirm that the LoftQ initialization actually decreases the quantization error of the + model. 
As an example, this callback could generate logits for given input and compare it with the logits + from the original, non-quanitzed model with the same input, and only return `True` if there is an + improvement. As this is a greedy optimization, it's possible that calling this function multiple times + yields incremental improvements. + """ + if not is_bnb_4bit_available(): + raise ValueError("bitsandbytes must be installed and the model must be quantized in 4bits.") + + from peft.tuners.lora import Linear4bit + + # model_path = _check_model_path_loftq(model_path, peft_model) + prefix = "base_model.model." + any_match = False + safetensor_loader = _SafetensorLoader(peft_model, model_path) + + # if too slow, consider adding tqdm as an option + for name, module in peft_model.named_modules(): + if not isinstance(module, Linear4bit): + continue + + if not name.startswith(prefix): + raise TypeError("The passed model does not appear to be a valid PeftModel") + + any_match = True + name = name[len(prefix) :] + tensor = safetensor_loader.get_tensor(name + ".weight") + + reduced_rank = module.r[adapter_name] + lora_A, lora_B = _loftq_init_new(module.weight, tensor, num_bits=4, reduced_rank=reduced_rank) + if not callback: + module.lora_A[adapter_name].weight.data = lora_A + module.lora_B[adapter_name].weight.data = lora_B + continue + + lora_A_before = module.lora_A[adapter_name].weight.data + lora_B_before = module.lora_B[adapter_name].weight.data + + module.lora_A[adapter_name].weight.data = lora_A + module.lora_B[adapter_name].weight.data = lora_B + should_replace = callback(peft_model, name) + if not should_replace: + # roll back + module.lora_A[adapter_name].weight.data = lora_A_before + module.lora_B[adapter_name].weight.data = lora_B_before + + del lora_A_before, lora_B_before + + if not any_match: + raise ValueError("No bnb LoRA module found on the model") diff --git a/peft/src/peft/utils/merge_utils.py b/peft/src/peft/utils/merge_utils.py new file mode 100644 
def reshape_weight_task_tensors(task_tensors, weights):
    """
    Append singleton dimensions to `weights` until it has as many dimensions as `task_tensors`.

    Args:
        task_tensors (`torch.Tensor`): The tensor whose number of dimensions is the target.
        weights (`torch.Tensor`): The tensor to be reshaped.

    Returns:
        `torch.Tensor`: A view of `weights` with trailing dimensions of size 1 appended.
    """
    missing_dims = task_tensors.dim() - weights.dim()
    trailing_ones = (1,) * missing_dims
    return weights.view(weights.shape + trailing_ones)
# NOTE(review): this chunk opens in the middle of `magnitude_based_pruning`; its `def` line and the start of
# its docstring precede the visible lines. The remaining statements of that function are carried over
# verbatim below (as comments, because they cannot form a definition on their own):
#     """
#     mask = torch.zeros_like(tensor).reshape(-1)
#     k = int(density * tensor.numel())
#     top_k = torch.topk(tensor.abs().reshape(-1), k=k, largest=True)
#     mask[top_k[1]] = 1
#     return tensor * mask.reshape(tensor.shape)


def random_pruning(tensor: torch.Tensor, density: float, rescale: bool) -> torch.Tensor:
    """
    Prune random values based on the specified fraction `density`.

    Every entry is kept independently with probability `density` (Bernoulli mask) and zeroed otherwise.

    Args:
        tensor (`torch.Tensor`):The tensor to prune.
        density (`float`):The fraction of values to preserve. Should be in [0,1].
        rescale (`bool`):Whether to rescale the result to preserve the expected value of the original tensor.

    Returns:
        `torch.Tensor`: The pruned tensor.
    """
    mask = torch.bernoulli(torch.full_like(input=tensor, fill_value=density))
    pruned_tensor = tensor * mask
    # `torch.div` returns a new tensor, so the quotient has to be assigned; calling it without using the
    # result leaves `pruned_tensor` unscaled. With `density == 0` everything is already zero and dividing
    # would only turn those zeros into NaN (0 / 0), so the division is skipped in that case.
    if rescale and density > 0:
        pruned_tensor = torch.div(input=pruned_tensor, other=density)
    return pruned_tensor


def prune(
    tensor: torch.Tensor, density: float, method: Literal["magnitude", "random"], rescale: bool = False
) -> torch.Tensor:
    """
    Prune the values of task tensors based on the `method`.

    Args:
        tensor (`torch.Tensor`):The tensor to prune.
        density (`float`):The fraction of values to preserve. Should be in [0,1].
        method (`str`):The method to use to prune. Should be one of ["magnitude", "random"].
        rescale (`bool`):Whether to rescale the result to preserve the expected value of the original tensor.
            Only forwarded to the "random" method.

    Returns:
        `torch.Tensor`: The pruned tensor. The input is returned unchanged (with a warning) if `density >= 1`.

    Raises:
        ValueError: If `density` is negative or `method` is not one of the supported values.
    """
    if density >= 1:
        warnings.warn(f"The density {density} is greater than or equal to 1, no pruning will be performed.")
        return tensor
    elif density < 0:
        raise ValueError(f"Density should be >= 0, got {density}")
    if method == "magnitude":
        return magnitude_based_pruning(tensor, density)
    elif method == "random":
        return random_pruning(tensor, density, rescale=rescale)
    else:
        raise ValueError(f"Unknown method {method}")


def calculate_majority_sign_mask(
    tensor: torch.Tensor, method: Literal["total", "frequency"] = "total"
) -> torch.Tensor:
    """
    Get the mask of the majority sign across the task tensors. Task tensors are stacked on dimension 0.

    Args:
        tensor (`torch.Tensor`):The tensor to get the mask from.
        method (`str`):The method to use to get the mask. Should be one of ["total", "frequency"]. "total" elects
            the sign of the summed values, "frequency" the sign of the summed signs.

    Returns:
        `torch.Tensor`: The majority sign mask (boolean, same shape as `tensor`).

    Raises:
        RuntimeError: If `method` is not one of the supported values.
    """

    sign = tensor.sign()
    if method == "total":
        sign_magnitude = tensor.sum(dim=0)
    elif method == "frequency":
        sign_magnitude = sign.sum(dim=0)
    else:
        raise RuntimeError(f'Unimplemented mask method "{method}"')
    # a tie (sum == 0) is resolved in favour of the positive sign
    majority_sign = torch.where(sign_magnitude >= 0, 1, -1)
    return sign == majority_sign


def disjoint_merge(task_tensors: torch.Tensor, majority_sign_mask: torch.Tensor) -> torch.Tensor:
    """
    Merge the task tensors using disjoint merge.

    Args:
        task_tensors (`torch.Tensor`):The task tensors to merge.
        majority_sign_mask (`torch.Tensor`):The mask of the majority sign across the task tensors.

    Returns:
        `torch.Tensor`: The merged tensor.
    """
    mixed_task_tensors = (task_tensors * majority_sign_mask).sum(dim=0)
    num_params_preserved = majority_sign_mask.sum(dim=0)
    # clamp so that positions where no task agrees with the elected sign divide by 1 instead of 0
    return mixed_task_tensors / torch.clamp(num_params_preserved, min=1.0)


def task_arithmetic(task_tensors: list[torch.Tensor], weights: torch.Tensor) -> torch.Tensor:
    """
    Merge the task tensors using `task arithmetic`.

    Args:
        task_tensors(`List[torch.Tensor]`):The task tensors to merge.
        weights (`torch.Tensor`):The weights of the task tensors.

    Returns:
        `torch.Tensor`: The merged tensor.
    """
    task_tensors = torch.stack(task_tensors, dim=0)
    # weighted task tensors
    weights = reshape_weight_task_tensors(task_tensors, weights)
    weighted_task_tensors = task_tensors * weights
    mixed_task_tensors = weighted_task_tensors.sum(dim=0)
    return mixed_task_tensors


def magnitude_prune(task_tensors: list[torch.Tensor], weights: torch.Tensor, density: float) -> torch.Tensor:
    """
    Merge the task tensors by magnitude-pruning each of them and then taking their weighted sum.

    Args:
        task_tensors(`List[torch.Tensor]`):The task tensors to merge.
        weights (`torch.Tensor`):The weights of the task tensors.
        density (`float`): The fraction of values to preserve. Should be in [0,1].

    Returns:
        `torch.Tensor`: The merged tensor.
    """
    # sparsify
    task_tensors = [prune(tensor, density, method="magnitude") for tensor in task_tensors]
    task_tensors = torch.stack(task_tensors, dim=0)
    # weighted task tensors
    weights = reshape_weight_task_tensors(task_tensors, weights)
    weighted_task_tensors = task_tensors * weights
    mixed_task_tensors = weighted_task_tensors.sum(dim=0)
    return mixed_task_tensors


def ties(
    task_tensors: list[torch.Tensor],
    weights: torch.Tensor,
    density: float,
    majority_sign_method: Literal["total", "frequency"] = "total",
) -> torch.Tensor:
    """
    Merge the task tensors using `ties`.

    Args:
        task_tensors(`List[torch.Tensor]`):The task tensors to merge.
        weights (`torch.Tensor`):The weights of the task tensors.
        density (`float`):The fraction of values to preserve. Should be in [0,1].
        majority_sign_method (`str`):
            The method to use to get the majority sign mask. Should be one of ["total", "frequency"].

    Returns:
        `torch.Tensor`: The merged tensor.
    """
    # sparsify
    task_tensors = [prune(tensor, density, method="magnitude") for tensor in task_tensors]
    task_tensors = torch.stack(task_tensors, dim=0)
    # Elect Sign
    majority_sign_mask = calculate_majority_sign_mask(task_tensors, method=majority_sign_method)
    # weighted task tensors
    weights = reshape_weight_task_tensors(task_tensors, weights)
    weighted_task_tensors = task_tensors * weights
    # Disjoint Merge
    mixed_task_tensors = disjoint_merge(weighted_task_tensors, majority_sign_mask)
    return mixed_task_tensors


def dare_linear(task_tensors: list[torch.Tensor], weights: torch.Tensor, density: float) -> torch.Tensor:
    """
    Merge the task tensors using `dare linear`.

    Args:
        task_tensors(`List[torch.Tensor]`):The task tensors to merge.
        weights (`torch.Tensor`):The weights of the task tensors.
        density (`float`):The fraction of values to preserve. Should be in [0,1].

    Returns:
        `torch.Tensor`: The merged tensor.
    """
    # sparsify
    task_tensors = [prune(tensor, density, method="random", rescale=True) for tensor in task_tensors]
    task_tensors = torch.stack(task_tensors, dim=0)
    # weighted task tensors
    weights = reshape_weight_task_tensors(task_tensors, weights)
    weighted_task_tensors = task_tensors * weights
    mixed_task_tensors = weighted_task_tensors.sum(dim=0)
    return mixed_task_tensors


def dare_ties(
    task_tensors: list[torch.Tensor],
    weights: torch.Tensor,
    density: float,
    majority_sign_method: Literal["total", "frequency"] = "total",
) -> torch.Tensor:
    """
    Merge the task tensors using `dare ties`.

    Args:
        task_tensors(`List[torch.Tensor]`):The task tensors to merge.
        weights (`torch.Tensor`):The weights of the task tensors.
        density (`float`):The fraction of values to preserve. Should be in [0,1].
        majority_sign_method (`str`):
            The method to use to get the majority sign mask. Should be one of ["total", "frequency"].

    Returns:
        `torch.Tensor`: The merged tensor.
    """
    # sparsify
    task_tensors = [prune(tensor, density, method="random", rescale=True) for tensor in task_tensors]
    task_tensors = torch.stack(task_tensors, dim=0)
    # Elect Sign
    majority_sign_mask = calculate_majority_sign_mask(task_tensors, method=majority_sign_method)
    # weighted task tensors
    weights = reshape_weight_task_tensors(task_tensors, weights)
    weighted_task_tensors = task_tensors * weights
    # Disjoint Merge
    mixed_task_tensors = disjoint_merge(weighted_task_tensors, majority_sign_mask)
    return mixed_task_tensors


# NOTE(review): the patch continues here with a new file; its patch header and licence banner are carried
# over verbatim below.
# diff --git a/peft/src/peft/utils/other.py b/peft/src/peft/utils/other.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..6f8437152efdabc642099277d6d969062e058c64
# --- /dev/null
# +++ b/peft/src/peft/utils/other.py
# @@ -0,0 +1,1532 @@
# Copyright 2023-present the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from __future__ import annotations + +import copy +import functools +import inspect +import os +import re +import warnings +from collections.abc import Sequence +from contextlib import nullcontext +from operator import attrgetter +from typing import Any, Optional, Union + +import accelerate +import torch +import transformers +from accelerate import FullyShardedDataParallelPlugin +from accelerate.hooks import add_hook_to_module, remove_hook_from_module +from accelerate.utils import is_npu_available, is_xpu_available +from huggingface_hub import file_exists +from huggingface_hub.errors import EntryNotFoundError, HFValidationError +from packaging import version +from safetensors.torch import storage_ptr, storage_size +from transformers import PreTrainedModel + +from ..import_utils import is_auto_gptq_available, is_gptqmodel_available, is_torch_tpu_available +from .constants import ( + CONFIG_NAME, + EMBEDDING_LAYER_NAMES, + INCLUDE_LINEAR_LAYERS_SHORTHAND, + SAFETENSORS_WEIGHTS_NAME, + TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_BOFT_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_BONE_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_LOHA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_LOKR_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_MISS_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_OFT_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_POLY_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, + TRANSFORMERS_MODELS_TO_RANDLORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_ROAD_TARGET_MODULES_MAPPING, + 
TRANSFORMERS_MODELS_TO_SHIRA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_VBLORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_WAVEFT_TARGET_MODULES_MAPPING, + WEIGHTS_NAME, + bloom_model_postprocess_past_key_value, + starcoder_model_postprocess_past_key_value, +) + + +mlu_available = False +if version.parse(accelerate.__version__) >= version.parse("0.29.0"): + from accelerate.utils import is_mlu_available + + mlu_available = is_mlu_available() + + +__all__ = [ + "CONFIG_NAME", + "EMBEDDING_LAYER_NAMES", + "INCLUDE_LINEAR_LAYERS_SHORTHAND", + "SAFETENSORS_WEIGHTS_NAME", + "TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_BOFT_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_BONE_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_C3A_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_FOURIERFT_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_HRA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_LNTUNING_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_LOHA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_LOKR_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_MISS_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_OFT_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_POLY_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING", + "TRANSFORMERS_MODELS_TO_RANDLORA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_ROAD_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_SHIRA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_VBLORA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_VERA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_WAVEFT_TARGET_MODULES_MAPPING", + "WEIGHTS_NAME", + "bloom_model_postprocess_past_key_value", + 
"starcoder_model_postprocess_past_key_value", +] + + +# Get current device name based on available devices +def infer_device() -> str: + if torch.cuda.is_available(): + return "cuda" + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + return "mps" + elif mlu_available: + return "mlu" + elif is_xpu_available(): + return "xpu" + elif is_npu_available(): + return "npu" + return "cpu" + + +def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True, gradient_checkpointing_kwargs=None): + r""" + Note this method only works for `transformers` models. + + This method wraps the entire protocol for preparing a model before running a training. This includes: + 1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm + head to fp32 4- Freezing the base model layers to ensure they are not updated during training + + + Args: + model (`transformers.PreTrainedModel`): + The loaded model from `transformers` + use_gradient_checkpointing (`bool`, *optional*, defaults to `True`): + If True, use gradient checkpointing to save memory at the expense of slower backward pass. + gradient_checkpointing_kwargs (`dict`, *optional*, defaults to `None`): + Keyword arguments to pass to the gradient checkpointing function, please refer to the documentation of + `torch.utils.checkpoint.checkpoint` for more details about the arguments that you can pass to that method. + Note this is only available in the latest transformers versions (> 4.34.1). 
+ """ + loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False) + is_gptq_quantized = getattr(model, "quantization_method", None) == "gptq" + is_aqlm_quantized = getattr(model, "quantization_method", None) == "aqlm" + is_eetq_quantized = getattr(model, "quantization_method", None) == "eetq" + is_torchao_quantized = getattr(model, "quantization_method", None) == "torchao" + is_hqq_quantized = getattr(model, "quantization_method", None) == "hqq" or getattr(model, "hqq_quantized", False) + + if gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {} + + for name, param in model.named_parameters(): + # freeze base model's layers + param.requires_grad = False + + if ( + not is_gptq_quantized + and not is_aqlm_quantized + and not is_eetq_quantized + and not is_hqq_quantized + and not is_torchao_quantized + ): + # cast all non INT8 parameters to fp32 + for param in model.parameters(): + if ( + (param.dtype == torch.float16) or (param.dtype == torch.bfloat16) + ) and param.__class__.__name__ != "Params4bit": + param.data = param.data.to(torch.float32) + + if ( + loaded_in_kbit + or is_gptq_quantized + or is_aqlm_quantized + or is_eetq_quantized + or is_hqq_quantized + or is_torchao_quantized + ) and use_gradient_checkpointing: + # When having `use_reentrant=False` + gradient_checkpointing, there is no need for this hack + if "use_reentrant" not in gradient_checkpointing_kwargs or gradient_checkpointing_kwargs["use_reentrant"]: + # For backward compatibility + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + # To support older transformers versions, check if the model supports gradient_checkpointing_kwargs + _supports_gc_kwargs = "gradient_checkpointing_kwargs" in list( + 
inspect.signature(model.gradient_checkpointing_enable).parameters + ) + + if not _supports_gc_kwargs and len(gradient_checkpointing_kwargs) > 0: + warnings.warn( + "gradient_checkpointing_kwargs is not supported in this version of transformers. The passed kwargs will be ignored." + " if you want to use that feature, please upgrade to the latest version of transformers.", + FutureWarning, + ) + + gc_enable_kwargs = ( + {} if not _supports_gc_kwargs else {"gradient_checkpointing_kwargs": gradient_checkpointing_kwargs} + ) + + # enable gradient checkpointing for memory efficiency + model.gradient_checkpointing_enable(**gc_enable_kwargs) + return model + + +# copied from transformers.models.bart.modeling_bart +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): input ids + pad_token_id (`int`): The id of the `padding` token. + decoder_start_token_id (`int`): The id of the `start` token. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +class AuxiliaryTrainingWrapper(torch.nn.Module): + """Wrap a specific module so that it can be trained and saved in a way that is tangential to how + PEFT normally works, e.g. fully training a classification layer instead of using an adapter. + + """ + + # All names of layers that may contain adapter (trainable) weights + adapter_layer_names: tuple[str, ...] = () + # All names of other parameters that may contain adapter-related parameters + other_param_names: tuple[str, ...] 
= () + # List all merged adapters + merged_adapters: list[str] = [] + + def __init__(self, module_to_save, adapter_name, **kwargs): + """Extra kwargs will be passed to `self.init_modules` and `self.update`.""" + super().__init__() + self.original_module = module_to_save + self._active_adapter = [adapter_name] + self._disable_adapters = False + self._adapters = set() + + self.init_modules(adapter_name, **kwargs) + + self.update(adapter_name, **kwargs) + self.check_module() + + def init_modules(self, adapter_name, **kwargs): + """A place to initialize PyTorch modules in `__init__` before the call to `self.update()`.""" + raise NotImplementedError + + def _get_available_adapters(self) -> set[str]: + """Return all adapter names that can be found on this module.""" + raise NotImplementedError + + def _error_message_name(self): + """Returns a user friendly identifier for error messages, e.g. for type compatibility error messages from + `check_module()` so that the user can backtrack where the error comes from. A generic "training wrapper" is + less helpful than "modules_to_save", for example. + """ + return "training wrapper" + + def check_module(self): + """Perform some sanity checks on the module to ensure that it works""" + # Try to anticipate some modules that users could try to target that would not work. + # Note: It's not possible to check hasattr(module, "forward"), since that returns True for ModuleDict and + # ModuleList, even though their forward methods cannot be called + forbidden_classes = (torch.nn.ModuleDict, torch.nn.ModuleList, torch.nn.ParameterDict, torch.nn.ParameterList) + if isinstance(self.original_module, forbidden_classes): + cls_name = self.original_module.__class__ + raise TypeError(f"{self._error_message_name()} cannot be applied to modules of type {cls_name}") + + # local import to avoid circular import + from peft.tuners.tuners_utils import BaseTunerLayer + + if isinstance(self.original_module, BaseTunerLayer): + # e.g. 
applying a training wrapper to a lora layer makes no sense + cls_name = self.original_module.__class__ + raise TypeError(f"{self._error_message_name()} cannot be applied to modules of type {cls_name}") + + @property + def disable_adapters(self) -> bool: + # use a property to ensure that disable_adapters is not set directly, instead use the enable_adapters method + return self._disable_adapters + + @property + def active_adapter(self) -> Union[list[str], str]: + # use a property to ensure that active_adapter is not set directly, instead use the set_adapter method + return self._active_adapter + + @property + def active_adapters(self) -> list[str]: + if isinstance(self._active_adapter, str): + return [self._active_adapter] + return self._active_adapter + + def _hasattr_wrapped(self, name, modules): + """Infrastructure to enable the implementing class to delegate attributes to other modules. + Returns True if the implementing class knows how to handle attribute `name`. + + Gets passed `modules` which is PyTorch's internal list of assigned modules from `nn.Module`. + """ + return False + + def _getattr_wrapped(self, name, modules): + """If `_hasattr_wrapped` returns True for `name`, then this function should return the corresponding + value associated with `name`. + """ + return None + + def __getattr__(self, name: str): + # Note: This whole method may seem overly complex at first but PyTorch messes with __getattr__ in a way that + # requires very careful handling to avoid infinite recursion. + try: + return super().__getattr__(name) + except AttributeError: + pass + + if "_modules" not in self.__dict__: + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") + + # Could not find the attribute the PyTorch way. So let's check if it's an attribute on the + # original_module or the module further down (e.g., `modules_to_save[active_adapter]`). 
+ modules = self.__dict__["_modules"] + if self.disable_adapters: + return getattr(self.original_module, name) + elif self._hasattr_wrapped(name, modules): + return self._getattr_wrapped(name, modules) + + # For some reason, there is no module corresponding to the active adapter; this should normally not be + # reached and exists as a failsafe (otherwise, a KeyError would be raised) + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") + + def update(self, adapter_name, **kwargs): + """Called when this instance should be part of an adapter's training. + Adds the given adapter to the list of adapters that this instance is training along with. + + Additional kwargs are expected to be the same kwargs that are also passed for initializing this class. + """ + if adapter_name not in self._adapters: + self._adapters.add(adapter_name) + + def _create_new_hook(self, old_hook): + r""" + Creates a new hook based on the old hook. Use it only if you know what you are doing ! + """ + old_hook_cls = getattr(accelerate.hooks, old_hook.__class__.__name__) + old_hook_attr = old_hook.__dict__ + filtered_old_hook_attr = {} + old_hook_init_signature = inspect.signature(old_hook_cls.__init__) + for k in old_hook_attr.keys(): + if k in old_hook_init_signature.parameters: + filtered_old_hook_attr[k] = old_hook_attr[k] + new_hook = old_hook_cls(**filtered_old_hook_attr) + return new_hook + + def _check_forward_args(self, x, *args, **kwargs): + """Check if the arguments are compatible with the configs and state of the model""" + adapter_names = kwargs.get("adapter_names", None) + if adapter_names is None: + return + + if len(x) != len(adapter_names): + msg = ( + "Length of `adapter_names` should be the same as the number of inputs, but got " + f"{len(adapter_names)} and {len(x)} respectively." 
+ ) + raise ValueError(msg) + + def _forward_wrapped(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + raise NotImplementedError + + def _forward_wrapped_mixed_batch( + self, x: torch.Tensor, active_adapter: str, *args: Any, **kwargs: Any + ) -> torch.Tensor: + raise NotImplementedError + + def _forward_wrapped_passthrough(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + """The forward call when no adapter is involved in the forward computation, only the base model""" + raise NotImplementedError + + def _mixed_batch_forward( + self, input: torch.Tensor, *args: Any, adapter_names: list[str], **kwargs: Any + ) -> torch.Tensor: + # This is a special method that handles the case when users pass the argument `adapter_names`. This is an + # extra argument that allows mixing different adapters in the same batch at inference time. + + SUPPORTED_MODULES = (torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d) + + module_names = ", ".join([module.__name__ for module in SUPPORTED_MODULES]) + + if not isinstance(self.original_module, SUPPORTED_MODULES): + raise TypeError(f"Mixed batching is only supported for the following modules: {module_names}.") + + unique_adapters = set(adapter_names) + sub_batch_indices_list = [] + + for adapter in unique_adapters: + sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) + + results = [0 for _ in range(len(input))] + + for i, active_adapter in enumerate(unique_adapters): + sub_batch = input[sub_batch_indices_list[i]] + + if active_adapter == "__base__": + output = self.original_module(sub_batch, *args, **kwargs) + else: + output = self._forward_wrapped_mixed_batch(sub_batch, active_adapter, *args, **kwargs) + + for index, j in enumerate(sub_batch_indices_list[i]): + results[j] = output[index] + + return torch.stack(results) + + def forward(self, x: torch.Tensor, *args, **kwargs): + self._check_forward_args(x, *args, 
**kwargs) + adapter_names = kwargs.pop("adapter_names", None) + + if self.disable_adapters or any(adapter not in self._adapters for adapter in self.active_adapters): + return self._forward_wrapped_passthrough(x, *args, **kwargs) + + if adapter_names is None: + return self._forward_wrapped(x, *args, **kwargs) + return self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs) + + def enable_adapters(self, enabled: bool): + """Toggle the enabling and disabling of adapters + + Args: + enabled (bool): True to enable adapters, False to disable adapters + """ + if enabled: + self._disable_adapters = False + else: + self._disable_adapters = True + + def check_set_adapter(self, adapter_name: str | list[str]) -> str | None: + """Helper function to check if the given adapter(s) can be set. + + Return the name of the adapter to be set or None if no adapter should be set. + """ + raise NotImplementedError + + def set_adapter(self, adapter_names: Union[str, list[str]], inference_mode: bool = False) -> None: + """Set the active adapter + + Args: + adapter_names (str or list[str]): + The name(s) of the adapter(s) to set as active + inference_mode (bool, optional): + Whether the activated adapter should be frozen (i.e. `requires_grad=False`). Default is False. + """ + if isinstance(adapter_names, str): + self._active_adapter = adapter_names + else: + self._active_adapter = [] + for adapter_name in adapter_names: + if adapter_name not in self._adapters: + raise ValueError(f"Adapter {adapter_name} not found in {self._adapters}") + + self._active_adapter.append(adapter_name) + + def delete_adapter(self, adapter_name: str, new_active_adapters: Optional[list[str]]) -> None: + """Delete an adapter from the layer, set a new active adapter if necessary""" + raise NotImplementedError + + def set_requires_grad(self, adapter_names: str | Sequence[str], requires_grad: bool = True) -> None: + """ + Enable or disable gradients on the given adapter(s). 
+ + Args: + adapter_name (`str` or `Sequence[str]`): + The name of the adapter(s) whose gradients should be enabled/disabled. + requires_grad (`bool`, *optional*) + Whether to enable (`True`, default) or disable (`False`). + """ + if isinstance(adapter_names, str): + adapter_names_set = {adapter_names} + else: + adapter_names_set = set(adapter_names) + + for layer_name in self.adapter_layer_names: + # use attrgetter, as it resolves `.` in the attribute name + module_dict = attrgetter(layer_name)(self) + for key, layer in module_dict.items(): + if key in adapter_names_set: + layer.requires_grad_(requires_grad) + + def adapter_state_dict(self, adapter_name): + """Return the state dict of this module for a given adapter.""" + raise NotImplementedError + + def adapter_state_dict_load_map(self, adapter_name): + """Return a mapping from the key present in disk-loaded state dict + and how it should be represented in the loaded model's state dict. + + The default should be a 1:1 mapping but it is important to define a mapping as it also serves as the + ground-truth for which keys are supposed to be loaded from a saved state dict. + """ + raise NotImplementedError + + def unload_and_optionally_merge_module( + self, merge: bool, safe_merge: bool, adapter_names: Optional[list[str]] + ) -> torch.nn.Module: + """Handles unloading when called from PEFT models. Returns the wrapped module + and handles merging onto the wrapped module if requested. + """ + raise NotImplementedError + + +class ModulesToSaveWrapper(AuxiliaryTrainingWrapper): + """Wraps a module that is supposed to be trained (i.e. `requires_grad_(True)`) and saved after training.""" + + # All names of layers that may contain adapter (trainable) weights + adapter_layer_names: tuple[str, ...] 
= ("modules_to_save",) + + def __init__(self, module_to_save, adapter_name): + super().__init__(module_to_save, adapter_name) + + def init_modules(self, adapter_name): + # we treat each adapter separately, so we have multiple adapters, same (copied) module for each + self.modules_to_save = torch.nn.ModuleDict({}) + + def _error_message_name(self): + return "modules_to_save" + + def _forward_wrapped(self, x, *args, **kwargs): + if not self.active_adapters: + return self._forward_wrapped_passthrough(x, *args, **kwargs) + return self.modules_to_save[self.active_adapters[0]](x, *args, **kwargs) + + def _forward_wrapped_mixed_batch(self, x, active_adapter, *args, **kwargs): + return self.modules_to_save[active_adapter](x, *args, **kwargs) + + def _forward_wrapped_passthrough(self, x, *args, **kwargs): + return self.original_module(x, *args, **kwargs) + + def _hasattr_wrapped(self, name, modules): + return self.active_adapters[0] in modules["modules_to_save"] + + def _getattr_wrapped(self, name, modules): + return getattr(modules["modules_to_save"][self.active_adapters[0]], name) + + def update(self, adapter_name, **kwargs): + super().update(adapter_name) + + context_manager = nullcontext() + for _, param in self.original_module.named_parameters(): + num_params = param.numel() + # if using DS Zero 3 and the weights are initialized empty + if num_params == 0 and hasattr(param, "ds_numel"): + import deepspeed + + context_manager = deepspeed.zero.GatheredParameters(self.original_module.parameters(), modifier_rank=0) + break + + if adapter_name not in self.modules_to_save: + with context_manager: + self.modules_to_save[adapter_name] = copy.deepcopy(self.original_module) + + if hasattr(self.modules_to_save[adapter_name], "_hf_hook"): + old_hook = self.modules_to_save[adapter_name]._hf_hook + new_hook = self._create_new_hook(old_hook) + remove_hook_from_module(self.modules_to_save[adapter_name]) + add_hook_to_module(self.modules_to_save[adapter_name], new_hook) + + 
self.original_module.requires_grad_(False) + + # note that there currently cannot be more than one active adapter for the same layer with modules to save + # since there would be no clear way to decide which adapter's weights are the correct ones. therefore we + # assume that there is only one active adapter. this precondition is enforced by _set_adapter. + if adapter_name == self.active_adapter: + self.modules_to_save[adapter_name].requires_grad_(True) + + def enable_adapters(self, enabled: bool): + """Takes care of setting the required_grad flag on the wrapped module. + If adapters are enabled, gradients for the module are required as well. + """ + super().enable_adapters(enabled) + + if enabled: + self.original_module.requires_grad_(False) + for adapter_name in self.active_adapters: + self.modules_to_save[adapter_name].requires_grad_(True) + else: + self.original_module.requires_grad_(True) + self.modules_to_save.requires_grad_(False) + + def check_set_adapter(self, adapter_name: str | list[str]) -> str | None: + """Helper function to check if the given adapter(s) can be set. + + Return the name of the adapter to be set or None if no adapter should be set. 
+ """ + if isinstance(adapter_name, str): + return adapter_name + + # adapter_name is a list of str + if len(adapter_name) == 0: + raise ValueError("Please specify at least one adapter to set") + + adapter_names_in_module = [n for n in adapter_name if n in self.modules_to_save] + + if len(adapter_names_in_module) > 1: + raise ValueError(f"Only one adapter can be set at a time for {self}, got {len(adapter_names_in_module)}") + + adapter_name_to_set: str | None + if not adapter_names_in_module: + adapter_name_to_set = None + else: + adapter_name_to_set = adapter_names_in_module[0] + + return adapter_name_to_set + + def set_adapter(self, adapter_names: Union[str, list[str]], inference_mode: bool = False) -> None: + """Set the active adapter + + Additionally, this function will set the specified adapter to trainable (i.e., requires_grad=True) unless + inference_mode is True. + + Args: + adapter_names (list[str], str): + The name(s) of the adapter(s) to set as active. + inference_mode (bool, optional): + Whether the activated adapter should be frozen (i.e. `requires_grad=False`). Default is False. 
+ """ + if isinstance(adapter_names, str): + adapter_names = [adapter_names] + + if len(adapter_names) > 1: + raise ValueError(f"Attempted to set multiple ({adapter_names}) adapters at once for modules_to_save.") + + if len(adapter_names) == 0: + # when calling model.add_adapter, the new adapter is not automatically active + self._active_adapter = [] + return + + adapter_name = adapter_names[0] + + if adapter_name not in self._adapters: + raise ValueError(f"Adapter {adapter_name} not found in {self._adapters}") + + for currently_active_adapter_name in self.active_adapters: + self.modules_to_save[currently_active_adapter_name].requires_grad_(False) + self.modules_to_save[adapter_name].requires_grad_(not inference_mode) + self._active_adapter = adapter_name + + def delete_adapter(self, adapter_name: str, new_active_adapters: Optional[list[str]]) -> None: + """ + Delete the adapter if present. + + This method will also set a new active adapter if the deleted adapter was the active adapter. It is important + that the new adapter is chosen by the caller in a deterministic way, so that the same adapter is chosen on all + layers. + """ + if adapter_name not in self.modules_to_save: + return + + # set new active adapter, if necessary + # note: there can only ever be one active adapter, unlike for LoRA etc. + if isinstance(new_active_adapters, (list, tuple)) and len(new_active_adapters) > 1: + name = self.__class__.__name__ + raise ValueError( + f"Attempted to set multiple ({new_active_adapters}) adapters at once for {name}, which is not allowed." 
+ ) + + if adapter_name in self._adapters: + self._adapters.remove(adapter_name) + + if not new_active_adapters: + # no active adapter now + del self.modules_to_save[adapter_name] + self._active_adapter = [] + return + + new_active_adapter = new_active_adapters[0] + if new_active_adapter not in self.modules_to_save: + # a new active adapter was chosen but it seems like it has no modules_to_save + del self.modules_to_save[adapter_name] + self._active_adapter = [] + return + + if new_active_adapter != self.active_adapters[0]: + self.set_adapter(new_active_adapter) + del self.modules_to_save[adapter_name] + + def adapter_state_dict_load_map(self, adapter_name): + # Maps the module keys as they are in the saved state dict to the in-memory state dict. + # Must contain all keys that are supposed to be loaded. + if adapter_name not in self._adapters: + # In case of multiple adapters, each bringing their own modules to save, each + # ModulesToSaveWrapper will be queried but not every wrapper is obliged to serve the same adapters. + return {} + return {k: f"modules_to_save.{adapter_name}.{k}" for k in self.modules_to_save[adapter_name].state_dict()} + + def adapter_state_dict(self, adapter_name, state_dict): + if adapter_name not in self._adapters: + # In case of multiple adapters, each bringing their own modules to save, each + # ModulesToSaveWrapper will be queried but not every wrapper is obliged to serve the same adapters. + return {} + + return { + k: state_dict[f"modules_to_save.{adapter_name}.{k}"] + for k in self.modules_to_save[adapter_name].state_dict() + } + + def unload_and_optionally_merge_module( + self, merge: bool, safe_merge: bool, adapter_names: Optional[list[str]] + ) -> torch.nn.Module: + """Unloading in case of `ModulesToSave` means to simply return the wrapped module. + + However, if the wrapped module is itself a tuner, we'll call merge on it before. 
+ """ + new_module = self.modules_to_save[self.active_adapter] + + # TODO: not sure if this is still a sensible thing to do. We would basically have to + # do the same checks as `_unload_and_optionally_merge` to support MHA, for example. + if hasattr(new_module, "base_layer"): + # check if the module is itself a tuner layer + if merge: + new_module.merge(safe_merge=safe_merge, adapter_names=adapter_names) + new_module = new_module.get_base_layer() + + return new_module + + def _get_available_adapters(self) -> set[str]: + """Return all adapter names that can be found on this module.""" + return set(self.modules_to_save.keys()) + + +class TrainableTokensWrapper(AuxiliaryTrainingWrapper): + """Wraps a module (typically an embedding layer) that is supposed to be re-trained selectively (i.e. + solely updating a few columns) using the `TrainableTokensLayer` PEFT method. + + Supports weight-tying to another adapter when passed a `tied_adapter` which is expected to be a + `TrainableTokensLayer`. + """ + + # All names of layers that may contain adapter (trainable) weights + adapter_layer_names: tuple[str, ...] = ("token_adapter.trainable_tokens_delta",) + other_param_names: tuple[str, ...] = ("token_adapter.token_indices", "token_adapter.trainable_tokens_original") + + def __init__( + self, + module_to_save: torch.nn.Module, + adapter_name: str, + token_indices: list[int], + tied_adapter=None, + ) -> None: + super().__init__(module_to_save, adapter_name, token_indices=token_indices, tied_adapter=tied_adapter) + + # unset the original_module attribute since we're using a property to remove this from the state dict. + self.original_module = None + + @property + def original_module(self): + # use a property instead of an attribute to exclude this pointer from the state dict + # to make sure that it will not be saved. 
+ return self.token_adapter.base_layer + + def init_modules(self, adapter_name, token_indices, tied_adapter): + # use a local import to avoid potential circular imports + from peft.tuners.trainable_tokens import TrainableTokensLayer + + # since super().__init__() calls update before we have a chance to initialise the adapter we would + # need here, we do the initialization here. + self.token_adapter = TrainableTokensLayer(self.original_module, adapter_name, token_indices, tied_adapter) + + def _error_message_name(self): + return "trainable_token_indices" + + def _hasattr_wrapped(self, name, modules): + return name == "weight" + + def _getattr_wrapped(self, name, modules): + # some models query self.wte.weight.dtype, some may query the weights directly. for the first case it is not + # necessary to do anything special but we don't know if is going to be `.dtype`. so we need to get the merged + # weights from the adapter. + if name == "weight": + return modules["token_adapter"].get_merged_weights(self.token_adapter.active_adapters) + + raise RuntimeError( + f"This code should've never been reached, probably a bad check in `_hasattr_wrapped` for {name}. " + "Please file an issue under https://github.com/huggingface/peft/issues." 
+ ) + + def _forward_wrapped(self, x, *args, **kwargs): + if not self.active_adapters: + return self._forward_wrapped_passthrough(x, *args, **kwargs) + return self.token_adapter(x) + + def _forward_wrapped_mixed_batch(self, x, active_adapter, *args, **kwargs): + return self.token_adapter.forward_adapters(x, [active_adapter]) + + def _forward_wrapped_passthrough(self, x, *args, **kwargs): + # the token adapter knows how to deal with disabled adapter / no active adapter, don't call original_module + # directly + return self.token_adapter(x, *args, **kwargs) + + def update(self, active_adapter, **kwargs): + # TODO this does not support deepspeed/fsdp since it is missing a context manager + # see ModulesToSaveWrapper implementation + if active_adapter not in self._adapters: + self.token_adapter.update_layer(active_adapter, **kwargs) + + super().update(active_adapter) + + def adapter_state_dict_load_map(self, adapter_name): + if self.token_adapter.tied_adapter: + return {} + return {"token_adapter.trainable_tokens_delta": f"token_adapter.trainable_tokens_delta.{adapter_name}"} + + def adapter_state_dict(self, adapter_name, state_dict): + if self.token_adapter.tied_adapter: + # storing of weight-tied layers is not up to us and will be handled by + # transformers. we're just here to keep those layers in sync during training. + # therefore we return an empty state dict. + return {} + + return { + f"token_adapter.{k}": state_dict[f"token_adapter.{k}.{adapter_name}"] for k in ["trainable_tokens_delta"] + } + + def enable_adapters(self, enabled: bool): + """Enables/disables the underlying `TrainableTokens` adapter. + Also handles the internal adapter disable flag. + """ + super().enable_adapters(enabled) + + self.token_adapter.enable_adapters(enabled) + + def check_set_adapter(self, adapter_name: str | list[str]) -> str | None: + """Helper function to check if the given adapter(s) can be set. + + Return the name of the adapter to be set or None if no adapter should be set. 
+ """ + if isinstance(adapter_name, str): + return adapter_name + + # adapter_name is a list of str + if len(adapter_name) == 0: + raise ValueError("Please specify at least one adapter to set") + + # TODO In theory, multiple active trainable tokens is fine when the indices don't overlap + adapter_names_in_module = [n for n in adapter_name if n in self.token_adapter.trainable_tokens_delta] + + if len(adapter_names_in_module) > 1: + raise ValueError(f"Only one adapter can be set at a time for {self}, got {len(adapter_names_in_module)}") + + adapter_name_to_set: str | None + if not adapter_names_in_module: + adapter_name_to_set = None + else: + adapter_name_to_set = adapter_names_in_module[0] + + return adapter_name_to_set + + def set_adapter(self, adapter_names: Union[str, list[str]], inference_mode: bool = False) -> None: + super().set_adapter(adapter_names, inference_mode=inference_mode) + self.token_adapter.set_adapter(adapter_names, inference_mode=inference_mode) + + def delete_adapter(self, adapter_name: str, new_active_adapters: Optional[list[str]]) -> None: + """ + Delete the adapter if present. + + This method will also set a new active adapter if the deleted adapter was the active adapter. It is important + that the new adapter is chosen by the caller in a deterministic way, so that the same adapter is chosen on all + layers. + """ + self.token_adapter.delete_adapter(adapter_name) + + # set new active adapter, if necessary + # note: there can only ever be one active adapter, unlike for LoRA etc. + if isinstance(new_active_adapters, (list, tuple)) and len(new_active_adapters) > 1: + name = self.__class__.__name__ + raise ValueError( + f"Attempted to set multiple ({new_active_adapters}) adapters at once for {name}, which is not allowed." 
+ ) + + if adapter_name in self._adapters: + self._adapters.remove(adapter_name) + + if not new_active_adapters: + self._active_adapter = [] + return + + if new_active_adapters[0] not in self.token_adapter.trainable_tokens_delta: + # a new active adapter was chosen but it seems like it has no trainable_tokens + self._active_adapter = [] + return + + new_active_adapter = new_active_adapters[0] + self.set_adapter(new_active_adapter) + + def unload_and_optionally_merge_module( + self, merge: bool, safe_merge: bool, adapter_names: Optional[list[str]] + ) -> torch.nn.Module: + """Unloading for `TrainableTokensWrapper` means to return the wrapped module, e.g. the embedding layer and, + if requested, merging the `TrainableTokens` adapter onto the wrapped module. + """ + if merge: + self.token_adapter.merge(safe_merge=safe_merge, adapter_names=adapter_names) + return self.token_adapter.get_base_layer() + + def _get_available_adapters(self) -> set[str]: + """Return all adapter names that can be found on this module.""" + return set(self.token_adapter.trainable_tokens_delta.keys()) + + +def _get_input_embeddings_name(model, default=None): + if not hasattr(model, "get_input_embeddings"): + return default + + input_embeddings = model.get_input_embeddings() + for name, module in model.named_modules(): + if module is input_embeddings: + return name + + return default + + +def _get_submodules(model, key): + parent = model.get_submodule(".".join(key.split(".")[:-1])) + target_name = key.split(".")[-1] + target = model.get_submodule(key) + return parent, target, target_name + + +def _get_submodules_with_grandparent(model, key): + parent = model.get_submodule(".".join(key.split(".")[:-1])) + try: + grandparent = model.get_submodule(".".join(key.split(".")[:-2])) + except AttributeError: + # no grand parent + grandparent = None + target_name = key.split(".")[-1] + target = model.get_submodule(key) + return parent, grandparent, target, target_name + + +def _freeze_adapter(model, 
adapter_name): + for n, p in model.named_parameters(): + if adapter_name in n: + p.requires_grad = False + + +def _set_trainable( + model, + adapter_name, + module_names, + inference_mode: bool, + strict_module_check: bool = False, + wrapper_cls: Optional[AuxiliaryTrainingWrapper] = None, + activate_adapter: bool = True, + **wrapper_kwargs, +): + """Wraps modules that are supposed to be re-trained either normally, i.e. marking them to require gradients and + saving them alongside other modules, or with certain methods that go alongside PEFT methods, such as retraining + specific token indices using selective read/write. + + Note that you need to validate beforehand if there are layers targeted by multiple wrappers, e.g. if the + 'embedding' layer is configured for both `ModulesToSaveWrapper` and `TrainableTokensWrapper` there would be + conflicts down the line. + + The default is to wrap the module in a `ModulesToSaveWrapper` wrapper. + + If `strict_module_check` is set, this method raises a ValueError, similar to BaseTuner.inject_adapter, when none of + the requested modules in `module_names` is found in the model. + + The `activate_adapter` flag indicates if this new adapter should be activated. + """ + from peft.tuners.tuners_utils import BaseTunerLayer + + if wrapper_cls is None: + wrapper_cls = ModulesToSaveWrapper + + if not module_names: + # This is useful for the case that the PEFT config does not have `modules_to_save`, e.g. + # in the case of prompt tuning and friends. 
+ return + + trainable_modules = [] + found_modules = set() + # disable removal of duplicates to support targeting tied weights + key_list = [key for key, _ in model.named_modules(remove_duplicate=False)] + + for key in key_list: + target_module_found = any(key.endswith(target_key) for target_key in module_names) + if target_module_found: + parent, grandparent, target, target_name = _get_submodules_with_grandparent(model, key) + if isinstance(grandparent, BaseTunerLayer): + # This is an extreme edge case: Let's assume that there is a PEFT config with + # modules_to_save=["default"], which is the same name as the adapter name. The PEFT method's adapter + # (e.g. LoRA) is applied first. Then, when the modules_to_save matching is performed, the LoRA layer + # would be considered a valid target. Assuming that the name is "foo.bar.lora_A.default", it would + # match, with "default" being an nn.Linear and the parent, "lora_A", being an nn.ModuleDict. This by + # itself is not enough to prove that this is an unintended match. Therefore, we also need to check the + # grandparent, "bar", that would be a lora.LoraLayer. When we see this, we should raise an error. + raise ValueError( + f"You are trying to target a module with {wrapper_cls} that is a child of {type(grandparent)}. " + "This is almost certainly not the intended behavior. Please ensure that the adapter name, " + f"'{adapter_name}', does not conflict with any of the targeted modules." 
+ ) + + if isinstance(target, wrapper_cls): + target.update(adapter_name, **wrapper_kwargs) + target.set_adapter(target.active_adapter, inference_mode=inference_mode) + else: + new_module = wrapper_cls(target, adapter_name, **wrapper_kwargs) + if activate_adapter: + new_module.set_adapter(adapter_name, inference_mode=inference_mode) + else: + new_module.set_adapter([], inference_mode=inference_mode) + setattr(parent, target_name, new_module) + trainable_modules.append(new_module) + found_modules.add(target_name) + + not_found = set(module_names).difference(found_modules) + if strict_module_check and not found_modules: + raise ValueError( + f"Target modules {not_found} not found in the base model. Please check the target modules and try again." + ) + + return trainable_modules + + +def _set_adapter(model, adapter_name: str | list[str], inference_mode: bool = False): + for module in model.modules(): + if isinstance(module, AuxiliaryTrainingWrapper): + # only check the adapter_name if we actually encounter a AuxiliaryTrainingWrapper, otherwise we don't care + adapter_name_to_set = module.check_set_adapter(adapter_name) + + # if the adapter is found in this module, set it as the active adapter, else disable the adapters of this + # module + if adapter_name_to_set in module._adapters: + module.enable_adapters(True) + module.set_adapter(adapter_name_to_set, inference_mode=inference_mode) + else: + module.enable_adapters(False) + module.set_adapter([], inference_mode=inference_mode) + + +def _prepare_prompt_learning_config(peft_config, model_config): + # In case of VLM we focus on the language model portion of the model. 
+ if "text_config" in model_config: + model_config = model_config["text_config"] + + if peft_config.num_layers is None: + if "num_hidden_layers" in model_config: + num_layers = model_config["num_hidden_layers"] + elif "num_layers" in model_config: + num_layers = model_config["num_layers"] + elif "n_layer" in model_config: + num_layers = model_config["n_layer"] + else: + raise ValueError("Please specify `num_layers` in `peft_config`") + peft_config.num_layers = num_layers + + if peft_config.token_dim is None: + if "hidden_size" in model_config: + token_dim = model_config["hidden_size"] + elif "n_embd" in model_config: + token_dim = model_config["n_embd"] + elif "d_model" in model_config: + token_dim = model_config["d_model"] + else: + raise ValueError("Please specify `token_dim` in `peft_config`") + peft_config.token_dim = token_dim + + if peft_config.num_attention_heads is None: + if "num_attention_heads" in model_config: + num_attention_heads = model_config["num_attention_heads"] + elif "n_head" in model_config: + num_attention_heads = model_config["n_head"] + elif "num_heads" in model_config: + num_attention_heads = model_config["num_heads"] + elif "encoder_attention_heads" in model_config: + num_attention_heads = model_config["encoder_attention_heads"] + else: + raise ValueError("Please specify `num_attention_heads` in `peft_config`") + peft_config.num_attention_heads = num_attention_heads + + # For grouped-query attention, see #1901. 
+ if peft_config.peft_type == "PREFIX_TUNING" and "num_key_value_heads" in model_config: + num_key_value_heads = model_config["num_key_value_heads"] + peft_config.token_dim = peft_config.token_dim // peft_config.num_attention_heads * num_key_value_heads + peft_config.num_attention_heads = num_key_value_heads + + if getattr(peft_config, "encoder_hidden_size", None) is None: + setattr(peft_config, "encoder_hidden_size", peft_config.token_dim) + + return peft_config + + +def _get_no_split_modules(model) -> set[str]: + """ + Get the modules of the model that should not be split when using device_map. We iterate through the modules to get + the underlying `_no_split_modules`. + + Returns: + `List[str]`: List of modules that should not be split + """ + # After discussion in https://github.com/huggingface/transformers/pull/38141, based on: + # https://github.com/huggingface/transformers/blob/1e921a3a9cea92b383ca4b0484ee45596bbdadc3/src/transformers/modeling_utils.py#L2677-L2704 + _no_split_modules: set[str] = set() + if not hasattr(model, "_no_split_modules"): + return _no_split_modules + + modules_to_check = [model] + while len(modules_to_check) > 0: + module = modules_to_check.pop(-1) + # if the module does not appear in _no_split_modules, we also check the children + if module.__class__.__name__ not in _no_split_modules: + if isinstance(module, PreTrainedModel): + if module._no_split_modules is not None: + _no_split_modules = _no_split_modules | set(module._no_split_modules) + modules_to_check += list(module.children()) + return _no_split_modules + + +def fsdp_auto_wrap_policy(model): + if hasattr(FullyShardedDataParallelPlugin, "get_module_class_from_name"): + get_module_class_from_name = FullyShardedDataParallelPlugin.get_module_class_from_name + else: + from accelerate.utils.dataclasses import get_module_class_from_name + from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy + + from ..tuners import PrefixEncoder, 
PromptEmbedding, PromptEncoder + + default_transformer_cls_names_to_wrap = ",".join(_get_no_split_modules(model)) + transformer_cls_names_to_wrap = os.environ.get( + "FSDP_TRANSFORMER_CLS_TO_WRAP", default_transformer_cls_names_to_wrap + ).split(",") + transformer_cls_to_wrap = {PrefixEncoder, PromptEncoder, PromptEmbedding} + for layer_class in transformer_cls_names_to_wrap: + if len(layer_class) == 0: + continue + transformer_cls = get_module_class_from_name(model, layer_class) + if transformer_cls is None: + raise Exception("Could not find the transformer layer class to wrap in the model.") + else: + transformer_cls_to_wrap.add(transformer_cls) + + def lambda_policy_fn(module): + if ( + len(list(module.named_children())) == 0 + and getattr(module, "weight", None) is not None + and module.weight.requires_grad + ): + return True + return False + + lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn) + transformer_wrap_policy = functools.partial( + transformer_auto_wrap_policy, + transformer_layer_cls=transformer_cls_to_wrap, + ) + + auto_wrap_policy = functools.partial(_or_policy, policies=[lambda_policy, transformer_wrap_policy]) + return auto_wrap_policy + + +def transpose(weight, fan_in_fan_out): + if not fan_in_fan_out: + return weight + + if isinstance(weight, torch.nn.Parameter): + return torch.nn.Parameter(weight.T) + return weight.T + + +def _is_valid_match(key: str, target_key: str): + """ + Helper function to match module names target_key and key. Makes sure that either the key is exactly the target_key + or the target_key is a submodule of key + """ + if key.endswith(target_key): + if len(key) > len(target_key): + return key.endswith("." + target_key) # must be a sub module + return True + return False + + +def _get_batch_size(input_ids: Optional[torch.Tensor], inputs_embeds: Optional[torch.Tensor]) -> int: + """Get the batch size based on either input_ids or input_embeds + + Raises an ValueError if both are None. 
+ + """ + if (input_ids is None) and (inputs_embeds is None): + raise ValueError("You have to provide either input_ids or inputs_embeds") + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + return batch_size + + +def get_quantization_config(model: torch.nn.Module, method: str): + """ + Get the quantization config of the related quantization method + """ + if ( + hasattr(model, "config") + and hasattr(model.config, "quantization_config") + and (getattr(model, "quantization_method", None) == method) + ): + return model.config.quantization_config + return None + + +def get_auto_gptq_quant_linear(gptq_quantization_config): + """ + Get the right AutoGPTQQuantLinear class based on the quantization config file + """ + if gptq_quantization_config is None: + return None + + if is_auto_gptq_available(): + from auto_gptq.utils.import_utils import dynamically_import_QuantLinear + else: + return None + + desc_act = gptq_quantization_config.desc_act + group_size = gptq_quantization_config.group_size + bits = gptq_quantization_config.bits + if hasattr(gptq_quantization_config, "use_exllama"): + use_exllama = gptq_quantization_config.use_exllama + else: + use_exllama = not gptq_quantization_config.disable_exllama + if hasattr(gptq_quantization_config, "exllama_config"): + exllama_version = gptq_quantization_config.exllama_config["version"] + else: + exllama_version = 1 + + QuantLinear = dynamically_import_QuantLinear( + use_triton=False, + desc_act=desc_act, + group_size=group_size, + bits=bits, + disable_exllama=not (use_exllama and exllama_version == 1), + disable_exllamav2=not (use_exllama and exllama_version == 2), + ) + + return QuantLinear + + +def get_gptqmodel_quant_linear(gptq_quantization_config, device_map=None): + """ + Get the right GPTQQuantLinear class based on the quantization config file + """ + if gptq_quantization_config is None: + return None + + if not is_gptqmodel_available(): + return None + + from 
gptqmodel.utils.importer import hf_select_quant_linear + + desc_act = gptq_quantization_config.desc_act + group_size = gptq_quantization_config.group_size + bits = gptq_quantization_config.bits + checkpoint_format = ( + gptq_quantization_config.checkpoint_format + if hasattr(gptq_quantization_config, "checkpoint_format") + else "gptq" + ) + sym = gptq_quantization_config.sym + meta = gptq_quantization_config.meta if hasattr(gptq_quantization_config, "meta") else None + + QuantLinear = hf_select_quant_linear( + bits=bits, + group_size=group_size, + desc_act=desc_act, + sym=sym, + device_map=device_map, + checkpoint_format=checkpoint_format, + meta=meta, + backend="auto_trainable", + ) + + return QuantLinear + + +def id_tensor_storage(tensor: torch.Tensor) -> tuple[torch.device, int, int]: + """ + Unique identifier to a tensor storage. Multiple different tensors can share the same underlying storage. For + example, "meta" tensors all share the same storage, and thus their identifier will all be equal. This identifier is + guaranteed to be unique and constant for this tensor's storage during its lifetime. Two tensor storages with + non-overlapping lifetimes may have the same id. + + This method is the exact same copy of + https://github.com/huggingface/transformers/blob/main/src/transformers/pytorch_utils.py#L282C1-L300C58 but we added + it here manually to avoid import issue with old versions of transformers. + """ + if tensor.device.type == "xla" and is_torch_tpu_available(): + # NOTE: xla tensors dont have storage + # use some other unique id to distinguish. + # this is a XLA tensor, it must be created using torch_xla's + # device. So the following import is safe: + import torch_xla + + unique_id = torch_xla._XLAC._xla_get_tensor_id(tensor) + else: + unique_id = storage_ptr(tensor) + + return tensor.device, unique_id, storage_size(tensor) + + +def cast_mixed_precision_params(model, dtype): + """ + Cast all non-trainable parameters of the model to the given `dtype`. 
The `dtype` can be `torch.float16` or + `torch.bfloat16` as per the mixed-precision training you are performing. The trainable parameters are cast to full + precision. This is meant to reduce the GPU memory usage when using PEFT methods by using half-precision dtype for + non-trainable parameters. Having the trainable parameters in full-precision preserves training stability when using + automatic mixed-precision training. + + Args: + model (`torch.nn.Module`): + The model to cast the non-trainable parameters of. + dtype (`torch.dtype`): + The dtype to cast the non-trainable parameters to. The `dtype` can be `torch.float16` or + `torch.bfloat16` as per the mixed-precision training you are performing. + """ + for p in model.parameters(): + if not p.requires_grad: + p.data = p.to(dtype) + else: + p.data = p.to(torch.float32) + + +def str_to_bool(value: str) -> int: + """ + Converts a string representation of truth to `True` (1) or `False` (0). + + True values are `y`, `yes`, `t`, `true`, `on`, and `1`; False values are `n`, `no`, `f`, `false`, `off`, and `0`. + """ + # same function as in accelerate.utils, which replaces the deprecated distutils.util.strtobool + value = value.lower() + if value in ("y", "yes", "t", "true", "on", "1"): + return 1 + elif value in ("n", "no", "f", "false", "off", "0"): + return 0 + else: + raise ValueError(f"invalid truth value {value}") + + +def check_file_exists_on_hf_hub(repo_id: str, filename: str, **kwargs) -> Optional[bool]: + """Check if a file exists on HF Hub, if check was not successful returns None instead of erroring. + + Respect offline mode if set. 
+ + """ + exists: Optional[bool] = None + if str_to_bool(os.environ.get("HF_HUB_OFFLINE", "0")): + # user set offline mode, cannot check + return exists + + try: + exists = file_exists(repo_id, filename, **kwargs) + except (HFValidationError, EntryNotFoundError): + # error, exists stays None + pass + except Exception as e: + warnings.warn( + f"Unable to fetch remote file due to the following error {e} - silently ignoring the lookup" + f" for the file {filename} in {repo_id}." + ) + + return exists + + +def match_target_against_key(target_pattern: str, key: str): + """Backing function for `target_modules` config parameter. + + Having this as its own function ensures that target key matching can be implemented in the same way everywhere. + """ + return re.fullmatch(target_pattern, key) + + +def get_pattern_key(pattern_keys: Sequence[str], key_to_match: str) -> str: + """Match a substring of key_to_match in pattern keys""" + for key in pattern_keys: + match = re.match(rf"(.*\.)?({key})$", key_to_match) + if not match: + continue + return key + + return key_to_match + + +def set_additional_trainable_modules(model, peft_config, model_config, adapter_name, activate_adapter: bool = True): + """Handle the resolution of additional trainable modules (also called AuxiliaryTrainingWrapper) + by checking the config if such modules are requested and adding them to the model. + + Currently trainable tokens and modules to save are considered additional trainable modules. + + If `activate_adapter` is set to `False`, the adapter won't be activated. This is typically the case when + `model.add_adapter` or `model.load_adapter` are being called. 
+ """ + if getattr(peft_config, "modules_to_save", None) is not None: + # this may add a new ModulesToSaveWrapper + _set_trainable( + model, + adapter_name, + inference_mode=peft_config.inference_mode, + module_names=getattr(peft_config, "modules_to_save", None), + activate_adapter=activate_adapter, + ) + + if getattr(peft_config, "trainable_token_indices", None) is not None: + if isinstance(peft_config.trainable_token_indices, dict): + target_layers = peft_config.trainable_token_indices + else: + layer_name = _get_input_embeddings_name(model, "embed_tokens") + target_layers = {layer_name: peft_config.trainable_token_indices} + + modules_to_save = getattr(peft_config, "modules_to_save", None) + if modules_to_save is not None: + for target_layer in target_layers: + if target_layer in modules_to_save: + raise ValueError( + "The embedding layer is already marked to be trained fully, either specify " + f'`modules_to_save=[..., "{target_layer}", ...]` or ' + f"`trainable_tokens={{'{target_layer}': x}}` but not both." + ) + + for target_layer, token_indices in target_layers.items(): + _set_trainable( + model, + adapter_name, + inference_mode=peft_config.inference_mode, + module_names=[target_layer], + strict_module_check=True, + wrapper_cls=TrainableTokensWrapper, + token_indices=token_indices, + activate_adapter=activate_adapter, + ) + + # There might be the possibility that we have output weights that are tied to the input weights. + # In that case we will tie any module that wants tied weights to the token adapter to make sure that + # any modification is reflected in the tied layers as well. + if ( + model_config.get("tie_word_embeddings", False) + # some models may be misconfigured to have weight tying enabled but don't define tied weights keys + and model._tied_weights_keys is not None + and isinstance(model.get_input_embeddings(), TrainableTokensWrapper) + ): + # the embedding layer is modified and we want weight tying. 
+ module_keys = [".".join(n.split(".")[:-1]) for n in model._tied_weights_keys] + + token_adapter = model.get_input_embeddings().token_adapter + _set_trainable( + model, + adapter_name, + inference_mode=peft_config.inference_mode, + module_names=module_keys, + strict_module_check=True, + wrapper_cls=TrainableTokensWrapper, + token_indices=token_adapter.token_indices[adapter_name], + tied_adapter=model.get_input_embeddings().token_adapter, + ) + + +def create_attention_mask( + model, *, model_input, attention_mask, past_key_values, cache_position, batch_size, sequence_length, position_ids +): + # adapted from: + # https://github.com/huggingface/transformers/blob/cb4c56ce0dfa1350267ed28e57760986a58a9ba4/src/transformers/generation/utils.py#L644-L680 + # In PEFT, we sometimes need to re-create the attention mask. This is because some prompt learning methods insert + # new items into the sequence, which results in the attention mask needing an update. We re-use transformers code + # for this as much as possible. + transformers_ge_4_53_1 = version.parse(transformers.__version__) >= version.parse("4.53.1") + if transformers_ge_4_53_1: + # the function already exists in v4.53.0 but has a different signature, so we check for 4.53.1 + from transformers.masking_utils import create_masks_for_generate + else: + raise ImportError("Your transformers version is too old, please upgrade it to >= 4.53.1") + + # Create the causal mask with fixed shape in advance, to reduce recompilations. If the function to create + # the 4D causal mask exists, it should be present in the base model (XXXModel class) or in its decoder. 
+ base_model = getattr(model, model.base_model_prefix, model) + decoder = base_model.get_decoder() if hasattr(base_model, "get_decoder") else None + causal_mask_creation_function = getattr(base_model, "_prepare_4d_causal_attention_mask_with_cache_position", None) + if causal_mask_creation_function is None and decoder is not None: # it may be in the decoder + causal_mask_creation_function = getattr(decoder, "_prepare_4d_causal_attention_mask_with_cache_position", None) + + # If it's not defined, it means the model uses the new general mask API + if causal_mask_creation_function is None: # can't be found + token_type_ids = getattr(model_input, "token_type_ids", None) + # Some models may overwrite the general one + causal_mask_creation_function = getattr(model, "create_masks_for_generate", create_masks_for_generate) + attention_mask = causal_mask_creation_function( + config=model.config, + # we only need batch size, seq_length and dtype here - we don't care about the values of the embeddings + input_embeds=torch.empty((batch_size, sequence_length), dtype=model.dtype), + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + token_type_ids=token_type_ids, + position_ids=position_ids, + ) + else: + attention_mask = causal_mask_creation_function( + attention_mask, + sequence_length=sequence_length, + target_length=past_key_values.get_max_cache_shape(), + dtype=model.dtype, + cache_position=cache_position, + batch_size=batch_size, + config=model.config, + past_key_values=past_key_values, + position_ids=position_ids, + ) + return attention_mask diff --git a/peft/src/peft/utils/peft_types.py b/peft/src/peft/utils/peft_types.py new file mode 100644 index 0000000000000000000000000000000000000000..8815aa4684a331fcb53f199beffa9fa96e153e60 --- /dev/null +++ b/peft/src/peft/utils/peft_types.py @@ -0,0 +1,177 @@ +# Copyright 2023-present the HuggingFace Inc. team. 
class PeftType(str, enum.Enum):
    """
    Enumeration of the adapter types known to PEFT.

    Every member is a string whose value equals its name.

    Supported PEFT types:
        - PROMPT_TUNING
        - MULTITASK_PROMPT_TUNING
        - P_TUNING
        - PREFIX_TUNING
        - LORA
        - ADALORA
        - BOFT
        - ADAPTION_PROMPT
        - IA3
        - LOHA
        - LOKR
        - OFT
        - POLY
        - LN_TUNING
        - VERA
        - FOURIERFT
        - XLORA
        - HRA
        - VBLORA
        - CPT
        - BONE
        - MISS
        - RANDLORA
        - ROAD
        - TRAINABLE_TOKENS
        - SHIRA
        - C3A
        - WAVEFT
    """

    PROMPT_TUNING = "PROMPT_TUNING"
    MULTITASK_PROMPT_TUNING = "MULTITASK_PROMPT_TUNING"
    P_TUNING = "P_TUNING"
    PREFIX_TUNING = "PREFIX_TUNING"
    LORA = "LORA"
    ADALORA = "ADALORA"
    BOFT = "BOFT"
    ADAPTION_PROMPT = "ADAPTION_PROMPT"
    IA3 = "IA3"
    LOHA = "LOHA"
    LOKR = "LOKR"
    OFT = "OFT"
    POLY = "POLY"
    LN_TUNING = "LN_TUNING"
    VERA = "VERA"
    FOURIERFT = "FOURIERFT"
    XLORA = "XLORA"
    HRA = "HRA"
    VBLORA = "VBLORA"
    CPT = "CPT"
    BONE = "BONE"
    MISS = "MISS"
    RANDLORA = "RANDLORA"
    ROAD = "ROAD"
    TRAINABLE_TOKENS = "TRAINABLE_TOKENS"
    SHIRA = "SHIRA"
    C3A = "C3A"
    WAVEFT = "WAVEFT"


class TaskType(str, enum.Enum):
    """
    Enumeration of the task types supported by PEFT.

    Overview of the supported task types:
        - SEQ_CLS: Text classification.
        - SEQ_2_SEQ_LM: Sequence-to-sequence language modeling.
        - CAUSAL_LM: Causal language modeling.
        - TOKEN_CLS: Token classification.
        - QUESTION_ANS: Question answering.
        - FEATURE_EXTRACTION: Feature extraction. Provides the hidden states which can be used as embeddings or
          features for downstream tasks.
    """

    SEQ_CLS = "SEQ_CLS"
    SEQ_2_SEQ_LM = "SEQ_2_SEQ_LM"
    CAUSAL_LM = "CAUSAL_LM"
    TOKEN_CLS = "TOKEN_CLS"
    QUESTION_ANS = "QUESTION_ANS"
    FEATURE_EXTRACTION = "FEATURE_EXTRACTION"


def register_peft_method(
    *, name: str, config_cls, model_cls, prefix: Optional[str] = None, is_mixed_compatible=False
) -> None:
    """
    Make a finetuning method (such as LoRA) available in PEFT.

    The configuration class, the model class and the parameter prefix are stored in the PEFT mapping dicts under the
    `PeftType` member that corresponds to `name`.

    Args:
        name (str):
            The name of the PEFT method. It must be unique, lower case, must not end with `"_"`, and its upper-cased
            form must be a member of `PeftType`.
        config_cls:
            The configuration class of the PEFT method.
        model_cls:
            The model class of the PEFT method. If it has a `prefix` attribute that is not `None`, it must be equal
            to the prefix being registered.
        prefix (Optional[str], optional):
            The prefix of the PEFT method. It should be unique. If not provided, `name + "_"` is used.
        is_mixed_compatible (bool, optional):
            Whether the PEFT method is compatible with `PeftMixedModel`. If you're not sure, leave it as False
            (default).

    Raises:
        ValueError:
            If `name` ends with `"_"`, is not lower case, does not correspond to a `PeftType` member, or if the
            prefix is inconsistent with `model_cls.prefix`.
        KeyError:
            If the method or the prefix is found to be registered already.

    Example:

        ```py
        # inside of peft/tuners/my_peft_method/__init__.py
        from peft.utils import register_peft_method

        register_peft_method(name="my_peft_method", config_cls=MyConfig, model_cls=MyModel)
        ```
    """
    from peft.mapping import (
        PEFT_TYPE_TO_CONFIG_MAPPING,
        PEFT_TYPE_TO_MIXED_MODEL_MAPPING,
        PEFT_TYPE_TO_PREFIX_MAPPING,
        PEFT_TYPE_TO_TUNER_MAPPING,
    )

    # --- validate the name ---
    if name.endswith("_"):
        raise ValueError(f"Please pass the name of the PEFT method without '_' suffix, got {name}.")

    if not name.islower():
        raise ValueError(f"The name of the PEFT method should be in lower case letters, got {name}.")

    type_name = name.upper()
    if type_name not in list(PeftType):
        raise ValueError(f"Unknown PEFT type {type_name}, please add an entry to peft.utils.peft_types.PeftType.")

    peft_type = getattr(PeftType, type_name)
    prefix = name + "_" if prefix is None else prefix

    # --- refuse double registration ---
    registries = (PEFT_TYPE_TO_CONFIG_MAPPING, PEFT_TYPE_TO_TUNER_MAPPING, PEFT_TYPE_TO_MIXED_MODEL_MAPPING)
    if any(peft_type in registry for registry in registries):
        raise KeyError(f"There is already PEFT method called '{name}', please choose a unique name.")

    # NOTE(review): this membership test looks at the keys of the mapping, which (per the assignment below) are
    # PeftType members, not prefixes - confirm whether comparing against the stored prefixes was intended.
    if prefix in PEFT_TYPE_TO_PREFIX_MAPPING:
        raise KeyError(f"There is already a prefix called '{prefix}', please choose a unique prefix.")

    # --- the model class, if it declares a prefix, has to agree with the registered one ---
    model_cls_prefix = getattr(model_cls, "prefix", None)
    if not ((model_cls_prefix is None) or (model_cls_prefix == prefix)):
        raise ValueError(
            f"Inconsistent prefixes found: '{prefix}' and '{model_cls_prefix}' (they should be the same)."
        )

    # --- store everything ---
    PEFT_TYPE_TO_PREFIX_MAPPING[peft_type] = prefix
    PEFT_TYPE_TO_CONFIG_MAPPING[peft_type] = config_cls
    PEFT_TYPE_TO_TUNER_MAPPING[peft_type] = model_cls
    if is_mixed_compatible:
        PEFT_TYPE_TO_MIXED_MODEL_MAPPING[peft_type] = model_cls
def has_valid_embedding_base_layer(layer):
    """Return whether `layer` has a `base_layer` attribute that is a `torch.nn.Linear` or `torch.nn.Embedding`."""
    if not hasattr(layer, "base_layer"):
        return False
    return isinstance(layer.base_layer, (torch.nn.Linear, torch.nn.Embedding))


def get_embedding_layer_name(model, layer, is_embedding_in_target_modules):
    """
    Find the name under which the embedding module for `layer` is registered in `model`.

    The modules of `model` are visited in `named_modules()` order and the name of the first match is returned. A
    module matches if it equals `layer.base_layer` (when `layer` has such an attribute) or, only when
    `is_embedding_in_target_modules` is falsy, if it equals `layer` itself. Returns `None` if nothing matches.
    """
    wrapped = getattr(layer, "base_layer", None)
    may_match_layer_itself = not is_embedding_in_target_modules
    for module_name, candidate in model.named_modules():
        if may_match_layer_itself and candidate == layer:
            return module_name
        if candidate == wrapped:
            return module_name
    return None
def get_peft_model_state_dict(
    model, state_dict=None, adapter_name="default", unwrap_compiled=False, save_embedding_layers="auto"
):
    """
    Get the state dict of the given adapter of the PEFT model.

    This only includes the PEFT parameters, not the parameters of the base model. Thus the returned `state_dict` is
    generally small compared to the full model size. To retrieve the full `state_dict`, just call `model.state_dict()`.

    Note that the adapter name is removed from the `state_dict`, as this is just an arbitrary name that can be changed
    when loading the adapter. So e.g. if the adapter name is `'default'` and the original key is
    `'model.q_proj.lora_A.default.weight'`, the returned key will be `'model.q_proj.lora_A.weight'`. Use this function
    in conjunction with [`set_peft_model_state_dict`] to take care of the adapter name when loading weights.

    Args:
        model ([`PeftModel`]): The Peft model. When using torch.nn.DistributedDataParallel, DeepSpeed or FSDP,
            the model should be the underlying model/unwrapped model (i.e. model.module).
        state_dict (`dict`, *optional*, defaults to `None`):
            The state dict of the model. If not provided, the state dict of the passed model will be used.
        adapter_name (`str`, *optional*, defaults to `"default"`):
            The name of the adapter whose state dict should be returned.
        unwrap_compiled (`bool`, *optional*, defaults to `False`):
            Whether to unwrap the model if torch.compile was used.
        save_embedding_layers (`Union[bool, str]`, *optional*, defaults to `auto`):
            If `True`, save the embedding layers in addition to adapter weights. If `auto`, checks the common embedding
            layers `peft.utils.other.EMBEDDING_LAYER_NAMES` in config's `target_modules` when available. Based on it
            sets the boolean flag. This only works for 🤗 transformers models.

    Returns:
        `dict`: The selected entries of `state_dict`, with the adapter name removed from the keys.

    Raises:
        NotImplementedError: If `config.bias` has an unsupported value (LoRA, AdaLoRA and BOFT branches).
        ValueError:
            If the PEFT type is unknown, or if a VeRA config asks to save the projections but they are missing from
            `state_dict`.
    """
    if unwrap_compiled:
        # falls back to `model` itself if there is no `_orig_mod` attribute
        model = getattr(model, "_orig_mod", model)

    config = model.peft_config[adapter_name]
    if state_dict is None:
        state_dict = model.state_dict()

    # TUNER SPECIFIC CODE
    if config.peft_type in (PeftType.LORA, PeftType.ADALORA):
        # to_return = lora_state_dict(model, bias=model.peft_config.bias)
        # adapted from `https://github.com/microsoft/LoRA/blob/main/loralib/utils.py`
        # to be used directly with the state dict which is necessary when using DeepSpeed or FSDP
        bias = config.bias
        if bias == "none":
            to_return = {k: state_dict[k] for k in state_dict if "lora_" in k}
        elif bias == "all":
            to_return = {k: state_dict[k] for k in state_dict if "lora_" in k or "bias" in k}
        elif bias == "lora_only":
            to_return = {}
            for k in state_dict:
                if "lora_" in k:
                    to_return[k] = state_dict[k]
                    # also pick up the bias stored next to the LoRA parameter, if the state dict has one
                    bias_name = k.split("lora_")[0] + "bias"
                    if bias_name in state_dict:
                        to_return[bias_name] = state_dict[bias_name]
        else:
            raise NotImplementedError
        # keep only the LoRA entries whose key contains this adapter's name; bias entries are always kept
        to_return = {k: v for k, v in to_return.items() if (("lora_" in k and adapter_name in k) or ("bias" in k))}
        if config.peft_type == PeftType.ADALORA:
            rank_pattern = config.rank_pattern
            if rank_pattern is not None:
                # note: the rank pattern with the adapter name stripped is written back to the config
                rank_pattern = {k.replace(f".{adapter_name}", ""): v for k, v in rank_pattern.items()}
                config.rank_pattern = rank_pattern
                to_return = model.resize_state_dict_by_rank_pattern(rank_pattern, to_return, adapter_name)

        if config.use_dora:
            # Here we take care of a refactor of DoRA which changed lora_magnitude_vector from a ParameterDict to a
            # ModuleDict with a DoraLayer instance. The old parameter is now the "weight" attribute of that layer. Since
            # we want the state_dict format not to change, we remove the "weight" part.
            new_dora_suffix = f"lora_magnitude_vector.{adapter_name}.weight"

            def renamed_dora_weights(k):
                if k.endswith(new_dora_suffix):
                    k = k[:-7]  # remove ".weight"
                return k

            to_return = {renamed_dora_weights(k): v for k, v in to_return.items()}

    elif config.peft_type == PeftType.BOFT:
        # same bias handling as above, but no filtering by adapter name is applied in this branch
        bias = config.bias
        if bias == "none":
            to_return = {k: state_dict[k] for k in state_dict if "boft_" in k}
        elif bias == "all":
            to_return = {k: state_dict[k] for k in state_dict if "boft_" in k or "bias" in k}
        elif bias == "boft_only":
            to_return = {}
            for k in state_dict:
                if "boft_" in k:
                    to_return[k] = state_dict[k]
                    bias_name = k.split("boft_")[0] + "bias"
                    if bias_name in state_dict:
                        to_return[bias_name] = state_dict[bias_name]
        else:
            raise NotImplementedError

    elif config.peft_type == PeftType.ADAPTION_PROMPT:
        # select on the last component of the key only
        to_return = {k: state_dict[k] for k in state_dict if k.split(".")[-1].startswith("adaption_")}

    elif config.is_prompt_learning:
        to_return = {}
        if config.peft_type == PeftType.MULTITASK_PROMPT_TUNING:
            to_return["prefix_task_cols"] = model.prompt_encoder[adapter_name].prefix_task_cols
            to_return["prefix_task_rows"] = model.prompt_encoder[adapter_name].prefix_task_rows
            prompt_embeddings = model.prompt_encoder[adapter_name].embedding.weight
        else:
            if config.inference_mode:
                prompt_embeddings = model.prompt_encoder[adapter_name].embedding.weight
            else:
                prompt_embeddings = model.get_prompt_embedding_to_save(adapter_name)
        to_return["prompt_embeddings"] = prompt_embeddings

    elif config.peft_type == PeftType.SHIRA:
        shira_prefix = PEFT_TYPE_TO_PREFIX_MAPPING[config.peft_type]
        to_return = {k: state_dict[k] for k in state_dict if shira_prefix in k}
        if platform.system() == "Windows":
            warnings.warn(
                "Windows has issues saving integers into safetensors. Hence, we convert shira_indices to float32 "
                "before saving on Windows OS. The shira_indices will always be converted to integers when loading."
            )
        # the indices are read from the module attributes, not from `state_dict`
        for name, module in model.named_modules():
            if hasattr(module, "shira_indices"):
                for k, v in module.shira_indices.items():
                    # Windows has some issues with saving integers into safetensors. Tests fail with some kind of
                    # PermissionError. This results in failed tests, so we are converting indices to float32 before
                    # saving and then converting them back to int when loading. This is happening only for Windows,
                    # not for Linux and Mac-OS.
                    to_return[f"{name}.shira_indices.{k}"] = (
                        v.to(torch.float32) if platform.system() == "Windows" else v
                    )

    elif config.peft_type == PeftType.VERA:
        vera_prefix = PEFT_TYPE_TO_PREFIX_MAPPING[config.peft_type]
        to_return = {k: state_dict[k] for k in state_dict if vera_prefix in k}
        if config.save_projection:
            # TODO: adding vera_A and vera_B to `self.get_base_layer` would
            # make name to match here difficult to predict.
            if f"base_model.vera_A.{adapter_name}" not in state_dict:
                raise ValueError(
                    "Model was initialised to not save vera_A and vera_B but config now specifies to save projection!"
                    " Set `config.save_projection` to `False`."
                )
            to_return["base_model.vera_A." + adapter_name] = state_dict["base_model.vera_A." + adapter_name]
            to_return["base_model.vera_B." + adapter_name] = state_dict["base_model.vera_B." + adapter_name]
    elif config.peft_type == PeftType.XLORA:
        to_return = {k: state_dict[k] for k in state_dict if "internal_xlora_classifier" in k}
    elif config.peft_type == PeftType.VBLORA:
        to_return = {}
        # choose the most efficient dtype for indices
        if config.num_vectors < 2**8:
            indices_dtype = torch.uint8
        elif config.num_vectors < 2**15:
            indices_dtype = torch.int16
        elif config.num_vectors < 2**31:
            indices_dtype = torch.int32
        else:
            indices_dtype = torch.int64
        if config.save_only_topk_weights:
            # in save_only_topk_weights mode, we save topk_indices and topk_weights for parameter efficiency
            for k in state_dict:
                if "vblora_logits" in k:
                    logits, indices = state_dict[k].topk(config.topk)
                    to_return.update({k + "_topk_indices": indices.to(dtype=indices_dtype)})
                    # the last softmax weight is dropped here; the loading code recovers it as 1 - sum of the others
                    to_return.update({k + "_topk_weights": torch.softmax(logits, dim=-1)[:, :, :-1].contiguous()})
        else:
            to_return = {k: state_dict[k] for k in state_dict if "vblora_logits" in k}
        to_return["base_model.vblora_vector_bank." + adapter_name] = state_dict[
            "base_model.vblora_vector_bank." + adapter_name
        ]
    elif config.peft_type in list(PeftType):
        # generic case: select everything whose key contains the registered prefix of the PEFT method
        prefix = PEFT_TYPE_TO_PREFIX_MAPPING[config.peft_type]
        to_return = {k: state_dict[k] for k in state_dict if prefix in k}
    else:
        raise ValueError(f"Unknown PEFT type passed: {config.peft_type}")

    # ADDITIONAL TRAINING MODULES / MODULES_TO_SAVE
    for name, module in model.named_modules():
        if isinstance(module, AuxiliaryTrainingWrapper):
            if name.startswith("_fsdp_wrapped_module."):
                # If FSDP is used, the state_dict is from the unwrapped model, which will result in a key mismatch if we
                # don't remove the FSDP-specific prefix
                name = name.removeprefix("_fsdp_wrapped_module.")
            # Compute the module-relative state dict to make it easier for the adapter to fetch the appropriate
            # keys that the module thinks need to be saved. We cannot rely on `.state_dict()` internally of the
            # module since accelerators like DeepSpeed require special handling which is done for the model
            # state dict from above but most likely not in the module itself. See #2450.
            module_state_dict = {
                k.removeprefix(f"{name}."): v for k, v in state_dict.items() if k.startswith(f"{name}.")
            }
            to_return.update(
                {f"{name}.{k}": v for k, v in module.adapter_state_dict(adapter_name, module_state_dict).items()}
            )

    # DEAL WITH EMBEDDINGS
    #
    # save_embedding_layers="auto" needs to check the following logic:
    #
    # - when vocab size was NOT changed, embeddings should be saved only when targeted
    #   but not when
    #   - using PeftType.TRAINABLE_TOKENS
    #   - LoRA using trainable_token_indices (since their goal is to be space-efficient)
    #   but
    # - when vocab size was changed, embeddings should be saved automatically regardless to cover this
    #   scenario: 1) fine-tune embedding, 2) resize embedding, 3) train with trainable tokens
    #
    embedding_is_targeted = False
    if hasattr(config, "target_modules"):
        if isinstance(config.target_modules, str) and (config.target_modules != INCLUDE_LINEAR_LAYERS_SHORTHAND):
            # `model` could be a PeftModel or something else like transformers/diffusers/..., in which case unwrapping is
            # not needed.
            _model = model.get_base_model() if hasattr(model, "get_base_model") else model
            embedding_is_targeted = any(
                match_target_against_key(config.target_modules, k)
                for k, _ in _model.named_modules()
                if any(re.match(rf"(.*\.)?{e}$", k) for e in EMBEDDING_LAYER_NAMES)
            )
        elif config.target_modules:
            embedding_is_targeted = any(k in config.target_modules for k in EMBEDDING_LAYER_NAMES)

    using_trainable_tokens = (
        config.peft_type == PeftType.TRAINABLE_TOKENS or getattr(config, "trainable_token_indices", None) is not None
    )

    if save_embedding_layers == "auto" and embedding_is_targeted and not using_trainable_tokens:
        warnings.warn("Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.")
        save_embedding_layers = True
    elif save_embedding_layers == "auto":
        vocab_size = getattr(getattr(model, "config", None), "vocab_size", None)
        model_id = getattr(config, "base_model_name_or_path", None)

        # For some models e.g. diffusers the text config file is stored in a subfolder
        # we need to make sure we can download that config.
        has_base_config = False

        # ensure that this check is not performed in HF offline mode, see #1452
        if model_id is not None:
            local_config_exists = os.path.exists(os.path.join(model_id, "config.json"))
            exists = local_config_exists or check_file_exists_on_hf_hub(model_id, "config.json")
            if exists is None:
                # check failed, could not determine if it exists or not
                warnings.warn(
                    f"Could not find a config file in {model_id} - will assume that the vocabulary was not modified."
                )
                has_base_config = False
            else:
                has_base_config = exists

        # check if the vocab size of the base model is different from the vocab size of the finetuned model
        if (
            vocab_size
            and model_id
            and has_base_config
            and (vocab_size != model.config.__class__.from_pretrained(model_id).vocab_size)
        ):
            warnings.warn(
                "Setting `save_embedding_layers` to `True` as the embedding layer has been resized during finetuning."
            )
            save_embedding_layers = True
        else:
            save_embedding_layers = False

    if save_embedding_layers and hasattr(model, "get_input_embeddings"):
        for layer in [model.get_input_embeddings(), model.get_output_embeddings()]:
            # Either the layer is not targeted, then it must have been resized and needs saving. Or it is targeted and
            # therefore has a valid base layer, then we'll save it as well.
            if not embedding_is_targeted or has_valid_embedding_base_layer(layer):
                embedding_module_name = get_embedding_layer_name(model, layer, embedding_is_targeted)
                if embedding_module_name:
                    # substring match: every key containing the module name is saved
                    to_return.update({k: v for k, v in state_dict.items() if embedding_module_name in k})
    elif save_embedding_layers:
        warnings.warn("Could not identify embedding layer(s) because the model is not a 🤗 transformers model.")

    # REMOVE ADAPTER NAME
    # Ensure not to replace in the middle of the key because a module happens to have the same name as the adapter.
    pattern = re.compile(re.escape(f".{adapter_name}") + r"$")

    def remove_adapter_name(key):
        if "." not in key:
            # nothing to do
            return key

        if key.endswith(f".{adapter_name}"):
            # comes from an nn.Parameter, so no .weight suffix, the adapter name is directly at the end
            return key.removesuffix(f".{adapter_name}")

        # comes from an nn.Module, i.e. the adapter name is the 2nd to last element, e.g. v_proj.lora_A.default.weight
        key, _, suffix = key.rpartition(".")  # split, e.g. v_proj.lora_A.default + weight

        if (config.peft_type == PeftType.VBLORA) and suffix.startswith(f"{adapter_name}_"):
            # special case: VBLoRA creates keys that require this replacement:
            # base_model.model.lin0.vblora_logits_A.default_topk_indices =>
            # base_model.model.lin0.vblora_logits_A_topk_indices
            return key + "_" + suffix.removeprefix(f"{adapter_name}_")

        key = pattern.sub("", key)  # remove adapter name, e.g. v_proj.lora_A
        return f"{key}.{suffix}"  # stitch the suffix back, e.g, v_proj.lora_A.weight

    to_return = {remove_adapter_name(k): v for k, v in to_return.items()}
    return to_return
+ continue + + if state_dict[key].shape != tensor.shape: + mismatched.append((key, tensor.shape, state_dict[key].shape)) + + for key, _, _ in mismatched: + del peft_model_state_dict[key] + + return peft_model_state_dict, mismatched + + +def _insert_adapter_name_into_state_dict( + state_dict: dict[str, torch.Tensor], adapter_name: str, parameter_prefix: str +) -> dict[str, torch.Tensor]: + """Utility function to remap the state_dict keys to fit the PEFT model by inserting the adapter name.""" + peft_model_state_dict = {} + for key, val in state_dict.items(): + if parameter_prefix in key: + _, _, suffix = key.rpartition(parameter_prefix) + if "." in suffix: + suffix_to_replace = ".".join(suffix.split(".")[1:]) + # only replace the substring if the key ends on the substring to avoid accidental replacement inside of + # the key if a module happens to have a name that contains the substring + key = re.sub(re.escape(suffix_to_replace) + r"$", f"{adapter_name}.{suffix_to_replace}", key) + else: + key = f"{key}.{adapter_name}" + peft_model_state_dict[key] = val + else: + peft_model_state_dict[key] = val + return peft_model_state_dict + + +def set_peft_model_state_dict( + model, + peft_model_state_dict, + adapter_name="default", + ignore_mismatched_sizes: bool = False, + low_cpu_mem_usage: bool = False, +) -> None: + """ + Set the state dict of the PEFT model. + + Given a PEFT `state_dict` (as returned by [`get_peft_model_state_dict`]), insert the weights into the model. The + model needs to have the PEFT adapters already in place (e.g. via [`inject_adapter_in_model`]). + + Setting the adapter weights also takes care of re-inserting the adapter name. This name may be a different name + than the one originally used to train the adapter. + + Args: + model ([`PeftModel`]): + The Peft model. + peft_model_state_dict (`dict`): + The state dict of the Peft model. + adapter_name (`str`, *optional*, defaults to `"default"`): + The name of the adapter whose state dict should be set. 
def set_peft_model_state_dict(
    model,
    peft_model_state_dict,
    adapter_name="default",
    ignore_mismatched_sizes: bool = False,
    low_cpu_mem_usage: bool = False,
) -> torch.nn.modules.module._IncompatibleKeys:
    """
    Set the state dict of the PEFT model.

    Given a PEFT `state_dict` (as returned by [`get_peft_model_state_dict`]), insert the weights into the model. The
    model needs to have the PEFT adapters already in place (e.g. via [`inject_adapter_in_model`]).

    Setting the adapter weights also takes care of re-inserting the adapter name. This name may be a different name
    than the one originally used to train the adapter.

    Note that the passed `peft_model_state_dict` is modified in place when keys need to be renamed for auxiliary
    training wrappers or when VBLoRA logits are recovered from the saved top-k representation.

    Args:
        model ([`PeftModel`]):
            The Peft model.
        peft_model_state_dict (`dict`):
            The state dict of the Peft model.
        adapter_name (`str`, *optional*, defaults to `"default"`):
            The name of the adapter whose state dict should be set.
        ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
            Whether to ignore entries of the state dict whose size does not match the size in the model. Ignored
            entries are not loaded and are reported in a warning.
        low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
            This argument must be `True` if the `model` was loaded with adapter weights on the meta device, e.g. after
            calling `inject_adapter_in_model` with `low_cpu_mem_usage=True`. Otherwise, leave it as `False`.

    Returns:
        The result of `model.load_state_dict(..., strict=False)`. Presumably this is the named tuple with
        `missing_keys` and `unexpected_keys` that `torch.nn.Module.load_state_dict` returns - confirm if `model`
        overrides `load_state_dict`.

    Raises:
        NotImplementedError: If the PEFT type is neither handled explicitly nor registered with a prefix.
        ValueError:
            If a VeRA config asks to load the projections but they are missing, or if an old OFT checkpoint is
            detected.
    """
    config = model.peft_config[adapter_name]
    # alias, not a copy: changes to `state_dict` below are visible through `peft_model_state_dict` as well
    state_dict = peft_model_state_dict

    # handle auxiliary training wrappers such as ModulesToSaveWrapper and TrainableTokensWrapper by getting each of
    # them and translating saved state dict key (which does not include the adapter name) to loaded state dict key
    # (which includes the adapter name).
    for name, module in model.named_modules():
        if isinstance(module, AuxiliaryTrainingWrapper):
            # Not every module has a 1:1 mapping. ModulesToSaveWrapper, for example, removes the
            # `modules_to_save.{adapter_name}.` prefix. This prefix must be restored when loading the model from the
            # saved state dict which is why we fetch a load key map from the wrapper.
            key_map = module.adapter_state_dict_load_map(adapter_name)
            if name.startswith("_fsdp_wrapped_module."):
                # If FSDP is used, the state_dict is from the unwrapped model, which will result in a key mismatch if we
                # don't remove the FSDP-specific prefix
                name = name.removeprefix("_fsdp_wrapped_module.")
            for k in key_map:
                lookup_key = f"{name}.{k}"
                store_key = f"{name}.{key_map[k]}"

                state_dict[store_key] = peft_model_state_dict[lookup_key]

                # delete the old key from the previous `state_dict = peft_model_state_dict` statement.
                del state_dict[lookup_key]

    if config.is_prompt_learning or config.peft_type == PeftType.ADAPTION_PROMPT:
        peft_model_state_dict = state_dict
    elif config.peft_type == PeftType.XLORA:
        peft_model_state_dict = state_dict
    elif config.peft_type in PEFT_TYPE_TO_PREFIX_MAPPING:
        peft_model_state_dict = {}
        parameter_prefix = PEFT_TYPE_TO_PREFIX_MAPPING[config.peft_type]
        if config.peft_type == PeftType.VBLORA and config.save_only_topk_weights:
            num_vectors, _ = model.vblora_vector_bank[adapter_name].shape
            # iterate over a snapshot of the keys because entries are added and deleted inside the loop
            state_dict_keys = list(state_dict.keys())
            for k in state_dict_keys:
                # in save_only_topk_weights mode, only topk_indices and topk_weights are saved
                # note that topk_indices and topk_weights serve as an efficient representation of the logits
                # so we need to recover the logits from the topk_indices and topk_weights
                if "_topk_indices" in k:
                    v = state_dict[k].to(torch.long)
                    original_key = k.replace("_topk_indices", "")
                    # find the corresponding topk_weights from the state_dict
                    topk_weights = state_dict[k.replace("_topk_indices", "_topk_weights")]
                    # as we only save the first k-1 topk_weights, here we recover the last one
                    topk_weights = torch.cat([topk_weights, 1 - topk_weights.sum(-1, keepdim=True)], dim=-1)
                    # convert the weights to logits
                    topk_logits = torch.log(topk_weights)
                    # every position that is not among the top-k indices gets a logit of -inf
                    matrix = (
                        torch.zeros([*(topk_logits.shape[:-1]), num_vectors])
                        .fill_(float("-inf"))
                        .to(topk_logits.device)
                        .scatter(-1, v, topk_logits)
                    )
                    # add logits to the state_dict
                    state_dict[original_key] = matrix
                    # delete the topk_indices and topk_weights from the state_dict
                    del state_dict[k]
                    del state_dict[k.replace("_topk_indices", "_topk_weights")]

        peft_model_state_dict = _insert_adapter_name_into_state_dict(
            state_dict, adapter_name=adapter_name, parameter_prefix=parameter_prefix
        )

        if config.peft_type == PeftType.ADALORA:
            rank_pattern = config.rank_pattern
            if rank_pattern is not None:
                model.resize_modules_by_rank_pattern(rank_pattern, adapter_name)
        elif config.peft_type == PeftType.SHIRA:
            if platform.system() == "Windows":
                warnings.warn(
                    "Windows has issues saving integers into safetensors. Hence, we had converted shira_indices "
                    "to float32 before saving on Windows OS. The shira_indices will always be converted to integers "
                    "when loading."
                )
            for name, module in model.named_modules():
                if hasattr(module, "shira_indices"):
                    # for k, v in module.shira_indices.items():
                    if f"{name}.shira_indices.{adapter_name}" in peft_model_state_dict:
                        # the indices are removed from the state dict and set on the module directly
                        shira_indices_values = peft_model_state_dict.pop(f"{name}.shira_indices.{adapter_name}")
                        # Convert shira_indices to int in case they were saved on a Windows OS and are being loaded
                        # on a Linux or a Mac-OS system. If they were saved in Linux or Mac-OS, they are already
                        # integers and the following will not affect anything.
                        module.shira_indices[adapter_name] = shira_indices_values.to(torch.int)
        elif config.peft_type == PeftType.VERA:
            if config.save_projection and "base_model.vera_A" not in peft_model_state_dict:
                raise ValueError(
                    "Specified to load vera_A and vera_B from state dictionary however they were not present!"
                )
            elif not config.save_projection and "base_model.vera_A" in peft_model_state_dict:
                warnings.warn(
                    "Specified to not load vera_A and vera_B from state dictionary however they are present in state"
                    " dictionary! Consider using them to ensure checkpoint loading is correct on all platforms using"
                    " `peft_config.save_projection = True`"
                )
            elif not config.save_projection:  # and no vera_A in state dictionary
                warnings.warn(
                    "Specified to not load vera_A and vera_B from state dictionary. This means we will be relying on"
                    " PRNG initialisation to restore these projections using `config.projection_prng_key`, which may"
                    " not be accurate on all system configurations."
                )
        elif config.peft_type == PeftType.LORA:
            # Here we take care of a refactor of DoRA which changed lora_magnitude_vector from a ParameterDict to a
            # ModuleDict with a DoraLayer instance. The old parameter is now the "weight" attribute of that layer.
            old_dora_suffix = f"lora_magnitude_vector.{adapter_name}"

            def renamed_dora_weights(k):
                if k.endswith(old_dora_suffix):
                    k = k + ".weight"
                return k

            peft_model_state_dict = {renamed_dora_weights(k): v for k, v in peft_model_state_dict.items()}
        elif config.peft_type == PeftType.OFT:
            if any(".oft_r." in key for key in peft_model_state_dict):
                raise ValueError(
                    "Trying to load old OFT checkpoint, which is no longer supported. Please install PEFT <= v0.15.2 to load it or train a new OFT adapter."
                )
    else:
        raise NotImplementedError

    # entries with mismatched shapes are removed here (only if `ignore_mismatched_sizes=True`) and reported below
    peft_model_state_dict, mismatched_keys = _find_mismatched_keys(
        model, peft_model_state_dict, ignore_mismatched_sizes=ignore_mismatched_sizes
    )
    if low_cpu_mem_usage:
        load_result = model.load_state_dict(peft_model_state_dict, strict=False, assign=True)
        # ensure that the correct device is set
        for module in model.modules():
            if hasattr(module, "_move_adapter_to_device_of_base_layer"):
                module._move_adapter_to_device_of_base_layer(adapter_name)
    else:
        load_result = model.load_state_dict(peft_model_state_dict, strict=False)

    if config.is_prompt_learning:
        # the prompt embeddings are loaded into the prompt encoder explicitly, with strict key checking
        model.prompt_encoder[adapter_name].embedding.load_state_dict(
            {"weight": peft_model_state_dict["prompt_embeddings"]}, strict=True
        )

    if config.peft_type == PeftType.MULTITASK_PROMPT_TUNING:
        model.prompt_encoder[adapter_name].load_state_dict(peft_model_state_dict, strict=False)

    if mismatched_keys:
        # see https://github.com/huggingface/transformers/blob/09f9f566de83eef1f13ee83b5a1bbeebde5c80c1/src/transformers/modeling_utils.py#L4039
        mismatched_warning = "\n".join(
            [
                f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
                for key, shape1, shape2 in mismatched_keys
            ]
        )
        msg = (
            f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint "
            f"and are being ignored because you passed `ignore_mismatched_sizes=True`: {mismatched_warning}."
        )
        warnings.warn(msg)
    return load_result
# TODO: remove this function, use vanilla torch.load as soon as torch < 2.6.0 is no longer supported
def torch_load(*args, weights_only=True, **kwargs):
    """Forward all arguments to `torch.load`, passing `weights_only` explicitly.

    `weights_only` defaults to `True` here to anticipate the upcoming switch on the PyTorch side.

    """
    load_kwargs = dict(kwargs, weights_only=weights_only)
    return torch.load(*args, **load_kwargs)
+ """ + path = ( + os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) + if hf_hub_download_kwargs.get("subfolder", None) is not None + else model_id + ) + + if device is None: + device = infer_device() + + def get_hub_filename(use_safetensors=True): + weights_name = SAFETENSORS_WEIGHTS_NAME if use_safetensors else WEIGHTS_NAME + return ( + os.path.join(hf_hub_download_kwargs["subfolder"], weights_name) + if hf_hub_download_kwargs.get("subfolder", None) is not None + else weights_name + ) + + if "user_agent" not in hf_hub_download_kwargs: + hf_hub_download_kwargs["user_agent"] = http_user_agent() + + if os.path.exists(os.path.join(path, SAFETENSORS_WEIGHTS_NAME)): + filename = os.path.join(path, SAFETENSORS_WEIGHTS_NAME) + use_safetensors = True + elif os.path.exists(os.path.join(path, WEIGHTS_NAME)): + filename = os.path.join(path, WEIGHTS_NAME) + use_safetensors = False + elif huggingface_hub.constants.HF_HUB_OFFLINE: + # if in offline mode, check if we can find the adapter file locally + hub_filename = get_hub_filename(use_safetensors=True) + hf_hub_download_kwargs.pop("local_files_only", None) + try: + filename = hf_hub_download(model_id, hub_filename, local_files_only=True, **hf_hub_download_kwargs) + use_safetensors = True + except LocalEntryNotFoundError: + # Could not find safetensors, try pickle. If this also fails, it's fine to let the error be raised here, as + # it means that the user tried to load a non-cached model in offline mode. 
+ hub_filename = get_hub_filename(use_safetensors=False) + filename = hf_hub_download(model_id, hub_filename, local_files_only=True, **hf_hub_download_kwargs) + use_safetensors = False + else: + token = hf_hub_download_kwargs.get("token", None) + if token is None: + token = hf_hub_download_kwargs.get("use_auth_token", None) + + hub_filename = get_hub_filename(use_safetensors=True) + has_remote_safetensors_file = file_exists( + repo_id=model_id, + filename=hub_filename, + revision=hf_hub_download_kwargs.get("revision", None), + repo_type=hf_hub_download_kwargs.get("repo_type", None), + token=token, + ) + use_safetensors = has_remote_safetensors_file + + if has_remote_safetensors_file: + # Priority 1: load safetensors weights + filename = hf_hub_download( + model_id, + SAFETENSORS_WEIGHTS_NAME, + **hf_hub_download_kwargs, + ) + else: + try: + filename = hf_hub_download(model_id, WEIGHTS_NAME, **hf_hub_download_kwargs) + except EntryNotFoundError: + raise ValueError( + f"Can't find weights for {model_id} in {model_id} or in the Hugging Face Hub. " + f"Please check that the file {WEIGHTS_NAME} or {SAFETENSORS_WEIGHTS_NAME} is present at {model_id}." + ) + + if use_safetensors: + if hasattr(torch.backends, "mps") and (device == torch.device("mps")): + adapters_weights = safe_load_file(filename, device="cpu") + else: + adapters_weights = safe_load_file(filename, device=device) + else: + adapters_weights = torch_load(filename, map_location=torch.device(device)) + + if not key_mapping: + remapped_adapters_weights = adapters_weights + else: + # See discussion in https://github.com/huggingface/transformers/pull/38627 + # Remap adapter weight names according to the provided key_mapping. + remapped_adapters_weights = {} + for key, val in adapters_weights.items(): + if key.startswith("base_model.model."): + prefix = "base_model.model." + elif key.startswith("base_model."): + prefix = "base_model." 
+ else: + raise ValueError( + "An error occurred while trying to load a PEFT state_dict with key_mapping. This should not " + "happen. Please open an issue on https://github.com/huggingface/peft/issues and report the error." + ) + + key = key.removeprefix(prefix) # the key map assumes that there is no prefix + for pattern, replacement in key_mapping.items(): + key_new, n_replace = re.subn(pattern, replacement, key) + # Early exit of the loop + if n_replace > 0: + key = key_new + break + key_with_prefix = f"{prefix}{key}" + remapped_adapters_weights[key_with_prefix] = val + + return remapped_adapters_weights diff --git a/peft/src/peft/utils/warning.py b/peft/src/peft/utils/warning.py new file mode 100644 index 0000000000000000000000000000000000000000..3e2afeb85abf836273c3f7534eea80aebc82d256 --- /dev/null +++ b/peft/src/peft/utils/warning.py @@ -0,0 +1,17 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +class PeftWarning(UserWarning): + """Base PEFT warning""" diff --git a/peft/tests/__init__.py b/peft/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/peft/tests/bnb/test_bnb_regression.py b/peft/tests/bnb/test_bnb_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..26570fe43004de87462b92d7b012939ba639ce02 --- /dev/null +++ b/peft/tests/bnb/test_bnb_regression.py @@ -0,0 +1,258 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file contains very basic regression tests for bitsandbytes +# It currently lives in the PEFT code base but should be moved to bnb eventually. +# These tests are very simplifistic and crude on purpose. If useful, they can be cleaned up and refactored later. + +# Note that we make no assumptions about the correctness of the output, we only check that they didn't change +# unexpectedly. 
+ +# The expected values are generated by running the test until we have the `output`, then pass it to `bytes_from_tensor` + +import io + +import pytest +import torch +from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, BitsAndBytesConfig + +from peft.import_utils import is_xpu_available + + +bnb = pytest.importorskip("bitsandbytes") + +device = torch.device("xpu") if is_xpu_available() else torch.device("cuda") + + +def bytes_from_tensor(x): + # helper function to create the expected output for regression testing + f = io.BytesIO() + torch.save(x, f) + x_bytes = f.getvalue() + f.close() + return x_bytes + + +############ +# OPT-125M # +############ + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device available.") +def test_opt_350m_4bit(): + torch.manual_seed(0) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_compute_dtype=torch.float32, + ) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-350m", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ) + + input = torch.LongTensor([[1, 0, 1, 0, 1, 2]]).to(device) + with torch.no_grad(): + output = model(input).logits[0, :3, :3].detach().cpu() + + expected_bytes = 
b"PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x12\x00archive/data.pklFB\x0e\x00ZZZZZZZZZZZZZZ\x80\x02ctorch._utils\n_rebuild_tensor_v2\nq\x00((X\x07\x00\x00\x00storageq\x01ctorch\nFloatStorage\nq\x02X\x01\x00\x00\x000q\x03X\x03\x00\x00\x00cpuq\x04K\ttq\x05QK\x00K\x03K\x03\x86q\x06K\x03K\x01\x86q\x07\x89ccollections\nOrderedDict\nq\x08)Rq\ttq\nRq\x0b.PK\x07\x08\x99G\x1f\xb7\x9a\x00\x00\x00\x9a\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x11\x00'\x00archive/byteorderFB#\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZlittlePK\x07\x08\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e\x00>\x00archive/data/0FB:\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ\xfc\xd3\xff\xc00\xfe\xfe\xc0&eR@\x19j\x8d@,O\x1e?\xe9\xfb\x0bA\xcc\xb5OA\xc6?\xd6@\xd3\xc2\xe0@PK\x07\x08\xdb\xad]I$\x00\x00\x00$\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x1f\x00archive/versionFB\x1b\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZ3\nPK\x07\x08\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1e\x002\x00archive/.data/serialization_idFB.\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ0576858857385996278200001521679285581783PK\x07\x08\x93\x10\xf6E(\x00\x00\x00(\x00\x00\x00PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x99G\x1f\xb7\x9a\x00\x00\x00\x9a\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00archive/data.pklPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xea\x00\x00\x00archive/byteorderPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xdb\xad]I$\x00\x00\x0
0$\x00\x00\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00V\x01\x00\x00archive/data/0PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf4\x01\x00\x00archive/versionPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x93\x10\xf6E(\x00\x00\x00(\x00\x00\x00\x1e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00R\x02\x00\x00archive/.data/serialization_idPK\x06\x06,\x00\x00\x00\x00\x00\x00\x00\x1e\x03-\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00B\x01\x00\x00\x00\x00\x00\x00\xf8\x02\x00\x00\x00\x00\x00\x00PK\x06\x07\x00\x00\x00\x00:\x04\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00PK\x05\x06\x00\x00\x00\x00\x05\x00\x05\x00B\x01\x00\x00\xf8\x02\x00\x00\x00\x00" + expected = torch.load(io.BytesIO(expected_bytes)) + torch.testing.assert_allclose(output, expected) + + +@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="No CUDA or XPU device available.") +def test_opt_350m_8bit(): + torch.manual_seed(0) + bnb_config = BitsAndBytesConfig(load_in_8bit=True) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-350m", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ) + + input = torch.LongTensor([[1, 0, 1, 0, 1, 2]]).to(device) + with torch.no_grad(): + output = model(input).logits[0, :3, :3].detach().cpu() + + expected_bytes = 
b"PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x12\x00archive/data.pklFB\x0e\x00ZZZZZZZZZZZZZZ\x80\x02ctorch._utils\n_rebuild_tensor_v2\nq\x00((X\x07\x00\x00\x00storageq\x01ctorch\nFloatStorage\nq\x02X\x01\x00\x00\x000q\x03X\x03\x00\x00\x00cpuq\x04K\ttq\x05QK\x00K\x03K\x03\x86q\x06K\x03K\x01\x86q\x07\x89ccollections\nOrderedDict\nq\x08)Rq\ttq\nRq\x0b.PK\x07\x08\x99G\x1f\xb7\x9a\x00\x00\x00\x9a\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x11\x00'\x00archive/byteorderFB#\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZlittlePK\x07\x08\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e\x00>\x00archive/data/0FB:\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZN\t\xae\xbfR.\x8d\xbf\x88\xae\x01A@\x11\xb1@v\xae\x00@o\xc2\x14AJpNA-\x08\x0cACI\xf6@PK\x07\x08\xfe\xdb\xb9o$\x00\x00\x00$\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x1f\x00archive/versionFB\x1b\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZ3\nPK\x07\x08\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1e\x002\x00archive/.data/serialization_idFB.\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ0576858857385996278200001521667500867612PK\x07\x08\xb0\xb5\xcf\xfe(\x00\x00\x00(\x00\x00\x00PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x99G\x1f\xb7\x9a\x00\x00\x00\x9a\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00archive/data.pklPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xea\x00\x00\x00archive/byteorderPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xfe\xdb\xb9o$\x00\x00\x00$
\x00\x00\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00V\x01\x00\x00archive/data/0PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf4\x01\x00\x00archive/versionPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xb0\xb5\xcf\xfe(\x00\x00\x00(\x00\x00\x00\x1e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00R\x02\x00\x00archive/.data/serialization_idPK\x06\x06,\x00\x00\x00\x00\x00\x00\x00\x1e\x03-\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00B\x01\x00\x00\x00\x00\x00\x00\xf8\x02\x00\x00\x00\x00\x00\x00PK\x06\x07\x00\x00\x00\x00:\x04\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00PK\x05\x06\x00\x00\x00\x00\x05\x00\x05\x00B\x01\x00\x00\xf8\x02\x00\x00\x00\x00" + expected = torch.load(io.BytesIO(expected_bytes)) + torch.testing.assert_allclose(output, expected) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device available.") +def test_opt_350m_4bit_double_quant(): + torch.manual_seed(0) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=torch.float32, + ) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-350m", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ) + + input = torch.LongTensor([[1, 0, 1, 0, 1, 2]]).to(device) + with torch.no_grad(): + output = model(input).logits[0, :3, :3].detach().cpu() + + expected_bytes = 
b"PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x12\x00archive/data.pklFB\x0e\x00ZZZZZZZZZZZZZZ\x80\x02ctorch._utils\n_rebuild_tensor_v2\nq\x00((X\x07\x00\x00\x00storageq\x01ctorch\nFloatStorage\nq\x02X\x01\x00\x00\x000q\x03X\x03\x00\x00\x00cpuq\x04K\ttq\x05QK\x00K\x03K\x03\x86q\x06K\x03K\x01\x86q\x07\x89ccollections\nOrderedDict\nq\x08)Rq\ttq\nRq\x0b.PK\x07\x08\x99G\x1f\xb7\x9a\x00\x00\x00\x9a\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x11\x00'\x00archive/byteorderFB#\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZlittlePK\x07\x08\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e\x00>\x00archive/data/0FB:\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ.\xe3\xfe\xc0H\xaa\xfe\xc0\xf6\x9aS@\xbe\x9c\x8b@\x06\x93\x1a?\xe8&\x0cA\x9f\x0cPA\xd4\xf4\xd6@V\xa3\xe1@PK\x07\x08J\x98\xbfQ$\x00\x00\x00$\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x1f\x00archive/versionFB\x1b\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZ3\nPK\x07\x08\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1e\x002\x00archive/.data/serialization_idFB.\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ0576858857385996278200001521700249059421PK\x07\x08\x9cW<\xe0(\x00\x00\x00(\x00\x00\x00PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x99G\x1f\xb7\x9a\x00\x00\x00\x9a\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00archive/data.pklPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xea\x00\x00\x00archive/byteorderPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00J\x98\xbfQ$\x00\
x00\x00$\x00\x00\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00V\x01\x00\x00archive/data/0PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf4\x01\x00\x00archive/versionPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x9cW<\xe0(\x00\x00\x00(\x00\x00\x00\x1e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00R\x02\x00\x00archive/.data/serialization_idPK\x06\x06,\x00\x00\x00\x00\x00\x00\x00\x1e\x03-\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00B\x01\x00\x00\x00\x00\x00\x00\xf8\x02\x00\x00\x00\x00\x00\x00PK\x06\x07\x00\x00\x00\x00:\x04\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00PK\x05\x06\x00\x00\x00\x00\x05\x00\x05\x00B\x01\x00\x00\xf8\x02\x00\x00\x00\x00" + expected = torch.load(io.BytesIO(expected_bytes)) + torch.testing.assert_allclose(output, expected) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA device available.") +def test_opt_350m_4bit_compute_dtype_float16(): + torch.manual_seed(0) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_compute_dtype=torch.float16, + ) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-350m", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ) + + input = torch.LongTensor([[1, 0, 1, 0, 1, 2]]).to(device) + with torch.no_grad(): + output = model(input).logits[0, :3, :3].detach().cpu() + + expected_bytes = 
b"PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x12\x00archive/data.pklFB\x0e\x00ZZZZZZZZZZZZZZ\x80\x02ctorch._utils\n_rebuild_tensor_v2\nq\x00((X\x07\x00\x00\x00storageq\x01ctorch\nFloatStorage\nq\x02X\x01\x00\x00\x000q\x03X\x03\x00\x00\x00cpuq\x04K\ttq\x05QK\x00K\x03K\x03\x86q\x06K\x03K\x01\x86q\x07\x89ccollections\nOrderedDict\nq\x08)Rq\ttq\nRq\x0b.PK\x07\x08\x99G\x1f\xb7\x9a\x00\x00\x00\x9a\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x11\x00'\x00archive/byteorderFB#\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZlittlePK\x07\x08\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e\x00>\x00archive/data/0FB:\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ\xfc\xd3\xff\xc00\xfe\xfe\xc0&eR@\x19j\x8d@,O\x1e?\xe9\xfb\x0bA\xcc\xb5OA\xc6?\xd6@\xd3\xc2\xe0@PK\x07\x08\xdb\xad]I$\x00\x00\x00$\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x1f\x00archive/versionFB\x1b\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZ3\nPK\x07\x08\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1e\x002\x00archive/.data/serialization_idFB.\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ0576858857385996278200001521679285581783PK\x07\x08\x93\x10\xf6E(\x00\x00\x00(\x00\x00\x00PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x99G\x1f\xb7\x9a\x00\x00\x00\x9a\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00archive/data.pklPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xea\x00\x00\x00archive/byteorderPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xdb\xad]I$\x00\x00\x0
0$\x00\x00\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00V\x01\x00\x00archive/data/0PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf4\x01\x00\x00archive/versionPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x93\x10\xf6E(\x00\x00\x00(\x00\x00\x00\x1e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00R\x02\x00\x00archive/.data/serialization_idPK\x06\x06,\x00\x00\x00\x00\x00\x00\x00\x1e\x03-\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00B\x01\x00\x00\x00\x00\x00\x00\xf8\x02\x00\x00\x00\x00\x00\x00PK\x06\x07\x00\x00\x00\x00:\x04\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00PK\x05\x06\x00\x00\x00\x00\x05\x00\x05\x00B\x01\x00\x00\xf8\x02\x00\x00\x00\x00" + expected = torch.load(io.BytesIO(expected_bytes)) + torch.testing.assert_allclose(output, expected) + + +@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="No CUDA or XPU device available.") +def test_opt_350m_4bit_quant_type_nf4(): + torch.manual_seed(0) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_compute_dtype=torch.float32, + bnb_4bit_quant_type="nf4", + ) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-350m", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ) + + input = torch.LongTensor([[1, 0, 1, 0, 1, 2]]).to(device) + with torch.no_grad(): + output = model(input).logits[0, :3, :3].detach().cpu() + + expected_bytes = 
b"PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x12\x00archive/data.pklFB\x0e\x00ZZZZZZZZZZZZZZ\x80\x02ctorch._utils\n_rebuild_tensor_v2\nq\x00((X\x07\x00\x00\x00storageq\x01ctorch\nFloatStorage\nq\x02X\x01\x00\x00\x000q\x03X\x03\x00\x00\x00cpuq\x04K\ttq\x05QK\x00K\x03K\x03\x86q\x06K\x03K\x01\x86q\x07\x89ccollections\nOrderedDict\nq\x08)Rq\ttq\nRq\x0b.PK\x07\x08\x99G\x1f\xb7\x9a\x00\x00\x00\x9a\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x11\x00'\x00archive/byteorderFB#\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZlittlePK\x07\x08\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e\x00>\x00archive/data/0FB:\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ8\x18\xeb>\xd4\x82\x14\xbej\xbe\xff@:\xb9|@\x19\xb8\xb4?\xac\xae\x07A\x94iXA\xc8\x12\x13AHu\xdd@PK\x07\x08\xe1\xec\x0f\xf2$\x00\x00\x00$\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x1f\x00archive/versionFB\x1b\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZ3\nPK\x07\x08\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1e\x002\x00archive/.data/serialization_idFB.\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ0576858857385996278200001521529449342366PK\x07\x08\xbf\xb8\xd6H(\x00\x00\x00(\x00\x00\x00PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x99G\x1f\xb7\x9a\x00\x00\x00\x9a\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00archive/data.pklPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xea\x00\x00\x00archive/byteorderPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xe1\xec\x0f\xf2
$\x00\x00\x00$\x00\x00\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00V\x01\x00\x00archive/data/0PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf4\x01\x00\x00archive/versionPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xbf\xb8\xd6H(\x00\x00\x00(\x00\x00\x00\x1e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00R\x02\x00\x00archive/.data/serialization_idPK\x06\x06,\x00\x00\x00\x00\x00\x00\x00\x1e\x03-\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00B\x01\x00\x00\x00\x00\x00\x00\xf8\x02\x00\x00\x00\x00\x00\x00PK\x06\x07\x00\x00\x00\x00:\x04\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00PK\x05\x06\x00\x00\x00\x00\x05\x00\x05\x00B\x01\x00\x00\xf8\x02\x00\x00\x00\x00" + expected = torch.load(io.BytesIO(expected_bytes)) + torch.testing.assert_allclose(output, expected) + + +@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="No CUDA or XPU device available.") +def test_opt_350m_4bit_quant_storage(): + # note: using torch.float32 instead of the default torch.uint8 does not seem to affect the result + torch.manual_seed(0) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_compute_dtype=torch.float32, + bnb_4bit_quant_storage=torch.float32, + ) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-350m", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ) + + input = torch.LongTensor([[1, 0, 1, 0, 1, 2]]).to(device) + with torch.no_grad(): + output = model(input).logits[0, :3, :3].detach().cpu() + + expected_bytes = 
b"PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x12\x00archive/data.pklFB\x0e\x00ZZZZZZZZZZZZZZ\x80\x02ctorch._utils\n_rebuild_tensor_v2\nq\x00((X\x07\x00\x00\x00storageq\x01ctorch\nFloatStorage\nq\x02X\x01\x00\x00\x000q\x03X\x03\x00\x00\x00cpuq\x04K\ttq\x05QK\x00K\x03K\x03\x86q\x06K\x03K\x01\x86q\x07\x89ccollections\nOrderedDict\nq\x08)Rq\ttq\nRq\x0b.PK\x07\x08\x99G\x1f\xb7\x9a\x00\x00\x00\x9a\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x11\x00'\x00archive/byteorderFB#\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZlittlePK\x07\x08\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e\x00>\x00archive/data/0FB:\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ\xfc\xd3\xff\xc00\xfe\xfe\xc0&eR@\x19j\x8d@,O\x1e?\xe9\xfb\x0bA\xcc\xb5OA\xc6?\xd6@\xd3\xc2\xe0@PK\x07\x08\xdb\xad]I$\x00\x00\x00$\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x1f\x00archive/versionFB\x1b\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZ3\nPK\x07\x08\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1e\x002\x00archive/.data/serialization_idFB.\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ0576858857385996278200001521679285581783PK\x07\x08\x93\x10\xf6E(\x00\x00\x00(\x00\x00\x00PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x99G\x1f\xb7\x9a\x00\x00\x00\x9a\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00archive/data.pklPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xea\x00\x00\x00archive/byteorderPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xdb\xad]I$\x00\x00\x0
0$\x00\x00\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00V\x01\x00\x00archive/data/0PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf4\x01\x00\x00archive/versionPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x93\x10\xf6E(\x00\x00\x00(\x00\x00\x00\x1e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00R\x02\x00\x00archive/.data/serialization_idPK\x06\x06,\x00\x00\x00\x00\x00\x00\x00\x1e\x03-\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00B\x01\x00\x00\x00\x00\x00\x00\xf8\x02\x00\x00\x00\x00\x00\x00PK\x06\x07\x00\x00\x00\x00:\x04\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00PK\x05\x06\x00\x00\x00\x00\x05\x00\x05\x00B\x01\x00\x00\xf8\x02\x00\x00\x00\x00" + expected = torch.load(io.BytesIO(expected_bytes)) + torch.testing.assert_allclose(output, expected) + + +@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="No CUDA or XPU device available.") +def test_opt_350m_8bit_threshold(): + torch.manual_seed(0) + bnb_config = BitsAndBytesConfig( + load_in_8bit=True, + llm_int8_threshold=3.0, # default is 6.0 + ) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-350m", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ) + + input = torch.LongTensor([[1, 0, 1, 0, 1, 2]]).to(device) + with torch.no_grad(): + output = model(input).logits[0, :3, :3].detach().cpu() + + expected_bytes = 
b"PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x12\x00archive/data.pklFB\x0e\x00ZZZZZZZZZZZZZZ\x80\x02ctorch._utils\n_rebuild_tensor_v2\nq\x00((X\x07\x00\x00\x00storageq\x01ctorch\nFloatStorage\nq\x02X\x01\x00\x00\x000q\x03X\x03\x00\x00\x00cpuq\x04K\ttq\x05QK\x00K\x03K\x03\x86q\x06K\x03K\x01\x86q\x07\x89ccollections\nOrderedDict\nq\x08)Rq\ttq\nRq\x0b.PK\x07\x08\x99G\x1f\xb7\x9a\x00\x00\x00\x9a\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x11\x00'\x00archive/byteorderFB#\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZlittlePK\x07\x08\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e\x00>\x00archive/data/0FB:\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZR\xd5\x14\xc0\xc3\x9b\xf1\xbf \x9d\xde@D\x17\xc4@\t\xd1\x16@(\x97\x16A#TXA>\xdd\x12A\x08\x03\xfb@PK\x07\x08F\xd1\x87\xa3$\x00\x00\x00$\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x1f\x00archive/versionFB\x1b\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZ3\nPK\x07\x08\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1e\x002\x00archive/.data/serialization_idFB.\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ0576858857385996278200001521620583262466PK\x07\x08\x87\x89*\x93(\x00\x00\x00(\x00\x00\x00PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x99G\x1f\xb7\x9a\x00\x00\x00\x9a\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00archive/data.pklPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xea\x00\x00\x00archive/byteorderPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00F\xd1\x87\xa3$\x00
\x00\x00$\x00\x00\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00V\x01\x00\x00archive/data/0PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf4\x01\x00\x00archive/versionPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x87\x89*\x93(\x00\x00\x00(\x00\x00\x00\x1e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00R\x02\x00\x00archive/.data/serialization_idPK\x06\x06,\x00\x00\x00\x00\x00\x00\x00\x1e\x03-\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00B\x01\x00\x00\x00\x00\x00\x00\xf8\x02\x00\x00\x00\x00\x00\x00PK\x06\x07\x00\x00\x00\x00:\x04\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00PK\x05\x06\x00\x00\x00\x00\x05\x00\x05\x00B\x01\x00\x00\xf8\x02\x00\x00\x00\x00" + expected = torch.load(io.BytesIO(expected_bytes)) + torch.testing.assert_allclose(output, expected) + + +########### +# FLAN-T5 # +########### + + +@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="No CUDA or XPU device available.") +def test_flan_t5_4bit(): + torch.manual_seed(0) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_compute_dtype=torch.float32, + ) + model = AutoModelForSeq2SeqLM.from_pretrained( + "google/flan-t5-base", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ) + + input = torch.LongTensor([[1, 0, 1, 0, 1, 2]]).to(device) + with torch.no_grad(): + output = model.generate(input_ids=input, return_dict_in_generate=True, output_scores=True) + output = output.scores[0][0, :10].detach().cpu() + + expected_bytes = 
b"PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x12\x00archive/data.pklFB\x0e\x00ZZZZZZZZZZZZZZ\x80\x02ctorch._utils\n_rebuild_tensor_v2\nq\x00((X\x07\x00\x00\x00storageq\x01ctorch\nFloatStorage\nq\x02X\x01\x00\x00\x000q\x03X\x03\x00\x00\x00cpuq\x04K\ntq\x05QK\x00K\n\x85q\x06K\x01\x85q\x07\x89ccollections\nOrderedDict\nq\x08)Rq\ttq\nRq\x0b.PK\x07\x08\x19\xea\x16n\x96\x00\x00\x00\x96\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x11\x00+\x00archive/byteorderFB'\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZlittlePK\x07\x08\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e\x00>\x00archive/data/0FB:\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZpb\x0f\xc2\x91\xa3\x85\xc0\x86\xee\x83\xc0\xae\xea\xdc?F\xad-\xc1\xe4*k\xc0\x12\x84\x86\xc09\xf9\xc8\xc0|\x861\xc0m\xf7\x0c\xc1PK\x07\x08\xf1y:\xda(\x00\x00\x00(\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x1b\x00archive/versionFB\x17\x00ZZZZZZZZZZZZZZZZZZZZZZZ3\nPK\x07\x08\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1e\x002\x00archive/.data/serialization_idFB.\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ0576858857385996278200001223527302082336PK\x07\x08~n}q(\x00\x00\x00(\x00\x00\x00PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x19\xea\x16n\x96\x00\x00\x00\x96\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00archive/data.pklPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xe6\x00\x00\x00archive/byteorderPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xf1y:\xda(\
x00\x00\x00(\x00\x00\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00V\x01\x00\x00archive/data/0PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf8\x01\x00\x00archive/versionPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00~n}q(\x00\x00\x00(\x00\x00\x00\x1e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00R\x02\x00\x00archive/.data/serialization_idPK\x06\x06,\x00\x00\x00\x00\x00\x00\x00\x1e\x03-\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00B\x01\x00\x00\x00\x00\x00\x00\xf8\x02\x00\x00\x00\x00\x00\x00PK\x06\x07\x00\x00\x00\x00:\x04\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00PK\x05\x06\x00\x00\x00\x00\x05\x00\x05\x00B\x01\x00\x00\xf8\x02\x00\x00\x00\x00" + expected = torch.load(io.BytesIO(expected_bytes)) + torch.testing.assert_allclose(output, expected) + + +@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="No CUDA or XPU device available.") +@pytest.mark.xfail # might not be reproducible depending on hardware +def test_flan_t5_8bit(): + torch.manual_seed(0) + bnb_config = BitsAndBytesConfig(load_in_8bit=True) + model = AutoModelForSeq2SeqLM.from_pretrained( + "google/flan-t5-base", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ) + + input = torch.LongTensor([[1, 0, 1, 0, 1, 2]]).to(device) + with torch.no_grad(): + output = model.generate(input_ids=input, return_dict_in_generate=True, output_scores=True) + output = output.scores[0][0, :10].detach().cpu() + + expected_bytes = 
b"PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x12\x00archive/data.pklFB\x0e\x00ZZZZZZZZZZZZZZ\x80\x02ctorch._utils\n_rebuild_tensor_v2\nq\x00((X\x07\x00\x00\x00storageq\x01ctorch\nFloatStorage\nq\x02X\x01\x00\x00\x000q\x03X\x03\x00\x00\x00cpuq\x04K\ntq\x05QK\x00K\n\x85q\x06K\x01\x85q\x07\x89ccollections\nOrderedDict\nq\x08)Rq\ttq\nRq\x0b.PK\x07\x08\x19\xea\x16n\x96\x00\x00\x00\x96\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x11\x00+\x00archive/byteorderFB'\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZlittlePK\x07\x08\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e\x00>\x00archive/data/0FB:\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ\xebd)\xc2\xac\x1c\xba\xc0F\x0c\xbf\xc0v\\\x88?\x9f\x7fW\xc1H\xbd\xa0\xc0\xf4\xaf\xaf\xc0@:\x02\xc1\xbcjr\xc0\xf7\x95$\xc1PK\x07\x08\x12\xcc\x86\x12(\x00\x00\x00(\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x1b\x00archive/versionFB\x17\x00ZZZZZZZZZZZZZZZZZZZZZZZ3\nPK\x07\x08\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00PK\x03\x04\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1e\x002\x00archive/.data/serialization_idFB.\x00ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ0576858857385996278200001226216142756281PK\x07\x08\xa0Z\xf3\xd2(\x00\x00\x00(\x00\x00\x00PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x19\xea\x16n\x96\x00\x00\x00\x96\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00archive/data.pklPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x85=\xe3\x19\x06\x00\x00\x00\x06\x00\x00\x00\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xe6\x00\x00\x00archive/byteorderPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\x
12\xcc\x86\x12(\x00\x00\x00(\x00\x00\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00V\x01\x00\x00archive/data/0PK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xd1\x9egU\x02\x00\x00\x00\x02\x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf8\x01\x00\x00archive/versionPK\x01\x02\x00\x00\x00\x00\x08\x08\x00\x00\x00\x00\x00\x00\xa0Z\xf3\xd2(\x00\x00\x00(\x00\x00\x00\x1e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00R\x02\x00\x00archive/.data/serialization_idPK\x06\x06,\x00\x00\x00\x00\x00\x00\x00\x1e\x03-\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00B\x01\x00\x00\x00\x00\x00\x00\xf8\x02\x00\x00\x00\x00\x00\x00PK\x06\x07\x00\x00\x00\x00:\x04\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00PK\x05\x06\x00\x00\x00\x00\x05\x00\x05\x00B\x01\x00\x00\xf8\x02\x00\x00\x00\x00" + expected = torch.load(io.BytesIO(expected_bytes)) + torch.testing.assert_allclose(output, expected) diff --git a/peft/tests/conftest.py b/peft/tests/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..866ae8853675eefcb995222970a720b9bd5c7ba1 --- /dev/null +++ b/peft/tests/conftest.py @@ -0,0 +1,86 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import platform +import re + +import pytest + + +def pytest_addoption(parser): + parser.addoption("--regression", action="store_true", default=False, help="run regression tests") + + +def pytest_configure(config): + config.addinivalue_line("markers", "regression: mark regression tests") + + # Errors from transformers deprecations + logger = logging.getLogger("transformers") + + class ErrorOnDeprecation(logging.Handler): + def emit(self, record): + msg = record.getMessage().lower() + if "deprecat" in msg or "future" in msg: + if "torch_dtype" not in msg: + # let's ignore the torch_dtype => dtype deprecation for now + raise AssertionError(f"**Transformers Deprecation**: {msg}") + + # Add our handler + handler = ErrorOnDeprecation() + logger.addHandler(handler) + logger.setLevel(logging.WARNING) + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--regression"): + return + + skip_regression = pytest.mark.skip(reason="need --regression option to run regression tests") + for item in items: + if "regression" in item.keywords: + item.add_marker(skip_regression) + + +# TODO: remove this once support for PyTorch 2.2 (the latest one still supported by GitHub MacOS x86_64 runners) is +# dropped, or if MacOS is removed from the test matrix, see https://github.com/huggingface/peft/issues/2431. +# Note: the function name is fixed by the pytest plugin system, don't change it +@pytest.hookimpl(hookwrapper=True) +def pytest_runtest_makereport(item, call): + """ + Plug into the pytest test report generation to skip a specific MacOS failure caused by transformers. + + The error was introduced by https://github.com/huggingface/transformers/pull/37785, which results in torch.load + failing when using torch < 2.6. + + Since the MacOS x86 runners need to use an older torch version, those steps are necessary to get the CI green. 
+ """ + outcome = yield + rep = outcome.get_result() + # ref: + # https://github.com/huggingface/transformers/blob/858ce6879a4aa7fa76a7c4e2ac20388e087ace26/src/transformers/utils/import_utils.py#L1418 + error_msg = re.compile(r"Due to a serious vulnerability issue in `torch.load`") + + # notes: + # - pytest uses hard-coded strings, we cannot import and use constants + # https://docs.pytest.org/en/stable/reference/reference.html#pytest.TestReport + # - errors can happen during call (running the test) but also setup (e.g. in fixtures) + if rep.failed and (rep.when in ("setup", "call")) and (platform.system() == "Darwin"): + exc_msg = str(call.excinfo.value) + if error_msg.search(exc_msg): + # turn this failure into an xfail: + rep.outcome = "skipped" + # for this attribute, see: + # https://github.com/pytest-dev/pytest/blob/bd6877e5874b50ee57d0f63b342a67298ee9a1c3/src/_pytest/reports.py#L266C5-L266C13 + rep.wasxfail = "Error known to occur on MacOS with older torch versions, won't be fixed" diff --git a/peft/tests/regression/__init__.py b/peft/tests/regression/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/peft/tests/regression/test_regression.py b/peft/tests/regression/test_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..0e999971056118282455f5b82e98f95701fd8d17 --- /dev/null +++ b/peft/tests/regression/test_regression.py @@ -0,0 +1,665 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Regression testing: check that checkpoints from previous PEFT versions still return the same values. +# +# For normal regression testing, just run: +# +# `pytest tests/regression/test_regression.py -s --regression` +# +# Add `-s` to show potentially useful debugging information. `--regression` is a custom marker that is required for +# regression tests not to be skipped. +# +# To create new regression tests, run: +# `HF_TOKEN= REGRESSION_CREATION_MODE=True pytest tests/regression/test_regression.py -s --regression` +# +# This will *fail* if: +# +# 1. the git worktree is dirty +# 2. the git commit is not tagged +# +# Note: A Hugging Face Hub token is required to upload the regression artifacts to our +# https://huggingface.co/peft-internal-testing repo. This can be done by anyone with write access to the repo but +# apparently it is not possible to create a technical token with write access. +# +# This is important to ensure that the regression artifacts correspond to a specific released version of PEFT. +# Therefore, it is recommended to checkout the tag before running the regression tests, e.g. by running: +# +# `git checkout v0.1.0` +# +# To override these checks, run: +# ``HF_TOKEN= REGRESSION_CREATION_MODE=True REGRESSION_FORCE_MODE=True pytest tests/regression/test_regression.py -s --regression` +# +# In REGRESSION_CREATION_MODE, one directory will be created in tests/regression/// for each +# test. This will contain the saved adapter, as well as the output of the test of the model for that version. +# +# In normal testing mode, the saved adapter and output for each version found in the directory +# tests/regression// will be loaded and compared to the current output. +# +# When implementing new tests, check the existing ones as well as the description in the docstring of RegressionTester. 
+ +import os +import shutil +import subprocess +import sys +import tempfile +import unittest + +import pytest +import torch +from huggingface_hub import snapshot_download, upload_folder +from torch import nn +from transformers import AutoModelForCausalLM, BitsAndBytesConfig +from transformers.pytorch_utils import Conv1D + +import peft +from peft import ( + AdaLoraConfig, + BOFTConfig, + IA3Config, + LNTuningConfig, + LoHaConfig, + LoKrConfig, + LoraConfig, + PeftModel, + VBLoRAConfig, + VeraConfig, + get_peft_model, +) +from peft.utils import infer_device + +from ..testing_utils import require_bitsandbytes, require_deterministic_for_xpu, require_non_cpu + + +PEFT_VERSION = peft.__version__ +REGRESSION_DIR = tempfile.mkdtemp(prefix="peft_regression_") +HF_TOKEN = os.environ.get("HF_TOKEN") +# the repo has to be created manually once, it is not automatically created +HF_REPO = "peft-internal-testing/regression-tests" + + +@pytest.fixture(scope="session", autouse=True) +def setup_tearndown(): + # Use a pytest session-scoped fixture to setup and teardown exactly once per session. 
AFAICT, unittest does not + # provide such a feature + + # download regression artifacts from Hugging Face Hub at the start + snapshot_download(repo_id=HF_REPO, local_dir=REGRESSION_DIR) + yield + + # delete regression artifacts at the end of the test session; optionally, upload them first if in creation mode + creation_mode = strtobool(os.environ.get("REGRESSION_CREATION_MODE", "False")) + if creation_mode: + # upload the regression directory to Hugging Face Hub, will overwrite by default + upload_folder( + repo_id=HF_REPO, + folder_path=REGRESSION_DIR, + token=HF_TOKEN, + ) + + shutil.rmtree(REGRESSION_DIR) + + +def strtobool(val): + """Copied from distutils.util""" + val = val.lower() + if val in ("y", "yes", "t", "true", "on", "1"): + return 1 + elif val in ("n", "no", "f", "false", "off", "0"): + return 0 + else: + raise ValueError(f"invalid truth value {val!r}") + + +def save_output(output, name, force=False): + path = os.path.join(REGRESSION_DIR, name, PEFT_VERSION) + filename = os.path.join(path, "output.pt") + if os.path.exists(filename) and not force: + return + + if not os.path.exists(path): + os.makedirs(path) + + if os.path.exists(filename) and force: + print(f"Overriding existing output in {filename}", file=sys.stderr) + + torch.save(output, filename) + + +def save_model(model, name, force=False): + path = os.path.join(REGRESSION_DIR, name, PEFT_VERSION) + filename = os.path.join(path, peft.utils.SAFETENSORS_WEIGHTS_NAME) + if os.path.exists(filename) and not force: + return + + if not os.path.exists(path): + os.makedirs(path) + + if os.path.exists(filename) and force: + print(f"Overriding existing model in {path}", file=sys.stderr) + + model.save_pretrained(path) + + +def load_output(name): + filename = os.path.join(REGRESSION_DIR, name, "output.pt") + return torch.load(filename, map_location=infer_device()) + + +@pytest.mark.regression +class RegressionTester(unittest.TestCase): + """Base class for regression testing + + Child classes must call 
assert_results_equal_or_store and pass the model outtput, as well as a unique name that + describes the setting (e.g. "lora_opt-350m_bnb_4bit"). They also need to implement get_output(model) to get the + model output, and load_base_model(name) to load the base model. Don't forget to fix the seed in load_base_model. + """ + + torch_device = infer_device() + + def setUp(self): + self.tol = 1e-4 + self.creation_mode = strtobool(os.environ.get("REGRESSION_CREATION_MODE", "False")) + self.force_mode = strtobool(os.environ.get("REGRESSION_FORCE_MODE", "False")) + if self.force_mode and not self.creation_mode: + raise RuntimeError("REGRESSION_FORCE_MODE can only be used together with REGRESSION_CREATION_MODE") + if self.creation_mode: + self.check_clean_git_status(self.force_mode) + if HF_TOKEN is None: + raise RuntimeError("HF_TOKEN environment variable must be set in creation mode") + + def fix_seed(self): + torch.manual_seed(0) + + def check_clean_git_status(self, force): + """Ensure that worktree is not dirty and version tag is checked out""" + # check that the worktree is clean + try: + subprocess.check_output(["git", "diff", "--quiet", "HEAD"]) + except subprocess.CalledProcessError as exc: + if force: + print("Overriding despite dirty git worktree", file=sys.stderr) + else: + raise RuntimeError("Git worktree is dirty") from exc + + # check that the commit is tagged + try: + subprocess.check_output(["git", "describe", "--exact-match", "HEAD"]) + except subprocess.CalledProcessError as exc: + if force: + print("Overriding despite non-tagged commit", file=sys.stderr) + else: + raise RuntimeError("Git commit is not tagged") from exc + + @require_deterministic_for_xpu + def assert_results_equal_or_store(self, model, name): + """Check if the outputs are the same or save the outputs if in creation mode.""" + if not self.creation_mode: # normal regression testing mode + self._assert_results_equal(name) + else: + output = self.get_output(model) + if not 
torch.isfinite(output).all(): + raise RuntimeError(f"Model output for {name} is not finite") + + output2 = self.get_output(model) + if not torch.allclose(output, output2): + raise RuntimeError(f"Model output for {name} is not deterministic") + + save_output(output, name, force=self.force_mode) + save_model(model, name, force=self.force_mode) + + def _assert_results_equal(self, name): + path = os.path.join(REGRESSION_DIR, name) + versions = os.listdir(path) + for version in versions: # each directory corresponds to a version + output_loaded = load_output(os.path.join(name, version)) + base_model = self.load_base_model() + model = PeftModel.from_pretrained(base_model, os.path.join(path, version)) + output = self.get_output(model) + assert torch.allclose(output_loaded, output, atol=self.tol, rtol=self.tol) + + def get_output(self, model): + raise NotImplementedError + + def load_base_model(self): + raise NotImplementedError + + +############## +# TEST CASES # +############## + + +class TestMlp(RegressionTester): + def get_output(self, model): + input = torch.arange(90).reshape(9, 10).to(self.torch_device) + with torch.inference_mode(): + output = model(input) + return output + + def load_base_model(self): + class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.relu = nn.ReLU() + self.lin1 = nn.Linear(20, 2, bias=bias) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = X.float() + X = self.lin0(X) + X = self.relu(X) + X = self.lin1(X) + X = self.sm(X) + return X + + self.fix_seed() + return MLP().to(self.torch_device) + + def test_lora(self): + base_model = self.load_base_model() + config = LoraConfig( + r=8, + init_lora_weights=False, + target_modules=["lin0"], + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "lora_mlp") + + def test_lora_dora(self): + base_model = self.load_base_model() + config = LoraConfig( + r=8, + 
init_lora_weights=False, + target_modules=["lin0"], + use_dora=True, + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "lora_dora_mlp") + + def test_adalora(self): + base_model = self.load_base_model() + config = AdaLoraConfig( + r=8, + init_lora_weights=False, + target_modules=["lin0"], + total_step=1, + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "adalora_mlp") + + def test_ia3(self): + base_model = self.load_base_model() + config = IA3Config( + init_ia3_weights=False, + target_modules=["lin0"], + feedforward_modules=["lin0"], + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "ia3_mlp") + + def test_ia3_no_ff(self): + base_model = self.load_base_model() + config = IA3Config( + init_ia3_weights=False, + target_modules=["lin0"], + feedforward_modules=[], + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "ia3_no_ff_mlp") + + def test_loha(self): + # TODO + self.skipTest("Skipping LoHa for now because init is not seedable") + base_model = self.load_base_model() + config = LoHaConfig( + r=8, + init_weights=False, + target_modules=["lin0"], + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "loha_mlp") + + def test_lokr(self): + # TODO + self.skipTest("Skipping LoKr for now because init is not seedable") + base_model = self.load_base_model() + config = LoKrConfig( + r=8, + target_modules=["lin0"], + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "lokr_mlp") + + def test_lora_modules_to_save(self): + base_model = self.load_base_model() + config = LoraConfig( + r=8, + init_lora_weights=False, + target_modules=["lin0"], + modules_to_save=["lin1"], + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "lora_mlp_modules_to_save") + + def test_boft(self): + base_model = 
self.load_base_model() + config = BOFTConfig( + boft_block_size=2, + target_modules=["lin0"], + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "boft_mlp") + + def test_ln_tuning(self): + base_model = self.load_base_model() + config = LNTuningConfig(target_modules=["lin0"]) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "ln_tuning_mlp") + + def test_vera_tuning(self): + base_model = self.load_base_model() + config = VeraConfig(target_modules=["lin0"]) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "vera_tuning_mlp") + + def test_vblora_tuning(self): + base_model = self.load_base_model() + config = VBLoRAConfig( + vector_length=1, + num_vectors=2, + target_modules=["lin0"], + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "vblora_tuning_mlp") + + +class TestLoraEmbConv1D(RegressionTester): + def get_output(self, model): + input = torch.arange(90).reshape(9, 10).to(self.torch_device) + with torch.inference_mode(): + output = model(input) + return output + + def load_base_model(self): + class ModelEmbConv1D(nn.Module): + def __init__(self): + super().__init__() + self.emb = nn.Embedding(100, 5) + self.conv1d = Conv1D(1, 5) + self.relu = nn.ReLU() + self.flat = nn.Flatten() + self.lin0 = nn.Linear(10, 2) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = self.emb(X) + X = self.conv1d(X) + X = self.relu(X) + X = self.flat(X) + X = self.lin0(X) + X = self.sm(X) + return X + + self.fix_seed() + return ModelEmbConv1D().to(self.torch_device) + + def test_lora(self): + base_model = self.load_base_model() + config = LoraConfig( + r=8, + init_lora_weights=False, + target_modules=["emb", "conv1d"], + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "lora_emb_conv1d") + + +class TestLoraConv2D(RegressionTester): + def get_output(self, model): + input = 
torch.arange(90).reshape(9, 10).to(self.torch_device) + with torch.inference_mode(): + output = model(input) + return output + + def load_base_model(self): + class ModelConv2D(nn.Module): + def __init__(self): + super().__init__() + self.conv2d = nn.Conv2d(5, 10, 3) + self.relu = nn.ReLU() + self.flat = nn.Flatten() + self.lin0 = nn.Linear(10, 2) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = X.float().reshape(2, 5, 3, 3) + X = self.conv2d(X) + X = self.relu(X) + X = self.flat(X) + X = self.lin0(X) + X = self.sm(X) + return X + + self.fix_seed() + return ModelConv2D().to(self.torch_device) + + def test_lora(self): + base_model = self.load_base_model() + config = LoraConfig( + r=8, + init_lora_weights=False, + target_modules=["conv2d"], + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "lora_conv2d") + + def test_ia3(self): + base_model = self.load_base_model() + config = IA3Config( + init_ia3_weights=False, + target_modules=["conv2d"], + feedforward_modules=["conv2d"], + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "ia3_conv2d") + + def test_loha(self): + # TODO + self.skipTest("Skipping LoHa for now because init is not seedable") + base_model = self.load_base_model() + config = LoHaConfig( + r=8, + init_weights=False, + target_modules=["conv2d"], + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "loha_conv2d") + + def test_lokr(self): + # TODO + self.skipTest("Skipping LoKr for now because init is not seedable") + base_model = self.load_base_model() + config = LoKrConfig( + r=8, + init_weights=False, + target_modules=["conv2d"], + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "lokr_conv2d") + + def test_boft(self): + base_model = self.load_base_model() + config = BOFTConfig( + boft_block_size=3, + target_modules=["conv2d"], + ) + model = get_peft_model(base_model, config) 
+ self.assert_results_equal_or_store(model, "boft_conv2d") + + +class TestOpt(RegressionTester): + def get_output(self, model): + input = torch.LongTensor([[1, 0, 1, 0, 1, 2]]).to(self.torch_device) + with torch.inference_mode(): + output = model(input).logits + return output + + def load_base_model(self): + self.fix_seed() + return AutoModelForCausalLM.from_pretrained("facebook/opt-350m").to(self.torch_device) + + def test_lora(self): + base_model = self.load_base_model() + config = LoraConfig( + r=8, + init_lora_weights=False, + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "lora_opt-350m") + + def test_adalora(self): + base_model = self.load_base_model() + config = AdaLoraConfig( + r=8, + init_lora_weights=False, + total_step=1, + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "adalora_opt-350m") + + def test_ia3(self): + base_model = self.load_base_model() + config = IA3Config(init_ia3_weights=False) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "ia3_opt-350m") + + +@require_non_cpu +@require_bitsandbytes +class TestOpt8bitBnb(RegressionTester): + def get_output(self, model): + input = torch.LongTensor([[1, 0, 1, 0, 1, 2]]).to(self.torch_device) + with torch.inference_mode(): + output = model(input).logits + return output + + def load_base_model(self): + self.fix_seed() + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-350m", + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + ) + return model + + def test_lora_8bit(self): + # Warning: bnb results can vary significantly depending on the GPU. Therefore, if there is a change in GPU used + # in the CI, the test can fail without any code change. In that case, delete the regression artifact and create + # a new one using the new GPU. 
+ base_model = self.load_base_model() + config = LoraConfig( + r=8, + init_lora_weights=False, + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "lora_opt-350m_bnb_8bit") + + def test_adalora(self): + # TODO + self.skipTest( + "Skipping AdaLora for now, getting TypeError: unsupported operand type(s) for +=: 'dict' and 'Tensor'" + ) + # Warning: bnb results can vary significantly depending on the GPU. Therefore, if there is a change in GPU used + # in the CI, the test can fail without any code change. In that case, delete the regression artifact and create + # a new one using the new GPU. + base_model = self.load_base_model() + config = AdaLoraConfig( + init_r=6, + target_r=4, + tinit=50, + tfinal=100, + total_step=200, + deltaT=5, + beta1=0.3, + beta2=0.3, + orth_reg_weight=0.2, + lora_alpha=32, + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "adalora_opt-350m_8bit") + + +@require_non_cpu +@require_bitsandbytes +class TestOpt4bitBnb(RegressionTester): + def get_output(self, model): + input = torch.LongTensor([[1, 0, 1, 0, 1, 2]]).to(self.torch_device) + with torch.inference_mode(): + output = model(input).logits + return output + + def load_base_model(self): + self.fix_seed() + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_compute_dtype=torch.float32, + ) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-350m", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ) + return model + + def test_lora_4bit(self): + # Warning: bnb results can vary significantly depending on the GPU. Therefore, if there is a change in GPU used + # in the CI, the test can fail without any code change. In that case, delete the regression artifact and create + # a new one using the new GPU. 
+ base_model = self.load_base_model() + config = LoraConfig( + r=8, + init_lora_weights=False, + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "lora_opt-350m_bnb_4bit") + + def test_adalora(self): + # TODO + self.skipTest("Skipping AdaLora for now because of a bug, see #1113") + # Warning: bnb results can vary significantly depending on the GPU. Therefore, if there is a change in GPU used + # in the CI, the test can fail without any code change. In that case, delete the regression artifact and create + # a new one using the new GPU. + base_model = self.load_base_model() + config = AdaLoraConfig( + init_r=6, + target_r=4, + tinit=50, + tfinal=100, + total_step=200, + deltaT=5, + beta1=0.3, + beta2=0.3, + orth_reg_weight=0.2, + lora_alpha=32, + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + model = get_peft_model(base_model, config) + self.assert_results_equal_or_store(model, "adalora_opt-350m_4bit") diff --git a/peft/tests/test_adaption_prompt.py b/peft/tests/test_adaption_prompt.py new file mode 100644 index 0000000000000000000000000000000000000000..09e020e0f3918a26e832f0d8ce9cc84cb8ec1469 --- /dev/null +++ b/peft/tests/test_adaption_prompt.py @@ -0,0 +1,416 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import tempfile + +import pytest +import torch +from torch.testing import assert_close +from transformers import AutoModelForCausalLM + +from peft import get_peft_model +from peft.peft_model import PeftModel +from peft.tuners.adaption_prompt import AdaptionPromptConfig +from peft.utils import infer_device +from peft.utils.other import prepare_model_for_kbit_training +from peft.utils.save_and_load import get_peft_model_state_dict + + +MODELS_TO_TEST = [ + "hf-internal-testing/tiny-random-gpt2", + "trl-internal-testing/tiny-random-LlamaForCausalLM", + "hf-internal-testing/tiny-random-MistralForCausalLM", +] + + +class TestAdaptionPrompt: + """ + Tests for the AdaptionPrompt model. + + Some of these tests were adapted from `test_peft_model.py` (which has been refactored since), but since we haven't + checked in the test checkpoints for Llama into `hf-internal-testing`, we separate them for now. + """ + + transformers_class = AutoModelForCausalLM + torch_device = infer_device() + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_attributes(self, model_id): + model = self.transformers_class.from_pretrained(model_id) + config = AdaptionPromptConfig(adapter_layers=1, adapter_len=4) + model = get_peft_model(model, config) + + assert hasattr(model, "save_pretrained") + assert hasattr(model, "from_pretrained") + assert hasattr(model, "push_to_hub") + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_prepare_for_training(self, model_id): + model = self.transformers_class.from_pretrained(model_id) + config = AdaptionPromptConfig(adapter_layers=1, adapter_len=4, task_type="CAUSAL_LM") + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + dummy_input = torch.LongTensor([[1, 1, 1]]).to(self.torch_device) + dummy_output = model.get_input_embeddings()(dummy_input) + + assert not dummy_output.requires_grad + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_prepare_for_int8_training(self, 
model_id): + model = self.transformers_class.from_pretrained(model_id) + model = prepare_model_for_kbit_training(model) + model = model.to(self.torch_device) + + for param in model.parameters(): + assert not param.requires_grad + + config = AdaptionPromptConfig(adapter_layers=1, adapter_len=4, task_type="CAUSAL_LM") + model = get_peft_model(model, config) + + # For backward compatibility + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + dummy_input = torch.LongTensor([[1, 1, 1]]).to(self.torch_device) + dummy_output = model.get_input_embeddings()(dummy_input) + + assert dummy_output.requires_grad + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_save_pretrained_regression(self, model_id): + seed = 420 + torch.manual_seed(seed) + model = self.transformers_class.from_pretrained(model_id) + config = AdaptionPromptConfig(adapter_layers=2, adapter_len=4, task_type="CAUSAL_LM") + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + with tempfile.TemporaryDirectory() as tmp_dirname: + model.save_pretrained(tmp_dirname, safe_serialization=False) + + torch.manual_seed(seed) + model_from_pretrained = self.transformers_class.from_pretrained(model_id) + model_from_pretrained = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname) + + # check if the state dicts are equal + state_dict = get_peft_model_state_dict(model) + state_dict_from_pretrained = get_peft_model_state_dict(model_from_pretrained) + + # check if same keys + assert state_dict.keys() == state_dict_from_pretrained.keys() + + # Check that the number of saved parameters is 4 -- 2 layers of (tokens and gate). 
+ assert len(state_dict) == 4 + + # check if tensors equal + for key in state_dict.keys(): + assert torch.allclose( + state_dict[key].to(self.torch_device), state_dict_from_pretrained[key].to(self.torch_device) + ) + + # check if `adapter_model.bin` is present + assert os.path.exists(os.path.join(tmp_dirname, "adapter_model.bin")) + + # check if `adapter_config.json` is present + assert os.path.exists(os.path.join(tmp_dirname, "adapter_config.json")) + + # check if `model.safetensors` is not present + assert not os.path.exists(os.path.join(tmp_dirname, "model.safetensors")) + + # check if `config.json` is not present + assert not os.path.exists(os.path.join(tmp_dirname, "config.json")) + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_save_pretrained(self, model_id): + seed = 420 + torch.manual_seed(seed) + model = self.transformers_class.from_pretrained(model_id) + config = AdaptionPromptConfig(adapter_layers=2, adapter_len=4, task_type="CAUSAL_LM") + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + with tempfile.TemporaryDirectory() as tmp_dirname: + model.save_pretrained(tmp_dirname) + + torch.manual_seed(seed) + model_from_pretrained = self.transformers_class.from_pretrained(model_id) + model_from_pretrained = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname) + + # check if the state dicts are equal + state_dict = get_peft_model_state_dict(model) + state_dict_from_pretrained = get_peft_model_state_dict(model_from_pretrained) + + # check if same keys + assert state_dict.keys() == state_dict_from_pretrained.keys() + + # Check that the number of saved parameters is 4 -- 2 layers of (tokens and gate). 
+ assert len(state_dict) == 4 + + # check if tensors equal + for key in state_dict.keys(): + assert torch.allclose( + state_dict[key].to(self.torch_device), state_dict_from_pretrained[key].to(self.torch_device) + ) + + # check if `adapter_model.safetensors` is present + assert os.path.exists(os.path.join(tmp_dirname, "adapter_model.safetensors")) + + # check if `adapter_config.json` is present + assert os.path.exists(os.path.join(tmp_dirname, "adapter_config.json")) + + # check if `model.safetensors` is not present + assert not os.path.exists(os.path.join(tmp_dirname, "model.safetensors")) + + # check if `config.json` is not present + assert not os.path.exists(os.path.join(tmp_dirname, "config.json")) + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_save_pretrained_selected_adapters(self, model_id): + seed = 420 + torch.manual_seed(seed) + model = self.transformers_class.from_pretrained(model_id) + config = AdaptionPromptConfig(adapter_layers=2, adapter_len=4, task_type="CAUSAL_LM") + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + new_adapter_config = AdaptionPromptConfig(adapter_layers=2, adapter_len=4, task_type="CAUSAL_LM") + model.add_adapter("new_adapter", new_adapter_config) + + with tempfile.TemporaryDirectory() as tmp_dirname: + model.save_pretrained(tmp_dirname) + + torch.manual_seed(seed) + model_from_pretrained = self.transformers_class.from_pretrained(model_id) + model_from_pretrained = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname) + + model_from_pretrained.load_adapter(tmp_dirname, "new_adapter") + + # check if the state dicts are equal + state_dict = get_peft_model_state_dict(model) + state_dict_from_pretrained = get_peft_model_state_dict(model_from_pretrained) + + # check if same keys + assert state_dict.keys() == state_dict_from_pretrained.keys() + + # Check that the number of saved parameters is 4 -- 2 layers of (tokens and gate). 
+ assert len(state_dict) == 4 + + # check if tensors equal + for key in state_dict.keys(): + assert torch.allclose( + state_dict[key].to(self.torch_device), state_dict_from_pretrained[key].to(self.torch_device) + ) + + # check if `adapter_model.safetensors` is present + assert os.path.exists(os.path.join(tmp_dirname, "adapter_model.safetensors")) + + # check if `adapter_config.json` is present + assert os.path.exists(os.path.join(tmp_dirname, "adapter_config.json")) + + # check if `model.safetensors` is not present + assert not os.path.exists(os.path.join(tmp_dirname, "model.safetensors")) + + # check if `config.json` is not present + assert not os.path.exists(os.path.join(tmp_dirname, "config.json")) + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_generate(self, model_id): + model = self.transformers_class.from_pretrained(model_id) + config = AdaptionPromptConfig(adapter_layers=2, adapter_len=4, task_type="CAUSAL_LM") + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + input_ids = torch.LongTensor([[1, 1, 1], [2, 1, 2]]).to(self.torch_device) + attention_mask = torch.LongTensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) + + # check if `generate` works + _ = model.generate(input_ids=input_ids, attention_mask=attention_mask) + + # check if `generate` works if positional arguments are passed + _ = model.generate(input_ids, attention_mask=attention_mask) + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_sequence_adapter_ops(self, model_id): + """Test sequence of adapter operations.""" + # Test input data. + input_ids = torch.LongTensor([[1, 1, 1], [2, 1, 2]]).to(self.torch_device) + target_ids = torch.LongTensor([[0, 0, 0], [0, 0, 0]]).to(self.torch_device) + attention_mask = torch.LongTensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) + + # Create original llama model. 
+ original = self.transformers_class.from_pretrained(model_id) + original = original.to(self.torch_device) + original_before = original(input_ids=input_ids, attention_mask=attention_mask) + + # Get AdaptionPrompt model. + adapted = get_peft_model( + original, AdaptionPromptConfig(adapter_layers=2, adapter_len=4, task_type="CAUSAL_LM") + ) + adapted = adapted.to(self.torch_device) + default_before = adapted(input_ids=input_ids, attention_mask=attention_mask, labels=target_ids) + + # Test zero-init: The logits should be exactly the same. + assert_close(original_before.logits, default_before.logits, rtol=0, atol=0) + + # Single fine-tuning step on "default" adapter. + optimizer = torch.optim.SGD(adapted.parameters(), lr=1) + optimizer.zero_grad() + default_before.loss.backward() + optimizer.step() + + # Test that the output changed. + default_after = adapted(input_ids=input_ids, attention_mask=attention_mask, labels=target_ids) + assert not torch.allclose(default_before.logits, default_after.logits) + + with adapted.disable_adapter(): + # Test that the output is the same as the original output. + default_disabled = adapted(input_ids=input_ids, attention_mask=attention_mask, labels=target_ids) + assert_close(original_before.logits, default_disabled.logits, rtol=0, atol=0) + + # Add new adapter 1. + adapted.add_adapter("adapter 1", AdaptionPromptConfig(adapter_layers=2, adapter_len=8, task_type="CAUSAL_LM")) + # Test zero-init + adapter_1_before = adapted(input_ids=input_ids, attention_mask=attention_mask, labels=target_ids) + assert_close(original_before.logits, adapter_1_before.logits, rtol=0, atol=0) + + # Single fine-tuning step on adapter 1. + optimizer = torch.optim.SGD(adapted.parameters(), lr=1) + optimizer.zero_grad() + adapter_1_before.loss.backward() + optimizer.step() + + # Test that adapter 1 output changed. 
+ adapter_1_after = adapted(input_ids=input_ids, attention_mask=attention_mask, labels=target_ids) + assert not torch.allclose(adapter_1_before.logits, adapter_1_after.logits) + assert not torch.allclose(original_before.logits, adapter_1_after.logits) + assert not torch.allclose(default_after.logits, adapter_1_after.logits) + + with adapted.disable_adapter(): + # Test that the output is the same as the original output. + adapter_1_disabled = adapted(input_ids=input_ids, attention_mask=attention_mask, labels=target_ids) + assert_close(original_before.logits, adapter_1_disabled.logits, rtol=0, atol=0) + + # Set adapter back to default. + adapted.set_adapter("default") + + # Test that the output is the same as the default output after training. + default_after_set = adapted(input_ids=input_ids, attention_mask=attention_mask, labels=target_ids) + assert_close(default_after.logits, default_after_set.logits, rtol=0, atol=0) + assert not torch.allclose(original_before.logits, default_after_set.logits) + assert not torch.allclose(adapter_1_after.logits, default_after_set.logits) + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_add_and_set_while_disabled(self, model_id): + """Test that adding and setting adapters while disabled works as intended.""" + # Test input data. + input_ids = torch.LongTensor([[1, 1, 1], [2, 1, 2]]).to(self.torch_device) + target_ids = torch.LongTensor([[0, 0, 0], [0, 0, 0]]).to(self.torch_device) + attention_mask = torch.LongTensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) + + # Create original llama model. + original = self.transformers_class.from_pretrained(model_id) + original = original.to(self.torch_device) + original_before = original(input_ids=input_ids, attention_mask=attention_mask) + + # Get AdaptionPrompt model. 
+ adapted = get_peft_model( + original, AdaptionPromptConfig(adapter_layers=2, adapter_len=4, task_type="CAUSAL_LM") + ) + adapted = adapted.to(self.torch_device) + + with adapted.disable_adapter(): + adapted.add_adapter( + "adapter 1", AdaptionPromptConfig(adapter_layers=2, adapter_len=8, task_type="CAUSAL_LM") + ) + + # Test that the output is the same as the original output. + adapter_1_before = adapted(input_ids=input_ids, attention_mask=attention_mask, labels=target_ids) + assert_close(original_before.logits, adapter_1_before.logits, rtol=0, atol=0) + + # Single fine-tuning step on adapter 1. + optimizer = torch.optim.SGD(adapted.parameters(), lr=1) + optimizer.zero_grad() + adapter_1_before.loss.backward() + optimizer.step() + + # Test that adapter 1 output changed. + adapter_1_after = adapted(input_ids=input_ids, attention_mask=attention_mask, labels=target_ids) + assert not torch.allclose(original_before.logits, adapter_1_after.logits) + + adapted.set_adapter("default") + with adapted.disable_adapter(): + adapted.set_adapter("adapter 1") + + # Test that adapter 1 is active again. + adapter_1_after_set = adapted(input_ids=input_ids, attention_mask=attention_mask, labels=target_ids) + assert_close(adapter_1_after.logits, adapter_1_after_set.logits, rtol=0, atol=0) + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_use_cache(self, model_id): + """Test that AdaptionPrompt works when Llama config use_cache=True.""" + torch.manual_seed(0) + input_ids = torch.LongTensor([[1, 1, 1], [2, 1, 2]]).to(self.torch_device) + original = self.transformers_class.from_pretrained(model_id, use_cache=False) + adapted = get_peft_model( + original, AdaptionPromptConfig(adapter_layers=2, adapter_len=4, task_type="CAUSAL_LM") + ) + adapted = adapted.to(self.torch_device) + expected = adapted.generate(input_ids=input_ids, max_length=8) + + # Set use_cache = True and generate output again. 
+ adapted.base_model.config.use_cache = True + actual = adapted.generate(input_ids=input_ids, max_length=8) + assert_close(expected, actual, rtol=0, atol=0) + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_bf16_inference(self, model_id): + if self.torch_device == "mps": + return pytest.skip("Skipping bf16 test on MPS") + + """Test that AdaptionPrompt works when Llama using a half-precision model.""" + input_ids = torch.LongTensor([[1, 1, 1], [2, 1, 2]]).to(self.torch_device) + original = self.transformers_class.from_pretrained(model_id, torch_dtype=torch.bfloat16) + adapted = get_peft_model( + original, AdaptionPromptConfig(adapter_layers=2, adapter_len=4, task_type="CAUSAL_LM") + ) + adapted = adapted.to(self.torch_device) + adapted.generate(input_ids=input_ids) # does not raise + + @pytest.mark.xfail(reason="currently this fails because scores are zeroed out", raises=AssertionError) + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_disable_adapter(self, model_id): + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + dummy_input = torch.LongTensor([[1, 1, 1]]).to(self.torch_device) + output_before = model(dummy_input).logits + + config = AdaptionPromptConfig(adapter_layers=1, adapter_len=4, task_type="CAUSAL_LM") + model = get_peft_model(model, config).to(self.torch_device) + output_peft = model(dummy_input).logits + # TODO currently this fails because scores are zeroed out: + # https://github.com/huggingface/peft/blob/062d95a09eb5d1de35c0e5e23d4387daba99e2db/src/peft/tuners/adaption_prompt.py#L303 + # This is fine for users but makes it difficult to test if anything happens. In the future, we will have a clean + # way to control initialization. Until then, this test is expected to fail. 
+ assert not torch.allclose(output_before, output_peft) + + with model.disable_adapter(): + output_peft_disabled = model(dummy_input).logits + assert torch.allclose(output_before, output_peft_disabled) diff --git a/peft/tests/test_arrow.py b/peft/tests/test_arrow.py new file mode 100644 index 0000000000000000000000000000000000000000..a6fab059ff528fa5e7ab283dea08ac06b9c0f269 --- /dev/null +++ b/peft/tests/test_arrow.py @@ -0,0 +1,509 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os +from pathlib import Path +from unittest.mock import patch + +import pytest +import torch +from transformers import AutoModelForCausalLM, AutoModelForImageClassification + +from peft import LoraConfig, get_peft_model +from peft.tuners.lora import ArrowConfig, create_arrow_model +from peft.tuners.lora.arrow import _resolve_adapter_source +from tests.testing_utils import hub_online_once + + +# ─── Fixtures ────────────────────────────────────────────────────────── + + +@pytest.fixture(scope="module") +def workdir(tmp_path_factory): + """ + Create a temp directory and chdir into it for the duration of the module. 
+ """ + wd = tmp_path_factory.mktemp("arrow_workdir") + old_cwd = os.getcwd() + os.chdir(wd) + yield Path(wd) + os.chdir(old_cwd) + # (pytest will auto-delete wd) + + +def _create_and_save_adapter(out_dir: Path, rank: int = 4): + """Helper: build a LoRA adapter around `model` and save into `out_dir`.""" + # fan_in_fan_out is set to True because of GPT2 model that we use to avoid warning + cfg = LoraConfig(r=rank, target_modules=["c_attn"], fan_in_fan_out=True, init_lora_weights=False) + model_id = "hf-internal-testing/tiny-random-gpt2" + with hub_online_once(model_id): + model = AutoModelForCausalLM.from_pretrained(model_id) + peft_model = get_peft_model(model, cfg) + peft_model.save_pretrained(out_dir) + + +@pytest.fixture(scope="module") +def ts_adapters(workdir: Path): + """ + Build 3 task-specific adapters and return their absolute paths + """ + abs_paths = [] + for i in range(3): + sub = f"{workdir}/ts{i}" + _create_and_save_adapter(sub) + abs_paths.append(sub) + return abs_paths + + +@pytest.fixture(scope="module") +def gen_adapter(workdir: Path): + """Build 1 general-knowledge adapter and return its absolute path list.""" + sub = f"{workdir}/gen0" + _create_and_save_adapter(sub) + return [sub] # list because create_arrow_model expects list + + +class TestArrowRouting: + def test_incompatible_rank_raises(self, workdir: Path): + """ + Adding adapters with different ranks must raise a ValueError. 
+ """ + # Create two adapters with different ranks targeting the same modules + sub_r4 = workdir / "rank4" + sub_r8 = workdir / "rank8" + _create_and_save_adapter(sub_r4, rank=4) + _create_and_save_adapter(sub_r8, rank=8) + + model_id = "hf-internal-testing/tiny-random-gpt2" + with hub_online_once(model_id): + base = AutoModelForCausalLM.from_pretrained(model_id) + + # Expect create_arrow_model to raise due to rank mismatch + with pytest.raises(ValueError, match=r"rank mismatch"): + _ = create_arrow_model( + base_model=base, + task_specific_adapter_paths=[str(sub_r4), str(sub_r8)], + arrow_config=ArrowConfig(top_k=1), + ) + + def test_arrow_differs_with_extra_expert(self, ts_adapters): + """ + Arrow with 2 experts vs Arrow with 3 experts must produce different logits. + """ + # Arrow over first 2 experts + model_id = "hf-internal-testing/tiny-random-gpt2" + with hub_online_once(model_id): + base_model_1 = AutoModelForCausalLM.from_pretrained(model_id) + base_model_2 = copy.deepcopy(base_model_1) + cfg_small = ArrowConfig(top_k=2) + m_small = create_arrow_model( + base_model=base_model_1, + task_specific_adapter_paths=ts_adapters[:2], + arrow_config=cfg_small, + ).eval() + + # Arrow over all 3 experts + cfg_big = ArrowConfig(top_k=2) + m_big = create_arrow_model( + base_model=base_model_2, + task_specific_adapter_paths=ts_adapters, + arrow_config=cfg_big, + ).eval() + + x = torch.ones(1, 4, dtype=torch.long) + assert not torch.allclose(m_small(x).logits, m_big(x).logits) + + def test_arrow_gks_with_load_adapter_later_with_forward(self, ts_adapters, gen_adapter): + """ + Loading the last expert after creating the arrow model should produce the same result as loading all the + experts at once in create_arrow_model(), when forward path is called before adding the new adapter. 
+ """ + # Arrow over all three experts + model_id = "hf-internal-testing/tiny-random-gpt2" + with hub_online_once(model_id): + base_model_1 = AutoModelForCausalLM.from_pretrained(model_id) + base_model_2 = copy.deepcopy(base_model_1) + cfg_big = ArrowConfig(top_k=2, use_gks=True, rng_seed=42) + m_big = create_arrow_model( + base_model=base_model_1, + task_specific_adapter_paths=ts_adapters, + general_adapter_paths=gen_adapter, + arrow_config=cfg_big, + ).eval() + + # Arrow over the first 2 experts + loading the third expert later + cfg_small_later_big = ArrowConfig(top_k=2, use_gks=True, rng_seed=42) + m_small_later_big = create_arrow_model( + base_model=base_model_2, + task_specific_adapter_paths=ts_adapters[:2], + general_adapter_paths=gen_adapter, + arrow_config=cfg_small_later_big, + ) + + # Ensure that the prototypes and gks are computed once by running a forward pass + x = torch.ones(1, 4, dtype=torch.long) + m_small_later_big(x) + + # Now loading the third expert + m_small_later_big.load_adapter( + model_id=ts_adapters[-1], + adapter_name="new_added_ts_expert", + ) + # Activate the new adapter and run a forward pass on it + m_small_later_big.set_adapter("new_added_ts_expert") + x = torch.ones(3, 5, dtype=torch.long) + m_small_later_big(x) + + # Now we switch back to the arrow_router + m_small_later_big.set_adapter("arrow_router") + m_small_later_big.eval() + + x = torch.ones(1, 4, dtype=torch.long) + assert torch.allclose(m_big(x).logits, m_small_later_big(x).logits) + + def test_arrow_with_load_adapter_later_with_forward_activate_new(self, ts_adapters, gen_adapter): + """ + Loading the last expert after creating the arrow model and activating it should produce a different result compared + to the case where arrow_router is active and the model is using Arrow. 
+ """ + # Arrow over all three experts + model_id = "hf-internal-testing/tiny-random-gpt2" + with hub_online_once(model_id): + base_model_1 = AutoModelForCausalLM.from_pretrained(model_id) + base_model_2 = copy.deepcopy(base_model_1) + cfg_big = ArrowConfig(top_k=2, use_gks=True, rng_seed=42) + m_big = create_arrow_model( + base_model=base_model_1, + task_specific_adapter_paths=ts_adapters, + general_adapter_paths=gen_adapter, + arrow_config=cfg_big, + ).eval() + + # Arrow over all 2 experts + loading the third expert later + cfg_small_later_big = ArrowConfig(top_k=2, use_gks=True, rng_seed=42) + m_small_later_big = create_arrow_model( + base_model=base_model_2, + task_specific_adapter_paths=ts_adapters[:2], + general_adapter_paths=gen_adapter, + arrow_config=cfg_small_later_big, + ) + + # Ensuring that the prototypes and gks are done one time by running a forward path + x = torch.ones(1, 4, dtype=torch.long) + m_small_later_big(x) + + # Now loading the third expert + m_small_later_big.load_adapter( + model_id=ts_adapters[-1], + adapter_name="new_added_ts_expert", + ) + # The new adapter is activated + m_small_later_big.set_adapter("new_added_ts_expert") + m_small_later_big.eval() + + x = torch.ones(1, 4, dtype=torch.long) + assert not torch.allclose(m_big(x).logits, m_small_later_big(x).logits) + + def test_arrow_gks_with_load_adapter_later_without_forward(self, ts_adapters, gen_adapter): + """ + Loading the last expert after creating the arrow model should produce the same result as loading all the + experts at once in create_arrow_model() + """ + # Arrow over all three experts + model_id = "hf-internal-testing/tiny-random-gpt2" + with hub_online_once(model_id): + base_model_1 = AutoModelForCausalLM.from_pretrained(model_id) + base_model_2 = copy.deepcopy(base_model_1) + cfg_big = ArrowConfig(top_k=2, use_gks=True, rng_seed=42) + m_big = create_arrow_model( + base_model=base_model_1, + task_specific_adapter_paths=ts_adapters, + general_adapter_paths=gen_adapter, 
+ arrow_config=cfg_big, + ).eval() + + # Arrow over all 2 experts + loading the third expert later + cfg_small_later_big = ArrowConfig(top_k=2, use_gks=True, rng_seed=42) + m_small_later_big = create_arrow_model( + base_model=base_model_2, + task_specific_adapter_paths=ts_adapters[:2], + general_adapter_paths=gen_adapter, + arrow_config=cfg_small_later_big, + ) + + # Now loading the third expert + m_small_later_big.load_adapter( + model_id=ts_adapters[-1], + adapter_name="new_added_ts_expert", + ) + m_small_later_big.eval() + + x = torch.ones(1, 4, dtype=torch.long) + assert torch.allclose(m_big(x).logits, m_small_later_big(x).logits) + + def test_genknowsub_changes_output(self, ts_adapters, gen_adapter): + """ + Arrow+GenKnowSub vs plain Arrow must change logits. + """ + # Plain Arrow + model_id = "hf-internal-testing/tiny-random-gpt2" + with hub_online_once(model_id): + base_model_1 = AutoModelForCausalLM.from_pretrained(model_id) + base_model_2 = copy.deepcopy(base_model_1) + cfg_plain = ArrowConfig(top_k=2) + m_plain = create_arrow_model( + base_model=base_model_1, + task_specific_adapter_paths=ts_adapters, + arrow_config=cfg_plain, + ).eval() + + # Arrow + GenKnowSub + cfg_gks = ArrowConfig(top_k=2, use_gks=True) + m_gks = create_arrow_model( + base_model=base_model_2, + task_specific_adapter_paths=ts_adapters, + general_adapter_paths=gen_adapter, + arrow_config=cfg_gks, + ).eval() + + x = torch.ones(1, 4, dtype=torch.long) + assert not torch.allclose(m_plain(x).logits, m_gks(x).logits) + + def test_merging_adapters_raise_error_in_arrow(self, ts_adapters): + """ + Merging/unmerging is not allowed while an ArrowLinearLayer is loaded on the model and active. 
+ """ + # Arrow over first 2 experts + model_id = "hf-internal-testing/tiny-random-gpt2" + with hub_online_once(model_id): + base_model = AutoModelForCausalLM.from_pretrained(model_id) + cfg_small = ArrowConfig(top_k=2) + m_small = create_arrow_model( + base_model=base_model, + task_specific_adapter_paths=ts_adapters[:2], + arrow_config=cfg_small, + ).eval() + + with pytest.raises(RuntimeError, match=r"Cannot merge an active Arrow router adapter"): + m_small.merge_and_unload() + + def test_conv2d_targets_raise_typeerror_in_arrow(self, workdir): + """ + Adapters applied to Conv2d must be rejected by create_arrow_model() which enforces Linear/Linear4bit-only + targets. + """ + + model_id = "hf-internal-testing/tiny-random-ResNetForImageClassification" + with hub_online_once(model_id): + base = AutoModelForImageClassification.from_pretrained(model_id) + + # Build a LoRA adapter targeting a Conv2d + cfg = LoraConfig(r=4, target_modules=["convolution"], init_lora_weights=False) + peft_model = get_peft_model(copy.deepcopy(base), cfg) + + conv_dir = workdir / "cv0" + peft_model.save_pretrained(conv_dir) + + # Expect create_arrow_model to raise TypeError + with pytest.raises(TypeError, match=r"LoRA adapters must only target Linear"): + _ = create_arrow_model( + base_model=base, + task_specific_adapter_paths=[str(conv_dir)], + arrow_config=ArrowConfig(top_k=1), + ) + + def test_arrow_forward_float16_no_autocast_with_merging(self, ts_adapters): + """ + Run Arrow in float16 with autocast disabled; forward should work, while merge/unmerge operations must raise for + Arrow models. 
+ """ + import platform + + try: + _ = torch.zeros(1, dtype=torch.float16) + except Exception: + pytest.skip(reason="Test requires float16 support") + + if platform.system() == "Darwin": + pytest.skip(reason="MacOS does not support multiple ops in float16") + + model_id = "hf-internal-testing/tiny-random-gpt2" + + # Create base in fp16 (no manual assignment to .dtype) + with hub_online_once(model_id): + base = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16) + + cfg = ArrowConfig(top_k=2) + + # Build Arrow model and disable adapter dtype autocast + model = create_arrow_model( + base_model=base, + task_specific_adapter_paths=ts_adapters, + arrow_config=cfg, + autocast_adapter_dtype=False, + torch_dtype=torch.float16, + ).eval() + + X = { + "input_ids": torch.ones(1, 4, dtype=torch.long), + "attention_mask": torch.ones(1, 4, dtype=torch.long), + } + + # Forward should work in fp16 + _ = model(**X) + + # Merge must fail on Arrow models + with pytest.raises(RuntimeError, match=r"Cannot merge an active Arrow router adapter"): + model.merge_adapter(safe_merge=False) + + with pytest.raises(RuntimeError, match=r"Cannot merge an active Arrow router adapter"): + _ = model.merge_and_unload() + + def test_prototypes_not_recomputed_on_repeated_forward(self, ts_adapters): + """ + Repeated calls to forward should not recompute prototypes. We verify by spying on + ArrowLoraLinearLayer.top_right_singular_vec_from_BA(), which is only called when prototypes are (re)built. 
+ """ + model_id = "hf-internal-testing/tiny-random-gpt2" + with hub_online_once(model_id): + base = AutoModelForCausalLM.from_pretrained(model_id) + + cfg = ArrowConfig(top_k=2) + model = create_arrow_model( + base_model=base, + task_specific_adapter_paths=ts_adapters, + arrow_config=cfg, + ).eval() + + # Find one Arrow layer instance on the model + arrow_layer = None + for _, module in model.named_modules(): + if hasattr(module, "lora_arrow") and "arrow_router" in module.lora_arrow: + arrow_layer = module.lora_arrow["arrow_router"] + break + assert arrow_layer is not None, "Arrow router layer not found on model" + + x = torch.ones(1, 4, dtype=torch.long) + + # Spy on the internal proto computation; should run once (E calls for E experts) + with patch.object( + arrow_layer, + "top_right_singular_vec_from_BA", + wraps=arrow_layer.top_right_singular_vec_from_BA, + ) as spy: + _ = model(x) + first_calls = spy.call_count + assert first_calls == len(arrow_layer.task_adapter_names) + + # Call forward again; prototypes should be cached, so no extra calls + _ = model(x) + assert spy.call_count == first_calls + + +def test_training_updates_when_task_adapter_active(ts_adapters): + """ + Ensure a simple training step works: compute a dummy loss, backward, and take an optimizer step. Verify that + task-adapter parameters update. 
+ """ + model_id = "hf-internal-testing/tiny-random-gpt2" + with hub_online_once(model_id): + base = AutoModelForCausalLM.from_pretrained(model_id) + + # Build Arrow model over two experts + cfg = ArrowConfig(top_k=2) + model = create_arrow_model( + base_model=base, + task_specific_adapter_paths=ts_adapters[:2], + arrow_config=cfg, + ) + model.train() + + # Switch to a specific task adapter for training (vanilla LoRA) + model.set_adapter("task_0") + + # Choose a representative parameter to check updates (task_0 A weight) + rep_name = None + for n, _ in model.named_parameters(): + if ".lora_A.task_0.weight" in n: + rep_name = n + break + assert rep_name is not None, "task_0 LoRA A weight not found" + rep_param = dict(model.named_parameters())[rep_name] + before = rep_param.detach().clone() + + # Optimizer over trainable params (task_0 now active and trainable) + opt = torch.optim.SGD([p for p in model.parameters() if p.requires_grad], lr=1e-2) + + # Dummy batch + vocab = model.config.vocab_size + input_ids = torch.randint(0, vocab, (2, 8)) + attention_mask = torch.ones_like(input_ids) + + # Compute loss and update + opt.zero_grad() + out = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids) + assert hasattr(out, "loss") and out.loss is not None + out.loss.backward() + opt.step() + + after = rep_param.detach().clone() + assert not torch.allclose(before, after), "Active task adapter parameters did not update after optimizer step" + + +@pytest.mark.parametrize( + "case", + [ + "local_root", + "local_nested", + "hub_repo", + "hub_with_sub", + ], +) +def test_resolve_adapter_source_variants(tmp_path: Path, case: str): + """ + Ensure `_resolve_adapter_source` correctly handles: + - Local dir (containing adapter_config.json) + - Local nested subfolder + - Hub repo id "user/repo" + - Hub repo with subfolder "user/repo/sub/folder" + """ + if case == "local_root": + d = tmp_path / "adapter_local_root" + d.mkdir(parents=True, exist_ok=True) + (d / 
"adapter_config.json").write_text("{}") + model_id, sub = _resolve_adapter_source(str(d)) + assert model_id == str(d) + assert sub is None + + elif case == "local_nested": + d = tmp_path / "repo_like" / "sub" / "folder" + d.mkdir(parents=True, exist_ok=True) + (d / "adapter_config.json").write_text("{}") + model_id, sub = _resolve_adapter_source(str(d)) + assert model_id == str(d) + assert sub is None + + elif case == "hub_repo": + model_id, sub = _resolve_adapter_source("user/repo") + assert model_id == "user/repo" + assert sub is None + + elif case == "hub_with_sub": + model_id, sub = _resolve_adapter_source("user/repo/sub/folder") + assert model_id == "user/repo" + assert sub == "sub/folder" + + else: + raise AssertionError(f"unknown case: {case}") diff --git a/peft/tests/test_auto.py b/peft/tests/test_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..105d13f455a9f80a74b43127f1a7b009ca5259aa --- /dev/null +++ b/peft/tests/test_auto.py @@ -0,0 +1,231 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import tempfile
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from peft import (
+    AutoPeftModel,
+    AutoPeftModelForCausalLM,
+    AutoPeftModelForFeatureExtraction,
+    AutoPeftModelForQuestionAnswering,
+    AutoPeftModelForSeq2SeqLM,
+    AutoPeftModelForSequenceClassification,
+    AutoPeftModelForTokenClassification,
+    LoraConfig,
+    PeftModel,
+    PeftModelForCausalLM,
+    PeftModelForFeatureExtraction,
+    PeftModelForQuestionAnswering,
+    PeftModelForSeq2SeqLM,
+    PeftModelForSequenceClassification,
+    PeftModelForTokenClassification,
+    get_peft_model,
+)
+from peft.utils import infer_device
+
+
+class TestPeftAutoModel:
+    """Check that each ``AutoPeftModel*`` loader returns the expected ``PeftModel*`` class and accepts extra arguments."""
+
+    # dtype passed as ``torch_dtype`` in the kwargs checks: float16 when infer_device() returns "mps", else bfloat16
+    dtype = torch.float16 if infer_device() == "mps" else torch.bfloat16
+
+    def test_peft_causal_lm(self):
+        """Load from the hub, reload from a local save, then pass ``torch_dtype`` and positional adapter arguments."""
+        model_id = "peft-internal-testing/tiny-OPTForCausalLM-lora"
+        model = AutoPeftModelForCausalLM.from_pretrained(model_id)
+        assert isinstance(model, PeftModelForCausalLM)
+
+        # round trip: the reload happens while the temporary directory still exists
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            model.save_pretrained(tmp_dirname)
+
+            model = AutoPeftModelForCausalLM.from_pretrained(tmp_dirname)
+            assert isinstance(model, PeftModelForCausalLM)
+
+        # check if kwargs are passed correctly
+        model = AutoPeftModelForCausalLM.from_pretrained(model_id, torch_dtype=self.dtype)
+        assert isinstance(model, PeftModelForCausalLM)
+        assert model.base_model.lm_head.weight.dtype == self.dtype
+
+        adapter_name = "default"
+        is_trainable = False
+        # This should work
+        _ = AutoPeftModelForCausalLM.from_pretrained(model_id, adapter_name, is_trainable, torch_dtype=self.dtype)
+
+    def test_peft_causal_lm_extended_vocab(self):
+        """Same checks as the causal LM test for the ``extended-vocab`` checkpoint, without the save/reload step."""
+        model_id = "peft-internal-testing/tiny-random-OPTForCausalLM-extended-vocab"
+        model = AutoPeftModelForCausalLM.from_pretrained(model_id)
+        assert isinstance(model, PeftModelForCausalLM)
+
+        # check if kwargs are passed correctly
+        model = AutoPeftModelForCausalLM.from_pretrained(model_id, torch_dtype=self.dtype)
+        assert isinstance(model, PeftModelForCausalLM)
+        assert model.base_model.lm_head.weight.dtype == self.dtype
+
+        adapter_name = "default"
+        is_trainable = False
+        # This should work
+        _ = AutoPeftModelForCausalLM.from_pretrained(model_id, adapter_name, is_trainable, torch_dtype=self.dtype)
+
+    def test_peft_seq2seq_lm(self):
+        """Hub load, local save/reload, and kwargs forwarding for ``AutoPeftModelForSeq2SeqLM``."""
+        model_id = "peft-internal-testing/tiny_T5ForSeq2SeqLM-lora"
+        model = AutoPeftModelForSeq2SeqLM.from_pretrained(model_id)
+        assert isinstance(model, PeftModelForSeq2SeqLM)
+
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            model.save_pretrained(tmp_dirname)
+
+            model = AutoPeftModelForSeq2SeqLM.from_pretrained(tmp_dirname)
+            assert isinstance(model, PeftModelForSeq2SeqLM)
+
+        # check if kwargs are passed correctly
+        model = AutoPeftModelForSeq2SeqLM.from_pretrained(model_id, torch_dtype=self.dtype)
+        assert isinstance(model, PeftModelForSeq2SeqLM)
+        assert model.base_model.lm_head.weight.dtype == self.dtype
+
+        adapter_name = "default"
+        is_trainable = False
+        # This should work
+        _ = AutoPeftModelForSeq2SeqLM.from_pretrained(model_id, adapter_name, is_trainable, torch_dtype=self.dtype)
+
+    def test_peft_sequence_cls(self):
+        """Hub load, local save/reload, and kwargs forwarding for ``AutoPeftModelForSequenceClassification``."""
+        model_id = "peft-internal-testing/tiny_OPTForSequenceClassification-lora"
+        model = AutoPeftModelForSequenceClassification.from_pretrained(model_id)
+        assert isinstance(model, PeftModelForSequenceClassification)
+
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            model.save_pretrained(tmp_dirname)
+
+            model = AutoPeftModelForSequenceClassification.from_pretrained(tmp_dirname)
+            assert isinstance(model, PeftModelForSequenceClassification)
+
+        # check if kwargs are passed correctly
+        model = AutoPeftModelForSequenceClassification.from_pretrained(model_id, torch_dtype=self.dtype)
+        assert isinstance(model, PeftModelForSequenceClassification)
+        # the dtype is read from the wrapped head's ``original_module``
+        assert model.score.original_module.weight.dtype == self.dtype
+
+        adapter_name = "default"
+        is_trainable = False
+        # This should work
+        _ = AutoPeftModelForSequenceClassification.from_pretrained(
+            model_id, adapter_name, is_trainable, torch_dtype=self.dtype
+        )
+
+    def test_peft_token_classification(self):
+        """Hub load, local save/reload, and kwargs forwarding for ``AutoPeftModelForTokenClassification``."""
+        model_id = "peft-internal-testing/tiny_GPT2ForTokenClassification-lora"
+        model = AutoPeftModelForTokenClassification.from_pretrained(model_id)
+        assert isinstance(model, PeftModelForTokenClassification)
+
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            model.save_pretrained(tmp_dirname)
+
+            model = AutoPeftModelForTokenClassification.from_pretrained(tmp_dirname)
+            assert isinstance(model, PeftModelForTokenClassification)
+
+        # check if kwargs are passed correctly
+        model = AutoPeftModelForTokenClassification.from_pretrained(model_id, torch_dtype=self.dtype)
+        assert isinstance(model, PeftModelForTokenClassification)
+        assert model.base_model.classifier.original_module.weight.dtype == self.dtype
+
+        adapter_name = "default"
+        is_trainable = False
+        # This should work
+        _ = AutoPeftModelForTokenClassification.from_pretrained(
+            model_id, adapter_name, is_trainable, torch_dtype=self.dtype
+        )
+
+    def test_peft_question_answering(self):
+        """Hub load, local save/reload, and kwargs forwarding for ``AutoPeftModelForQuestionAnswering``."""
+        model_id = "peft-internal-testing/tiny_OPTForQuestionAnswering-lora"
+        model = AutoPeftModelForQuestionAnswering.from_pretrained(model_id)
+        assert isinstance(model, PeftModelForQuestionAnswering)
+
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            model.save_pretrained(tmp_dirname)
+
+            model = AutoPeftModelForQuestionAnswering.from_pretrained(tmp_dirname)
+            assert isinstance(model, PeftModelForQuestionAnswering)
+
+        # check if kwargs are passed correctly
+        model = AutoPeftModelForQuestionAnswering.from_pretrained(model_id, torch_dtype=self.dtype)
+        assert isinstance(model, PeftModelForQuestionAnswering)
+        assert model.base_model.qa_outputs.original_module.weight.dtype == self.dtype
+
+        adapter_name = "default"
+        is_trainable = False
+        # This should work
+        _ = AutoPeftModelForQuestionAnswering.from_pretrained(
+            model_id, adapter_name, is_trainable, torch_dtype=self.dtype
+        )
+
+    def test_peft_feature_extraction(self):
+        """Hub load, local save/reload, and kwargs forwarding for ``AutoPeftModelForFeatureExtraction``."""
+        model_id = "peft-internal-testing/tiny_OPTForFeatureExtraction-lora"
+        model = AutoPeftModelForFeatureExtraction.from_pretrained(model_id)
+        assert isinstance(model, PeftModelForFeatureExtraction)
+
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            model.save_pretrained(tmp_dirname)
+
+            model = AutoPeftModelForFeatureExtraction.from_pretrained(tmp_dirname)
+            assert isinstance(model, PeftModelForFeatureExtraction)
+
+        # check if kwargs are passed correctly
+        model = AutoPeftModelForFeatureExtraction.from_pretrained(model_id, torch_dtype=self.dtype)
+        assert isinstance(model, PeftModelForFeatureExtraction)
+        assert model.base_model.model.decoder.embed_tokens.weight.dtype == self.dtype
+
+        adapter_name = "default"
+        is_trainable = False
+        # This should work
+        _ = AutoPeftModelForFeatureExtraction.from_pretrained(
+            model_id, adapter_name, is_trainable, torch_dtype=self.dtype
+        )
+
+    def test_peft_whisper(self):
+        """Hub load, local save/reload, and kwargs forwarding for the task-agnostic ``AutoPeftModel``."""
+        model_id = "peft-internal-testing/tiny_WhisperForConditionalGeneration-lora"
+        model = AutoPeftModel.from_pretrained(model_id)
+        assert isinstance(model, PeftModel)
+
+        with tempfile.TemporaryDirectory() as tmp_dirname:
+            model.save_pretrained(tmp_dirname)
+
+            model = AutoPeftModel.from_pretrained(tmp_dirname)
+            assert isinstance(model, PeftModel)
+
+        # check if kwargs are passed correctly
+        model = AutoPeftModel.from_pretrained(model_id, torch_dtype=self.dtype)
+        assert isinstance(model, PeftModel)
+        assert model.base_model.model.model.encoder.embed_positions.weight.dtype == self.dtype
+
+        adapter_name = "default"
+        is_trainable = False
+        # This should work
+        _ = AutoPeftModel.from_pretrained(model_id, adapter_name, is_trainable, torch_dtype=self.dtype)
+
+    def test_embedding_size_not_reduced_if_greater_vocab_size(self, tmp_path):
+        """Regression test: loading a checkpoint saved next to its tokenizer must not raise (details below)."""
+        # See 2415
+        # There was a bug in AutoPeftModels where the embedding was always resized to the vocab size of the tokenizer
+        # when the tokenizer was found. This makes sense if the vocabulary was extended, but some models like Qwen
+        # already start out with "spare" embeddings, i.e. the embedding size is larger than the vocab size. This could
+        # result in the embedding being shrunk, which in turn resulted in an error when loading the weights.
+
+        # first create a checkpoint; it is important that the tokenizer is also saved in the same location
+        model_id = "Qwen/Qwen2-0.5B"
+        model = AutoModelForCausalLM.from_pretrained(model_id)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        # NOTE(review): "embed_token" is singular here; presumably "embed_tokens" was meant -- confirm which module
+        # names this entry is expected to match before changing it
+        model = get_peft_model(model, LoraConfig(modules_to_save=["lm_head", "embed_token"]))
+        model.save_pretrained(tmp_path)
+        tokenizer.save_pretrained(tmp_path)
+
+        # does not raise; without the fix, it raises:
+        # > size mismatch for base_model.model.lm_head.modules_to_save.default.weight: copying a param with shape
+        # torch.Size([151936, 896]) from checkpoint, the shape in current model is torch.Size([151646, 896]).
+        AutoPeftModelForCausalLM.from_pretrained(tmp_path)
diff --git a/peft/tests/test_boft.py b/peft/tests/test_boft.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0cf74e3edea2a5836068afe2b74df3b8ae6a20f
--- /dev/null
+++ b/peft/tests/test_boft.py
@@ -0,0 +1,84 @@
+# Copyright 2024-present the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from safetensors.torch import load_file
+from transformers import AutoModelForCausalLM
+
+from peft import BOFTConfig, PeftModel, get_peft_model
+from peft.utils import infer_device
+
+
+class TestBoft:
+    """Checkpoint round-trip tests for BOFT adapters with respect to the ``boft_P`` buffer."""
+
+    device = infer_device()
+
+    def test_boft_state_dict(self, tmp_path):
+        # see #2050
+        # ensure that the boft_P buffer is not stored in the checkpoint file and is not necessary to load the model
+        # correctly
+        torch.manual_seed(0)
+
+        inputs = torch.arange(10).view(-1, 1).to(self.device)
+        model_id = "hf-internal-testing/tiny-random-OPTForCausalLM"
+        model = AutoModelForCausalLM.from_pretrained(model_id).to(self.device)
+        model.eval()
+        output_base = model(inputs).logits
+
+        config = BOFTConfig(init_weights=False)
+        model = get_peft_model(model, config)
+        model.eval()
+        output_peft = model(inputs).logits
+
+        atol, rtol = 1e-5, 1e-8
+        # sanity check: loading boft changed the output
+        assert not torch.allclose(output_base, output_peft, atol=atol, rtol=rtol)
+
+        model.save_pretrained(tmp_path)
+        del model
+
+        # check that the boft_P buffer is not present
+        state_dict = load_file(tmp_path / "adapter_model.safetensors")
+        assert not any("boft_P" in key for key in state_dict)
+
+        # sanity check: the model still produces the same output after loading
+        model = AutoModelForCausalLM.from_pretrained(model_id).to(self.device)
+        model = PeftModel.from_pretrained(model, tmp_path)
+        output_loaded = model(inputs).logits
+        assert torch.allclose(output_peft, output_loaded, atol=atol, rtol=rtol)
+
+    def test_boft_old_checkpoint_including_boft_P(self, tmp_path):
+        # see #2050
+        # This test exists to ensure that after the boft_P buffer was made non-persistent, old checkpoints can still be
+        # loaded successfully.
+        torch.manual_seed(0)
+
+        inputs = torch.arange(10).view(-1, 1).to(self.device)
+        model_id = "hf-internal-testing/tiny-random-OPTForCausalLM"
+        model = AutoModelForCausalLM.from_pretrained(model_id).to(self.device)
+
+        # first create the expected output
+        config = BOFTConfig(init_weights=False)
+        model = get_peft_model(model, config)
+        model.eval()
+        output_peft = model(inputs).logits
+        del model
+
+        model = AutoModelForCausalLM.from_pretrained(model_id).to(self.device)
+        # checkpoint from before the PR whose state_dict still contains boft_P
+        hub_id = "peft-internal-testing/boft-tiny-opt-peft-v0.12"
+        model = PeftModel.from_pretrained(model, hub_id)
+        output_old = model(inputs).logits
+
+        atol, rtol = 1e-5, 1e-8
+        assert torch.allclose(output_peft, output_old, atol=atol, rtol=rtol)
diff --git a/peft/tests/test_bufferdict.py b/peft/tests/test_bufferdict.py
new file mode 100644
index 0000000000000000000000000000000000000000..eda25e652b4e3f799113c56c36a66330d5c415cd
--- /dev/null
+++ b/peft/tests/test_bufferdict.py
@@ -0,0 +1,48 @@
+import torch
+
+from peft.tuners._buffer_dict import BufferDict
+
+
+class TestBufferDict:
+    """Construction and ``update`` behaviour of ``BufferDict``."""
+
+    def test_init_from_dict_works(self):
+        """Building a ``BufferDict`` from a plain dict must not raise and must expose the given key."""
+        bd = BufferDict(
+            {
+                "default": torch.randn(10, 2),
+            }
+        )
+
+        # previously ``bd`` was never used, so the test asserted nothing about the constructed object
+        assert set(bd.keys()) == {"default"}
+
+    def test_update_from_other_bufferdict(self):
+        """``update`` with another ``BufferDict`` adds its entries and keeps the existing ones."""
+        default_tensor = torch.randn(10, 2)
+        non_default_tensor = torch.randn(10, 2)
+        bd1 = BufferDict({"default": default_tensor})
+        bd2 = BufferDict({"non_default": non_default_tensor})
+
+        bd1.update(bd2)
+
+        assert set(bd1.keys()) == {"default", "non_default"}
+        assert torch.allclose(bd1["default"], default_tensor)
+        assert torch.allclose(bd1["non_default"], non_default_tensor)
+
+    def test_update_from_dict(self):
+        """``update`` with a plain dict adds its entries and keeps the existing ones."""
+        default_tensor = torch.randn(10, 2)
+        non_default_tensor = torch.randn(10, 2)
+        bd1 = BufferDict({"default": default_tensor})
+        d1 = {"non_default": non_default_tensor}
+
+        bd1.update(d1)
+
+        assert set(bd1.keys()) == {"default", "non_default"}
+        assert torch.allclose(bd1["default"], default_tensor)
+        assert torch.allclose(bd1["non_default"], non_default_tensor)
+
+    def test_update_from_dict_items(self):
+        """``update`` with an iterable of ``(key, tensor)`` pairs adds them and keeps the existing ones."""
+        default_tensor = torch.randn(10, 2)
+        non_default_tensor = torch.randn(10, 2)
+        bd1 = BufferDict({"default": default_tensor})
+        d1 = {"non_default": non_default_tensor}
+
+        bd1.update(d1.items())
+
+        assert set(bd1.keys()) == {"default", "non_default"}
+        assert torch.allclose(bd1["default"], default_tensor)
+        assert torch.allclose(bd1["non_default"], non_default_tensor)
diff --git a/peft/tests/test_common_gpu.py b/peft/tests/test_common_gpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..958126ad1b644fca4ab5f7351935333ab27c0a3c
--- /dev/null
+++ b/peft/tests/test_common_gpu.py
@@ -0,0 +1,2185 @@
+# Copyright 2023-present the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import gc +import tempfile +import unittest + +import pytest +import torch +import torch.nn.functional as F +from accelerate.utils.memory import clear_device_cache +from parameterized import parameterized +from torch import nn +from transformers import ( + AutoImageProcessor, + AutoModelForCausalLM, + AutoModelForImageClassification, + AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification, + AutoModelForTokenClassification, + AutoTokenizer, + BitsAndBytesConfig, + LlamaForCausalLM, + WhisperForConditionalGeneration, +) +from transformers.pytorch_utils import Conv1D + +from peft import ( + AdaLoraConfig, + AdaptionPromptConfig, + BOFTConfig, + HRAConfig, + IA3Config, + LNTuningConfig, + LoHaConfig, + LoKrConfig, + LoraConfig, + OFTConfig, + PeftModel, + RandLoraConfig, + RoadConfig, + TaskType, + VBLoRAConfig, + VeraConfig, + get_peft_model, + prepare_model_for_kbit_training, +) +from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_xpu_available +from peft.tuners.lora.config import LoraRuntimeConfig +from peft.utils import infer_device + +from .testing_utils import ( + device_count, + load_cat_image, + require_bitsandbytes, + require_deterministic_for_xpu, + require_non_cpu, + require_torch_multi_accelerator, +) + + +if is_bnb_available(): + import bitsandbytes as bnb + + from peft.tuners.ia3 import Linear8bitLt as IA3Linear8bitLt + from peft.tuners.lora import Linear8bitLt as LoraLinear8bitLt + from peft.tuners.randlora import Linear8bitLt as RandLoraLinear8bitLt + from peft.tuners.road import Linear8bitLt as RoadLinear8bitLt + from peft.tuners.vera import Linear8bitLt as VeraLinear8bitLt + + if is_bnb_4bit_available(): + from peft.tuners.ia3 import Linear4bit as IA3Linear4bit + from peft.tuners.lora import Linear4bit as LoraLinear4bit + from peft.tuners.randlora import Linear4bit as RandLoraLinear4bit + from peft.tuners.road import Linear4bit as RoadLinear4bit + from peft.tuners.vera import Linear4bit as VeraLinear4bit + + 
+@require_non_cpu
+class PeftGPUCommonTests(unittest.TestCase):
+    r"""
+    A common tester to run common operations that are performed on GPU such as generation, loading in 8bit, etc.
+    """
+
+    def setUp(self):
+        r"""
+        Store the hub ids of the seq2seq, causal LM and audio base models plus the inferred device on the test case.
+        """
+        self.seq2seq_model_id = "google/flan-t5-base"
+        self.causal_lm_model_id = "facebook/opt-350m"
+        self.audio_model_id = "openai/whisper-large"
+        self.device = infer_device()
+
+    def tearDown(self):
+        r"""
+        Efficient mechanism to free GPU memory after each test. Based on
+        https://github.com/huggingface/transformers/issues/21094
+        """
+        clear_device_cache(garbage_collection=True)
+        gc.collect()
+
+    @require_bitsandbytes
+    @pytest.mark.multi_gpu_tests
+    @pytest.mark.single_gpu_tests
+    def test_lora_bnb_8bit_quantization(self):
+        r"""
+        Test that tests if the 8bit quantization using LoRA works as expected
+        """
+        whisper_8bit = WhisperForConditionalGeneration.from_pretrained(
+            self.audio_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        opt_8bit = AutoModelForCausalLM.from_pretrained(
+            self.causal_lm_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        flan_8bit = AutoModelForSeq2SeqLM.from_pretrained(
+            self.seq2seq_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        flan_lora_config = LoraConfig(
+            r=16, lora_alpha=32, target_modules=["q", "v"], lora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM"
+        )
+
+        opt_lora_config = LoraConfig(
+            r=16,
+            lora_alpha=32,
+            target_modules=["q_proj", "v_proj"],
+            lora_dropout=0.05,
+            bias="none",
+            task_type="CAUSAL_LM",
+        )
+
+        # config without task_type, applied to the whisper model below
+        config = LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")
+
+        # after wrapping, one targeted projection per model must be the 8bit LoRA layer class
+        flan_8bit = get_peft_model(flan_8bit, flan_lora_config)
+        assert isinstance(flan_8bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, LoraLinear8bitLt)
+
+        opt_8bit = get_peft_model(opt_8bit, opt_lora_config)
+        assert isinstance(opt_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear8bitLt)
+
+        whisper_8bit = get_peft_model(whisper_8bit, config)
+        assert isinstance(whisper_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear8bitLt)
+
+    @require_bitsandbytes
+    @pytest.mark.multi_gpu_tests
+    @pytest.mark.single_gpu_tests
+    def test_vera_bnb_8bit_quantization(self):
+        r"""
+        Test that tests if the 8bit quantization using VeRA works as expected
+        """
+        whisper_8bit = WhisperForConditionalGeneration.from_pretrained(
+            self.audio_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        opt_8bit = AutoModelForCausalLM.from_pretrained(
+            self.causal_lm_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        flan_8bit = AutoModelForSeq2SeqLM.from_pretrained(
+            self.seq2seq_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        flan_vera_config = VeraConfig(
+            r=16, target_modules=["q", "v"], vera_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM"
+        )
+
+        opt_vera_config = VeraConfig(
+            r=16,
+            target_modules=["q_proj", "v_proj"],
+            vera_dropout=0.05,
+            bias="none",
+            task_type="CAUSAL_LM",
+        )
+
+        # config without task_type, applied to the whisper model below
+        config = VeraConfig(r=32, target_modules=["q_proj", "v_proj"], vera_dropout=0.05, bias="none")
+
+        # after wrapping, one targeted projection per model must be the 8bit VeRA layer class
+        flan_8bit = get_peft_model(flan_8bit, flan_vera_config)
+        assert isinstance(flan_8bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, VeraLinear8bitLt)
+
+        opt_8bit = get_peft_model(opt_8bit, opt_vera_config)
+        assert isinstance(opt_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, VeraLinear8bitLt)
+
+        whisper_8bit = get_peft_model(whisper_8bit, config)
+        assert isinstance(whisper_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, VeraLinear8bitLt)
+
+    @require_bitsandbytes
+    @pytest.mark.multi_gpu_tests
+    @pytest.mark.single_gpu_tests
+    def test_randlora_bnb_8bit_quantization(self):
+        r"""
+        Test that tests if the 8bit quantization using RandLora works as expected
+        """
+        whisper_8bit = WhisperForConditionalGeneration.from_pretrained(
+            self.audio_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        opt_8bit = AutoModelForCausalLM.from_pretrained(
+            self.causal_lm_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        flan_8bit = AutoModelForSeq2SeqLM.from_pretrained(
+            self.seq2seq_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        flan_randlora_config = RandLoraConfig(
+            r=16, target_modules=["q", "v"], randlora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM"
+        )
+
+        opt_randlora_config = RandLoraConfig(
+            r=10,
+            target_modules=["q_proj", "v_proj"],
+            randlora_dropout=0.05,
+            bias="none",
+            task_type="CAUSAL_LM",
+        )
+
+        # config without task_type, applied to the whisper model below
+        config = RandLoraConfig(r=5, target_modules=["q_proj", "v_proj"], randlora_dropout=0.05, bias="none")
+
+        # after wrapping, one targeted projection per model must be the 8bit RandLora layer class
+        flan_8bit = get_peft_model(flan_8bit, flan_randlora_config)
+        assert isinstance(flan_8bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, RandLoraLinear8bitLt)
+
+        opt_8bit = get_peft_model(opt_8bit, opt_randlora_config)
+        assert isinstance(opt_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, RandLoraLinear8bitLt)
+
+        whisper_8bit = get_peft_model(whisper_8bit, config)
+        assert isinstance(whisper_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, RandLoraLinear8bitLt)
+
+    @require_bitsandbytes
+    @pytest.mark.multi_gpu_tests
+    @pytest.mark.single_gpu_tests
+    def test_ia3_bnb_8bit_quantization(self):
+        r"""
+        Test that tests if the 8bit quantization using IA3 works as expected
+        """
+        whisper_8bit = WhisperForConditionalGeneration.from_pretrained(
+            self.audio_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        opt_8bit = AutoModelForCausalLM.from_pretrained(
+            self.causal_lm_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        flan_8bit = AutoModelForSeq2SeqLM.from_pretrained(
+            self.seq2seq_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        flan_ia3_config = IA3Config(target_modules=["q", "v"], task_type="SEQ_2_SEQ_LM")
+
+        opt_ia3_config = IA3Config(
+            target_modules=["q_proj", "v_proj", "fc2"],
+            feedforward_modules=["fc2"],
+            task_type="CAUSAL_LM",
+        )
+
+        # config without task_type, applied to the whisper model below
+        config = IA3Config(target_modules=["q_proj", "v_proj", "fc2"], feedforward_modules=["fc2"])
+
+        # after wrapping, one targeted projection per model must be the 8bit IA3 layer class
+        flan_8bit = get_peft_model(flan_8bit, flan_ia3_config)
+        assert isinstance(flan_8bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, IA3Linear8bitLt)
+
+        opt_8bit = get_peft_model(opt_8bit, opt_ia3_config)
+        assert isinstance(opt_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, IA3Linear8bitLt)
+
+        whisper_8bit = get_peft_model(whisper_8bit, config)
+        assert isinstance(whisper_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, IA3Linear8bitLt)
+
+    @require_bitsandbytes
+    @pytest.mark.multi_gpu_tests
+    @pytest.mark.single_gpu_tests
+    def test_road_bnb_8bit_quantization(self):
+        r"""
+        Test that tests if the 8bit quantization using Road works as expected
+        """
+        whisper_8bit = WhisperForConditionalGeneration.from_pretrained(
+            self.audio_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        opt_8bit = AutoModelForCausalLM.from_pretrained(
+            self.causal_lm_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        flan_8bit = AutoModelForSeq2SeqLM.from_pretrained(
+            self.seq2seq_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+        )
+
+        flan_road_config = RoadConfig(target_modules=["q", "v"], task_type="SEQ_2_SEQ_LM")
+
+        opt_road_config = RoadConfig(
+            target_modules=["q_proj", "v_proj", "fc2"],
+            task_type="CAUSAL_LM",
+        )
+
+        # config without task_type, applied to the whisper model below
+        config =
RoadConfig(target_modules=["q_proj", "v_proj", "fc2"])
+
+        flan_8bit = get_peft_model(flan_8bit, flan_road_config)
+        assert isinstance(flan_8bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, RoadLinear8bitLt)
+
+        opt_8bit = get_peft_model(opt_8bit, opt_road_config)
+        assert isinstance(opt_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, RoadLinear8bitLt)
+
+        whisper_8bit = get_peft_model(whisper_8bit, config)
+        assert isinstance(whisper_8bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, RoadLinear8bitLt)
+
+    @require_bitsandbytes
+    @pytest.mark.multi_gpu_tests
+    @pytest.mark.single_gpu_tests
+    @parameterized.expand(["4bit", "8bit"])
+    def test_lora_bnb_quantization_from_pretrained_safetensors(self, quantization):
+        r"""
+        Tests that the bnb quantization using LoRA works as expected with safetensors weights.
+        """
+        model_id = "facebook/opt-350m"
+        peft_model_id = "ybelkada/test-st-lora"
+        kwargs = {"device_map": "auto"}
+        if quantization == "4bit":
+            kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)
+        else:
+            kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
+
+        model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
+        model = PeftModel.from_pretrained(model, peft_model_id)
+
+        model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+        # loading a 2nd adapter works, #1239
+        model.load_adapter(peft_model_id, "adapter2")
+        model.set_adapter("adapter2")
+        model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+        # check that both adapters are in the same layer
+        assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A
+        assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A
+
+    @require_bitsandbytes
+    @pytest.mark.multi_gpu_tests
+    @pytest.mark.single_gpu_tests
+    @parameterized.expand(["4bit", "8bit"])
+    def test_adalora_bnb_quantization_from_pretrained_safetensors(self, quantization):
+        r"""
+        Tests that the bnb quantization using AdaLora works as expected with safetensors weights.
+
+        The adapter is created on a quantized base model, saved, reloaded onto a freshly quantized base model, and a
+        second copy of the saved adapter is then loaded next to the first one.
+        """
+        model_id = "facebook/opt-350m"
+        kwargs = {"device_map": "auto"}
+        if quantization == "4bit":
+            kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)
+        else:
+            kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
+
+        model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
+        config = AdaLoraConfig(task_type=TaskType.CAUSAL_LM, total_step=1)
+        peft_model = get_peft_model(model, config)
+        peft_model = prepare_model_for_kbit_training(peft_model)
+        peft_model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            peft_model.save_pretrained(tmp_dir)
+            model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
+            model = PeftModel.from_pretrained(model, tmp_dir)
+            # prepare the *reloaded* model; passing ``peft_model`` here discarded the freshly loaded checkpoint, so
+            # the remaining checks never exercised the model restored from ``tmp_dir``
+            model = prepare_model_for_kbit_training(model)
+            model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+            # loading a 2nd adapter works, #1239
+            model.load_adapter(tmp_dir, "adapter2")
+            model.set_adapter("adapter2")
+            model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+            # check that both adapters are in the same layer
+            assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A
+            assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A
+
+    @require_bitsandbytes
+    @pytest.mark.multi_gpu_tests
+    @pytest.mark.single_gpu_tests
+    @parameterized.expand(["4bit", "8bit"])
+    def test_vera_bnb_quantization_from_pretrained_safetensors(self, quantization):
+        r"""
+        Tests that the bnb quantization using VeRA works as expected with safetensors weights.
+        """
+        model_id = "facebook/opt-350m"
+        kwargs = {"device_map": "auto"}
+        if quantization == "4bit":
+            kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)
+        else:
+            kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
+
+        model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
+        config = VeraConfig(task_type=TaskType.CAUSAL_LM)
+        peft_model = get_peft_model(model, config)
+        peft_model = prepare_model_for_kbit_training(peft_model)
+        peft_model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            peft_model.save_pretrained(tmp_dir)
+            model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
+            model = PeftModel.from_pretrained(model, tmp_dir)
+            model = prepare_model_for_kbit_training(model)
+            model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+            # loading a 2nd adapter works, #1239
+            model.load_adapter(tmp_dir, "adapter2")
+            model.set_adapter("adapter2")
+            model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+            # check that both adapters are in the same layer
+            assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.vera_A
+            assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.vera_A
+
+    @require_bitsandbytes
+    @pytest.mark.multi_gpu_tests
+    @pytest.mark.single_gpu_tests
+    @parameterized.expand(["4bit", "8bit"])
+    def test_randlora_bnb_quantization_from_pretrained_safetensors(self, quantization):
+        r"""
+        Tests that the bnb quantization using RandLora works as expected with safetensors weights.
+        """
+        model_id = "facebook/opt-350m"
+        kwargs = {"device_map": "auto"}
+        if quantization == "4bit":
+            kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)
+        else:
+            kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
+
+        model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
+        config = RandLoraConfig(task_type=TaskType.CAUSAL_LM)
+        peft_model = get_peft_model(model, config)
+        peft_model = prepare_model_for_kbit_training(peft_model)
+        peft_model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            peft_model.save_pretrained(tmp_dir)
+            model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
+            model = PeftModel.from_pretrained(model, tmp_dir)
+            model = prepare_model_for_kbit_training(model)
+            model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+            # loading a 2nd adapter works, #1239
+            model.load_adapter(tmp_dir, "adapter2")
+            model.set_adapter("adapter2")
+            model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+            # check that both adapters are in the same layer
+            assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.randlora_A
+            assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.randlora_A
+
+    @require_bitsandbytes
+    @pytest.mark.multi_gpu_tests
+    @pytest.mark.single_gpu_tests
+    @parameterized.expand(["4bit", "8bit"])
+    def test_ia3_bnb_quantization_from_pretrained_safetensors(self, quantization):
+        r"""
+        Tests that the bnb quantization using IA³ works as expected with safetensors weights.
+        """
+        model_id = "facebook/opt-350m"
+        kwargs = {"device_map": "auto"}
+        if quantization == "4bit":
+            kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)
+        else:
+            kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
+
+        model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
+        config = IA3Config(task_type=TaskType.CAUSAL_LM)
+        peft_model = get_peft_model(model, config)
+        peft_model = prepare_model_for_kbit_training(peft_model)
+        peft_model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            peft_model.save_pretrained(tmp_dir)
+            model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs)
+            model = PeftModel.from_pretrained(model, tmp_dir)
+            model = prepare_model_for_kbit_training(model)
+            model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+            # loading a 2nd adapter works, #1239
+            model.load_adapter(tmp_dir, "adapter2")
+            model.set_adapter("adapter2")
+            model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(0))
+
+            # check that both adapters are in the same layer
+            assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.ia3_l
+            assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.ia3_l
+
+    @pytest.mark.single_gpu_tests
+    def test_lora_gptq_quantization_from_pretrained_safetensors(self):
+        r"""
+        Tests that the autogptq quantization using LoRA works as expected with safetensors weights.
+        """
+        from transformers import GPTQConfig
+
+        model_id = "marcsun13/opt-350m-gptq-4bit"
+        quantization_config = GPTQConfig(bits=4, use_exllama=False)
+        kwargs = {
+            "pretrained_model_name_or_path": model_id,
+            "torch_dtype": torch.float16,
+            "device_map": "auto",
+            "quantization_config": quantization_config,
+        }
+        model = AutoModelForCausalLM.from_pretrained(**kwargs)
+        model = prepare_model_for_kbit_training(model)
+
+        config = LoraConfig(task_type="CAUSAL_LM")
+        peft_model = get_peft_model(model, config)
+        peft_model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(peft_model.device))
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            peft_model.save_pretrained(tmp_dir)
+            model = AutoModelForCausalLM.from_pretrained(**kwargs)
+            model = PeftModel.from_pretrained(model, tmp_dir)
+            model = prepare_model_for_kbit_training(model)
+            model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(peft_model.device))
+
+            # loading a 2nd adapter works, #1239
+            model.load_adapter(tmp_dir, "adapter2")
+            model.set_adapter("adapter2")
+            model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(peft_model.device))
+
+            # check that both adapters are in the same layer
+            assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A
+            assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A
+
+    @require_bitsandbytes
+    @pytest.mark.multi_gpu_tests
+    @pytest.mark.single_gpu_tests
+    def test_lora_bnb_4bit_quantization(self):
+        r"""
+        Test that tests if the 4bit quantization using LoRA works as expected
+        """
+        whisper_4bit = WhisperForConditionalGeneration.from_pretrained(
+            self.audio_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+        )
+
+        opt_4bit = AutoModelForCausalLM.from_pretrained(
+            self.causal_lm_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+        )
+
+        flan_4bit = AutoModelForSeq2SeqLM.from_pretrained(
+            self.seq2seq_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+        )
+
+        flan_lora_config = LoraConfig(
+            r=16, lora_alpha=32, target_modules=["q", "v"], lora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM"
+        )
+
+        opt_lora_config = LoraConfig(
+            r=16,
+            lora_alpha=32,
+            target_modules=["q_proj", "v_proj"],
+            lora_dropout=0.05,
+            bias="none",
+            task_type="CAUSAL_LM",
+        )
+
+        config = LoraConfig(r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none")
+
+        flan_4bit = get_peft_model(flan_4bit, flan_lora_config)
+        assert isinstance(flan_4bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, LoraLinear4bit)
+
+        opt_4bit = get_peft_model(opt_4bit, opt_lora_config)
+        assert isinstance(opt_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear4bit)
+
+        whisper_4bit = get_peft_model(whisper_4bit, config)
+        assert isinstance(whisper_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear4bit)
+
+    @require_bitsandbytes
+    @pytest.mark.multi_gpu_tests
+    @pytest.mark.single_gpu_tests
+    def test_vera_bnb_4bit_quantization(self):
+        r"""
+        Test that tests if the 4bit quantization using VeRA works as expected
+        """
+        whisper_4bit = WhisperForConditionalGeneration.from_pretrained(
+            self.audio_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+        )
+
+        opt_4bit = AutoModelForCausalLM.from_pretrained(
+            self.causal_lm_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+        )
+
+        flan_4bit = AutoModelForSeq2SeqLM.from_pretrained(
+            self.seq2seq_model_id,
+            device_map="auto",
+            quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+        )
+
+        flan_vera_config = VeraConfig(
+            r=16, target_modules=["q", "v"], vera_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM"
+        )
+
+        opt_vera_config = VeraConfig(
+            r=16,
+            target_modules=["q_proj", "v_proj"],
+            vera_dropout=0.05,
+            bias="none",
task_type="CAUSAL_LM", + ) + + config = VeraConfig(r=32, target_modules=["q_proj", "v_proj"], vera_dropout=0.05, bias="none") + + flan_4bit = get_peft_model(flan_4bit, flan_vera_config) + assert isinstance(flan_4bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, VeraLinear4bit) + + opt_4bit = get_peft_model(opt_4bit, opt_vera_config) + assert isinstance(opt_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, VeraLinear4bit) + + whisper_4bit = get_peft_model(whisper_4bit, config) + assert isinstance(whisper_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, VeraLinear4bit) + + @require_bitsandbytes + @pytest.mark.multi_gpu_tests + @pytest.mark.single_gpu_tests + def test_randlora_bnb_4bit_quantization(self): + r""" + Test that tests if the 4bit quantization using RandLoRA works as expected + """ + whisper_4bit = WhisperForConditionalGeneration.from_pretrained( + self.audio_model_id, + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + + opt_4bit = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + + flan_4bit = AutoModelForSeq2SeqLM.from_pretrained( + self.seq2seq_model_id, + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + + flan_randlora_config = RandLoraConfig( + r=16, target_modules=["q", "v"], randlora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM" + ) + + opt_randlora_config = RandLoraConfig( + r=16, + target_modules=["q_proj", "v_proj"], + randlora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + config = RandLoraConfig(r=32, target_modules=["q_proj", "v_proj"], randlora_dropout=0.05, bias="none") + + flan_4bit = get_peft_model(flan_4bit, flan_randlora_config) + assert isinstance(flan_4bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, RandLoraLinear4bit) + + opt_4bit = get_peft_model(opt_4bit, opt_randlora_config) + 
assert isinstance(opt_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, RandLoraLinear4bit) + + whisper_4bit = get_peft_model(whisper_4bit, config) + assert isinstance(whisper_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, RandLoraLinear4bit) + + @require_bitsandbytes + @pytest.mark.multi_gpu_tests + @pytest.mark.single_gpu_tests + def test_ia3_bnb_4bit_quantization(self): + r""" + Test that tests if the 4bit quantization using IA3 works as expected + """ + whisper_4bit = WhisperForConditionalGeneration.from_pretrained( + self.audio_model_id, + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + + opt_4bit = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + + flan_4bit = AutoModelForSeq2SeqLM.from_pretrained( + self.seq2seq_model_id, + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + + flan_ia3_config = IA3Config(target_modules=["q", "v"], task_type="SEQ_2_SEQ_LM") + + opt_ia3_config = IA3Config( + target_modules=["q_proj", "v_proj", "fc2"], + feedforward_modules=["fc2"], + task_type="CAUSAL_LM", + ) + + config = IA3Config(target_modules=["q_proj", "v_proj", "fc2"], feedforward_modules=["fc2"]) + + flan_4bit = get_peft_model(flan_4bit, flan_ia3_config) + assert isinstance(flan_4bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, IA3Linear4bit) + + opt_4bit = get_peft_model(opt_4bit, opt_ia3_config) + assert isinstance(opt_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, IA3Linear4bit) + + whisper_4bit = get_peft_model(whisper_4bit, config) + assert isinstance(whisper_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, IA3Linear4bit) + + @require_bitsandbytes + @pytest.mark.multi_gpu_tests + @pytest.mark.single_gpu_tests + def test_road_bnb_4bit_quantization(self): + r""" + Test that tests if the 4bit quantization using IA3 
works as expected + """ + whisper_4bit = WhisperForConditionalGeneration.from_pretrained( + self.audio_model_id, + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + + opt_4bit = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + + flan_4bit = AutoModelForSeq2SeqLM.from_pretrained( + self.seq2seq_model_id, + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + + flan_road_config = RoadConfig(target_modules=["q", "v"], task_type="SEQ_2_SEQ_LM") + + opt_road_config = RoadConfig( + target_modules=["q_proj", "v_proj", "fc2"], + task_type="CAUSAL_LM", + ) + + config = RoadConfig(target_modules=["q_proj", "v_proj", "fc2"]) + + flan_4bit = get_peft_model(flan_4bit, flan_road_config) + assert isinstance(flan_4bit.base_model.model.encoder.block[0].layer[0].SelfAttention.q, RoadLinear4bit) + + opt_4bit = get_peft_model(opt_4bit, opt_road_config) + assert isinstance(opt_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, RoadLinear4bit) + + whisper_4bit = get_peft_model(whisper_4bit, config) + assert isinstance(whisper_4bit.base_model.model.model.decoder.layers[0].self_attn.v_proj, RoadLinear4bit) + + @pytest.mark.multi_gpu_tests + @require_torch_multi_accelerator + def test_lora_causal_lm_multi_gpu_inference(self): + r""" + Test if LORA can be used for inference on multiple GPUs. 
+ """ + lora_config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = AutoModelForCausalLM.from_pretrained(self.causal_lm_model_id, device_map="balanced") + tokenizer = AutoTokenizer.from_pretrained(self.seq2seq_model_id) + + assert set(model.hf_device_map.values()) == set(range(device_count)) + + model = get_peft_model(model, lora_config) + assert isinstance(model, PeftModel) + + dummy_input = "This is a dummy input:" + input_ids = tokenizer(dummy_input, return_tensors="pt").input_ids.to(self.device) + + # this should work without any problem + _ = model.generate(input_ids=input_ids) + + @require_torch_multi_accelerator + @pytest.mark.multi_gpu_tests + @require_bitsandbytes + def test_lora_seq2seq_lm_multi_gpu_inference(self): + r""" + Test if LORA can be used for inference on multiple GPUs - 8bit version. + """ + lora_config = LoraConfig( + r=16, lora_alpha=32, target_modules=["q", "v"], lora_dropout=0.05, bias="none", task_type="SEQ_2_SEQ_LM" + ) + + model = AutoModelForSeq2SeqLM.from_pretrained( + self.seq2seq_model_id, device_map="balanced", quantization_config=BitsAndBytesConfig(load_in_8bit=True) + ) + tokenizer = AutoTokenizer.from_pretrained(self.seq2seq_model_id) + + assert set(model.hf_device_map.values()) == set(range(device_count)) + + model = get_peft_model(model, lora_config) + assert isinstance(model, PeftModel) + assert isinstance(model.base_model.model.encoder.block[0].layer[0].SelfAttention.q, LoraLinear8bitLt) + + dummy_input = "This is a dummy input:" + input_ids = tokenizer(dummy_input, return_tensors="pt").input_ids.to(self.device) + + # this should work without any problem + _ = model.generate(input_ids=input_ids) + + @require_torch_multi_accelerator + @pytest.mark.multi_gpu_tests + @require_bitsandbytes + def test_adaption_prompt_8bit(self): + model = LlamaForCausalLM.from_pretrained( + 
"trl-internal-testing/tiny-random-LlamaForCausalLM", + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + torch_dtype=torch.float16, + device_map="auto", + ) + + model = prepare_model_for_kbit_training(model) + + config = AdaptionPromptConfig( + adapter_len=10, + adapter_layers=2, + task_type="CAUSAL_LM", + ) + model = get_peft_model(model, config) + + random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) + _ = model(random_input) + + @require_torch_multi_accelerator + @pytest.mark.multi_gpu_tests + @require_bitsandbytes + def test_adaption_prompt_4bit(self): + model = LlamaForCausalLM.from_pretrained( + "trl-internal-testing/tiny-random-LlamaForCausalLM", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + torch_dtype=torch.float16, + device_map="auto", + ) + + model = prepare_model_for_kbit_training(model) + + config = AdaptionPromptConfig( + adapter_len=10, + adapter_layers=2, + task_type="CAUSAL_LM", + ) + model = get_peft_model(model, config) + + random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) + _ = model(random_input) + + @require_non_cpu + @pytest.mark.single_gpu_tests + @require_bitsandbytes + def test_print_4bit_expected(self): + EXPECTED_TRAINABLE_PARAMS = 294912 + EXPECTED_ALL_PARAMS = 125534208 + + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + + config = LoraConfig( + r=8, + ) + model = get_peft_model(model, config) + trainable_params, all_params = model.get_nb_trainable_parameters() + + assert trainable_params == EXPECTED_TRAINABLE_PARAMS + assert all_params == EXPECTED_ALL_PARAMS + + # test with double quant + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + ) + + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=bnb_config, + ) + + config = LoraConfig( + r=8, + ) + model = get_peft_model(model, config) + trainable_params, 
all_params = model.get_nb_trainable_parameters() + + assert trainable_params == EXPECTED_TRAINABLE_PARAMS + assert all_params == EXPECTED_ALL_PARAMS + + @require_non_cpu + @pytest.mark.single_gpu_tests + @require_bitsandbytes + def test_modules_to_save_grad(self): + model_id = "bigscience/bloomz-560m" + + model = AutoModelForSequenceClassification.from_pretrained( + model_id, + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + torch_dtype=torch.float32, + ) + + model = prepare_model_for_kbit_training(model) + + config = LoraConfig( + r=16, + lora_alpha=16, + lora_dropout=0.05, + bias="none", + task_type="SEQ_CLS", + ) + + peft_model = get_peft_model(model, config) + + lm_head = peft_model.base_model.model.score + original_module = lm_head.original_module + modules_to_save = lm_head.modules_to_save.default + + inputs = torch.randn(1024).to(model.device) + o1 = lm_head(inputs) + o1.mean().backward() + + assert modules_to_save.weight.requires_grad is True + assert original_module.weight.grad is None + assert modules_to_save.weight.grad is not None + + @require_non_cpu + @pytest.mark.single_gpu_tests + @require_bitsandbytes + def test_8bit_merge_lora(self): + torch.manual_seed(1000) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + ) + random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) + out_base = F.softmax(model(random_input).logits, dim=-1) + + config = LoraConfig( + r=8, + init_lora_weights=False, + ) + model = get_peft_model(model, config) + + with torch.inference_mode(): + out_before_merge = F.softmax(model(random_input).logits, dim=-1) + + model.merge_and_unload() + with torch.inference_mode(): + out_after_merge = F.softmax(model(random_input).logits, dim=-1) + + atol = 1e-3 + rtol = 1 + assert not torch.allclose(out_base, out_before_merge, atol=atol, rtol=rtol) + assert torch.allclose(out_before_merge, out_after_merge, atol=atol, rtol=rtol) + 
assert isinstance(model, PeftModel) + assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, bnb.nn.Linear8bitLt) + assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, bnb.nn.Linear8bitLt) + + @require_non_cpu + @pytest.mark.single_gpu_tests + @require_bitsandbytes + def test_8bit_merge_and_disable_lora(self): + torch.manual_seed(1000) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + ) + random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) + # compare outputs in probability space, because logits can have outliers + # and token ids are not precise enough + out_base = F.softmax(model(random_input).logits, dim=-1) + + config = LoraConfig( + r=8, + init_lora_weights=False, + ) + model = get_peft_model(model, config) + + with torch.inference_mode(): + out_before = F.softmax(model(random_input).logits, dim=-1) + + model.merge_adapter() + with model.disable_adapter(): + with torch.inference_mode(): + out_after = F.softmax(model(random_input).logits, dim=-1) + + atol = 1e-3 + rtol = 1 + assert not torch.allclose(out_base, out_before, atol=atol, rtol=rtol) + assert torch.allclose(out_base, out_after, atol=atol, rtol=rtol) + assert isinstance(model, PeftModel) + assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, LoraLinear8bitLt) + assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear8bitLt) + + @require_non_cpu + @pytest.mark.single_gpu_tests + @require_bitsandbytes + def test_8bit_merge_lora_with_bias(self): + # same as test_8bit_merge_lora but with lora_bias=True + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + ) + random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) + out_base = F.softmax(model(random_input).logits, 
dim=-1) + + config = LoraConfig( + r=8, + init_lora_weights=False, + lora_bias=True, + ) + model = get_peft_model(model, config) + + with torch.inference_mode(): + out_before_merge = F.softmax(model(random_input).logits, dim=-1) + + model.merge_and_unload() + with torch.inference_mode(): + out_after_merge = F.softmax(model(random_input).logits, dim=-1) + + atol = 1e-3 + rtol = 1 + assert not torch.allclose(out_base, out_before_merge, atol=atol, rtol=rtol) + assert torch.allclose(out_before_merge, out_after_merge, atol=atol, rtol=rtol) + + @require_non_cpu + @pytest.mark.single_gpu_tests + @require_bitsandbytes + def test_4bit_merge_lora(self): + torch.manual_seed(3000) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_compute_dtype=torch.float32, + ) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ) + random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) + # compare outputs in probability space, because logits can have outliers + # and token ids are not precise enough + out_base = F.softmax(model(random_input).logits, dim=-1) + + config = LoraConfig( + r=8, + init_lora_weights=False, + ) + model = get_peft_model(model, config) + + with torch.inference_mode(): + out_before_merge = F.softmax(model(random_input).logits, dim=-1) + + model.merge_and_unload() + with torch.inference_mode(): + out_after_merge = F.softmax(model(random_input).logits, dim=-1) + + # tolerances are pretty high because some deviations are expected with quantization + atol = 0.01 + rtol = 10 + assert not torch.allclose(out_base, out_before_merge, atol=atol, rtol=rtol) + assert torch.allclose(out_before_merge, out_after_merge, atol=atol, rtol=rtol) + assert isinstance(model, PeftModel) + assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, bnb.nn.Linear4bit) + assert 
isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, bnb.nn.Linear4bit) + + @require_non_cpu + @pytest.mark.single_gpu_tests + @require_bitsandbytes + def test_4bit_merge_and_disable_lora(self): + torch.manual_seed(3000) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_compute_dtype=torch.float32, + ) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ) + random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) + # compare outputs in probability space, because logits can have outliers + # and token ids are not precise enough + out_base = F.softmax(model(random_input).logits, dim=-1) + + config = LoraConfig( + r=8, + init_lora_weights=False, + ) + model = get_peft_model(model, config) + + with torch.inference_mode(): + out_before = F.softmax(model(random_input).logits, dim=-1) + + model.merge_adapter() + with model.disable_adapter(): + with torch.inference_mode(): + out_after = F.softmax(model(random_input).logits, dim=-1) + + atol = 0.01 + rtol = 10 + assert not torch.allclose(out_base, out_before, atol=atol, rtol=rtol) + assert torch.allclose(out_base, out_after, atol=atol, rtol=rtol) + assert isinstance(model, PeftModel) + assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, LoraLinear4bit) + assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear4bit) + + @require_non_cpu + @pytest.mark.single_gpu_tests + @require_bitsandbytes + def test_4bit_merge_lora_with_bias(self): + # same as test_4bit_merge_lora but with lora_bias=True + torch.manual_seed(3000) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_compute_dtype=torch.float32, + ) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ) + 
random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) + # compare outputs in probability space, because logits can have outliers + # and token ids are not precise enough + out_base = F.softmax(model(random_input).logits, dim=-1) + + config = LoraConfig( + r=8, + init_lora_weights=False, + lora_bias=True, + ) + model = get_peft_model(model, config) + + with torch.inference_mode(): + out_before_merge = F.softmax(model(random_input).logits, dim=-1) + + model.merge_and_unload() + with torch.inference_mode(): + out_after_merge = F.softmax(model(random_input).logits, dim=-1) + + # tolerances are pretty high because some deviations are expected with quantization + atol = 0.01 + rtol = 10 + assert not torch.allclose(out_base, out_before_merge, atol=atol, rtol=rtol) + assert torch.allclose(out_before_merge, out_after_merge, atol=atol, rtol=rtol) + + @require_non_cpu + @pytest.mark.single_gpu_tests + @require_bitsandbytes + def test_4bit_lora_mixed_adapter_batches_lora(self): + # check that we can pass mixed adapter names to the model + torch.manual_seed(3000) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_compute_dtype=torch.float32, + ) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ).eval() + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") + # input with 9 samples + inputs = tokenizer( + [ + "Hello, my dog is cute", + "Hello, my cat is awesome", + "Hello, my fish is great", + "Salut, mon chien est mignon", + "Salut, mon chat est génial", + "Salut, mon poisson est super", + "Hallo, mein Hund ist süß", + "Hallo, meine Katze ist toll", + "Hallo, mein Fisch ist großartig", + ], + return_tensors="pt", + padding=True, + ).to(model.device) + with torch.inference_mode(): + out_base = model(**inputs).logits + + config0 = LoraConfig( + r=8, + init_lora_weights=False, + ) + model = get_peft_model(model, 
config0).eval() + with torch.inference_mode(): + out_adapter0 = model(**inputs).logits + + config1 = LoraConfig( + r=16, + init_lora_weights=False, + ) + model.add_adapter("adapter1", config1) + model.set_adapter("adapter1") + with torch.inference_mode(): + out_adapter1 = model(**inputs).logits + + atol, rtol = 3e-5, 1e-5 + # sanity check, outputs have the right shape and are not the same + assert len(out_base) >= 3 + assert len(out_base) == len(out_adapter0) == len(out_adapter1) + assert not torch.allclose(out_base, out_adapter0, atol=atol, rtol=rtol) + assert not torch.allclose(out_base, out_adapter1, atol=atol, rtol=rtol) + assert not torch.allclose(out_adapter0, out_adapter1, atol=atol, rtol=rtol) + + # mixed adapter batch + adapters = ["__base__", "default", "adapter1"] + adapter_names = [adapters[i % 3] for i in (range(9))] + with torch.inference_mode(): + out_mixed = model(**inputs, adapter_names=adapter_names).logits + + assert torch.allclose(out_base[::3], out_mixed[::3], atol=atol, rtol=rtol) + assert torch.allclose(out_adapter0[1::3], out_mixed[1::3], atol=atol, rtol=rtol) + assert torch.allclose(out_adapter1[2::3], out_mixed[2::3], atol=atol, rtol=rtol) + + @require_non_cpu + @pytest.mark.single_gpu_tests + @require_bitsandbytes + def test_8bit_lora_mixed_adapter_batches_lora(self): + # check that we can pass mixed adapter names to the model + # note that with 8bit, we have quite a bit of imprecision, therefore we use softmax and higher tolerances + torch.manual_seed(3000) + bnb_config = BitsAndBytesConfig(load_in_8bit=True) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ).eval() + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") + # input with 9 samples + inputs = tokenizer( + [ + "Hello, my dog is cute", + "Hello, my cat is awesome", + "Hello, my fish is great", + "Salut, mon chien est mignon", + "Salut, mon chat est génial", + "Salut, mon poisson 
est super", + "Hallo, mein Hund ist süß", + "Hallo, meine Katze ist toll", + "Hallo, mein Fisch ist großartig", + ], + return_tensors="pt", + padding=True, + ).to(model.device) + with torch.inference_mode(): + out_base = F.softmax(model(**inputs).logits, dim=-1) + + config0 = LoraConfig( + r=8, + init_lora_weights=False, + ) + model = get_peft_model(model, config0).eval() + with torch.inference_mode(): + out_adapter0 = F.softmax(model(**inputs).logits, dim=-1) + + config1 = LoraConfig( + r=16, + init_lora_weights=False, + ) + model.add_adapter("adapter1", config1) + model.set_adapter("adapter1") + with torch.inference_mode(): + out_adapter1 = F.softmax(model(**inputs).logits, dim=-1) + + atol = 0.01 + rtol = 0.5 + # sanity check, outputs have the right shape and are not the same + assert len(out_base) >= 3 + assert len(out_base) == len(out_adapter0) == len(out_adapter1) + assert not torch.allclose(out_base, out_adapter0, atol=atol, rtol=rtol) + assert not torch.allclose(out_base, out_adapter1, atol=atol, rtol=rtol) + assert not torch.allclose(out_adapter0, out_adapter1, atol=atol, rtol=rtol) + + # mixed adapter batch + adapters = ["__base__", "default", "adapter1"] + adapter_names = [adapters[i % 3] for i in (range(9))] + with torch.inference_mode(): + out_mixed = F.softmax(model(**inputs, adapter_names=adapter_names).logits, dim=-1) + + assert torch.allclose(out_base[::3], out_mixed[::3], atol=atol, rtol=rtol) + assert torch.allclose(out_adapter0[1::3], out_mixed[1::3], atol=atol, rtol=rtol) + assert torch.allclose(out_adapter1[2::3], out_mixed[2::3], atol=atol, rtol=rtol) + + @require_non_cpu + @pytest.mark.single_gpu_tests + def test_serialization_shared_tensors(self): + model_checkpoint = "roberta-base" + peft_config = LoraConfig( + task_type=TaskType.TOKEN_CLS, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias="all" + ) + model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=11).to(self.device) + model = 
get_peft_model(model, peft_config) + + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, safe_serialization=True) + + @require_non_cpu + @pytest.mark.single_gpu_tests + @require_deterministic_for_xpu + @require_bitsandbytes + def test_4bit_dora_inference(self): + # check for same result with and without DoRA when initializing with init_lora_weights=False + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_compute_dtype=torch.float32, + ) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ) + + torch.manual_seed(0) + config_lora = LoraConfig(r=8, init_lora_weights=False, use_dora=False) + model = get_peft_model(model, config_lora).eval() + + random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) + logits_lora = model(random_input).logits + + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ) + torch.manual_seed(0) + config_dora = LoraConfig(r=8, init_lora_weights=False, use_dora=True) + model = get_peft_model(model, config_dora).eval() + + logits_dora = model(random_input).logits + + assert torch.allclose(logits_lora, logits_dora) + # sanity check + assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, LoraLinear4bit) + assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear4bit) + + @require_non_cpu + @pytest.mark.single_gpu_tests + @require_deterministic_for_xpu + @require_bitsandbytes + def test_8bit_dora_inference(self): + # check for same result with and without DoRA when initializing with init_lora_weights=False + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + torch_dtype=torch.float32, + ).eval() + + torch.manual_seed(0) + config_lora = 
LoraConfig(r=8, init_lora_weights=False, use_dora=False) + model = get_peft_model(model, config_lora).eval() + + random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) + logits_lora = model(random_input).logits + + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + torch_dtype=torch.float32, + ) + torch.manual_seed(0) + config_dora = LoraConfig(r=8, init_lora_weights=False, use_dora=True) + model = get_peft_model(model, config_dora).eval() + + logits_dora = model(random_input).logits + + assert torch.allclose(logits_lora, logits_dora) + # sanity check + assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.q_proj, LoraLinear8bitLt) + assert isinstance(model.base_model.model.model.decoder.layers[0].self_attn.v_proj, LoraLinear8bitLt) + + @require_non_cpu + @pytest.mark.single_gpu_tests + @require_bitsandbytes + def test_4bit_dora_merging(self): + # Check results for merging, unmerging, unloading + torch.manual_seed(0) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_compute_dtype=torch.float32, + ) + model = AutoModelForCausalLM.from_pretrained( + "trl-internal-testing/tiny-random-LlamaForCausalLM", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ).eval() + random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) + # compare outputs in probability space, because logits can have outliers + # and token ids are not precise enough + out_base = F.softmax(model(random_input).logits, dim=-1) + + config = LoraConfig( + r=8, + init_lora_weights=False, + use_dora=True, + ) + model = get_peft_model(model, config).eval() + + # Note: By default, DoRA is a no-op before training, even if we set init_lora_weights=False. In order to + # measure any differences, we need to change the magnitude vector. 
+ for name, module in model.named_modules(): + if isinstance(module, LoraLinear4bit): + module.lora_magnitude_vector["default"].weight = torch.nn.Parameter( + 10 * torch.rand_like(module.lora_magnitude_vector["default"].weight) + ) + + with torch.inference_mode(): + out_dora = F.softmax(model(random_input).logits, dim=-1) + + model.merge_adapter() + out_merged = F.softmax(model(random_input).logits, dim=-1) + + model.unmerge_adapter() + out_unmerged = F.softmax(model(random_input).logits, dim=-1) + + model = model.merge_and_unload() + out_unloaded = F.softmax(model(random_input).logits, dim=-1) + + atol = 1e-5 + rtol = 1e-3 + # sanity check that using DoRA changes the results + assert not torch.allclose(out_base, out_dora, atol=atol, rtol=rtol) + assert torch.allclose(out_dora, out_merged, atol=atol, rtol=rtol) + assert torch.allclose(out_dora, out_unmerged, atol=atol, rtol=rtol) + assert torch.allclose(out_dora, out_unloaded, atol=atol, rtol=rtol) + + @require_non_cpu + @pytest.mark.single_gpu_tests + @require_bitsandbytes + def test_8bit_dora_merging(self): + # Check results for merging, unmerging, unloading + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + torch_dtype=torch.float32, + ).eval() + + random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) + # compare outputs in probability space, because logits can have outliers + # and token ids are not precise enough + out_base = F.softmax(model(random_input).logits, dim=-1) + + config = LoraConfig( + r=8, + init_lora_weights=False, + use_dora=True, + ) + model = get_peft_model(model, config).eval() + + # Note: By default, DoRA is a no-op before training, even if we set init_lora_weights=False. In order to + # measure any differences, we need to change the magnitude vector. 
+ for name, module in model.named_modules(): + if isinstance(module, LoraLinear8bitLt): + module.lora_magnitude_vector["default"].weight = torch.nn.Parameter( + 10 * torch.rand_like(module.lora_magnitude_vector["default"].weight) + ) + + with torch.inference_mode(): + out_dora = F.softmax(model(random_input).logits, dim=-1) + + model.merge_adapter() + out_merged = F.softmax(model(random_input).logits, dim=-1) + + model.unmerge_adapter() + out_unmerged = F.softmax(model(random_input).logits, dim=-1) + + model = model.merge_and_unload() + out_unloaded = F.softmax(model(random_input).logits, dim=-1) + + atol = 1e-3 + rtol = 1 + # sanity check that using DoRA changes the results + assert not torch.allclose(out_base, out_dora, atol=atol, rtol=rtol) + assert torch.allclose(out_dora, out_merged, atol=atol, rtol=rtol) + assert torch.allclose(out_dora, out_unmerged, atol=atol, rtol=rtol) + assert torch.allclose(out_dora, out_unloaded, atol=atol, rtol=rtol) + + @pytest.mark.single_gpu_tests + def test_dora_ephemeral_gpu_offload(self): + torch.manual_seed(0) + + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + torch_dtype=torch.float32, + ).eval() + + config = LoraConfig( + r=128, + init_lora_weights=False, + use_dora=True, + runtime_config=LoraRuntimeConfig( + ephemeral_gpu_offload=True + ), # we enable this, but only to verify that it's gone later + ) + peft_model = get_peft_model(model, config).eval() + # Check that ephemeral GPU offloading is present + assert peft_model.peft_config["default"].runtime_config.ephemeral_gpu_offload + + # Save to disk + with tempfile.TemporaryDirectory() as tmp_dir: + peft_model.save_pretrained(tmp_dir) + + # Load from disk 100% on CPU without ephemeral GPU offloading + peft_model_cpu = PeftModel.from_pretrained( + model, + tmp_dir, + device_map={"": "cpu"}, + ).eval() + + # Check that ephemeral GPU offloading is absent + assert not peft_model_cpu.peft_config["default"].runtime_config.ephemeral_gpu_offload + + # Load 
again, with ephemeral GPU offloading enabled + peft_model_ego = PeftModel.from_pretrained( + model, + tmp_dir, + device_map={"": "cpu"}, + ephemeral_gpu_offload=True, + ).eval() + + random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) + with torch.inference_mode(): + out_peft_model_cpu = F.softmax(peft_model_cpu(random_input).logits, dim=-1) + out_peft_model_ego = F.softmax(peft_model_ego(random_input).logits, dim=-1) + + # The results should be the same + assert torch.allclose(out_peft_model_cpu, out_peft_model_ego) + + @require_torch_multi_accelerator + @pytest.mark.multi_gpu_tests + def test_dora_ephemeral_gpu_offload_multigpu(self): + torch.manual_seed(0) + + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + torch_dtype=torch.float32, + ).eval() + + config = LoraConfig( + r=16, # too small and the time difference is too small + init_lora_weights=False, + use_dora=True, + runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=True), + ) + peft_model = get_peft_model(model, config).eval() + + layer = peft_model.base_model.model.model.decoder.layers[0].self_attn.v_proj + lora_A, lora_B = layer.lora_A, layer.lora_B + + possible_combinations = ["cpu", self.device, f"{self.device}:0", f"{self.device}:1"] + adapter_name = layer.active_adapter[0] + for device_A in possible_combinations: + la = lora_A.to(device_A) + for device_B in possible_combinations: + lb = lora_B.to(device_B) + layer.lora_A, layer.lora_B = la, lb + layer.lora_variant[adapter_name].init(layer, adapter_name=adapter_name) # should not raise an error + + @require_non_cpu + @pytest.mark.single_gpu_tests + @require_bitsandbytes + def test_8bit_road_merging(self): + # Check results for merging, unmerging, unloading + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + torch_dtype=torch.float32, + ).eval() + + random_input = torch.LongTensor([[1, 0, 1, 0, 1, 
0]]).to(model.device) + # compare outputs in probability space, because logits can have outliers + # and token ids are not precise enough + out_base = F.softmax(model(random_input).logits, dim=-1) + + config = RoadConfig( + init_weights=False, + ) + model = get_peft_model(model, config).eval() + + with torch.inference_mode(): + out_road = F.softmax(model(random_input).logits, dim=-1) + + model.merge_adapter() + out_merged = F.softmax(model(random_input).logits, dim=-1) + + model.unmerge_adapter() + out_unmerged = F.softmax(model(random_input).logits, dim=-1) + + model = model.merge_and_unload() + out_unloaded = F.softmax(model(random_input).logits, dim=-1) + + atol = 1e-3 + rtol = 1 + # sanity check that using DoRA changes the results + assert not torch.allclose(out_base, out_road, atol=atol, rtol=rtol) + assert torch.allclose(out_road, out_merged, atol=atol, rtol=rtol) + assert torch.allclose(out_road, out_unmerged, atol=atol, rtol=rtol) + assert torch.allclose(out_road, out_unloaded, atol=atol, rtol=rtol) + + @require_non_cpu + @pytest.mark.single_gpu_tests + @require_bitsandbytes + def test_4bit_road_merging(self): + # Check results for merging, unmerging, unloading + torch.manual_seed(0) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=False, + bnb_4bit_compute_dtype=torch.float32, + ) + model = AutoModelForCausalLM.from_pretrained( + "trl-internal-testing/tiny-random-LlamaForCausalLM", + quantization_config=bnb_config, + torch_dtype=torch.float32, + ).eval() + random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) + # compare outputs in probability space, because logits can have outliers + # and token ids are not precise enough + out_base = model(random_input).logits + probs_base = F.softmax(out_base, dim=-1) + + config = RoadConfig( + init_weights=False, + group_size=4, + ) + model = get_peft_model(model, config).eval() + + with torch.inference_mode(): + out_road = model(random_input).logits + probs_road = 
F.softmax(out_road, dim=-1) + + model.merge_adapter() + probs_merged = F.softmax(model(random_input).logits, dim=-1) + + model.unmerge_adapter() + probs_unmerged = F.softmax(model(random_input).logits, dim=-1) + + model = model.merge_and_unload() + probs_unloaded = F.softmax(model(random_input).logits, dim=-1) + + atol = 1e-5 + rtol = 1e-3 + # sanity check that using DoRA changes the results + # we compare outputs instead of logits because they may not be sensitive enough + assert not torch.allclose(out_base, out_road, atol=atol, rtol=rtol) + assert torch.allclose(probs_road, probs_merged, atol=atol, rtol=rtol) + assert torch.allclose(probs_road, probs_unmerged, atol=atol, rtol=rtol) + assert torch.allclose(probs_road, probs_unloaded, atol=atol, rtol=rtol) + + def test_apply_GS_hra_inference(self): + # check for different result with and without apply_GS + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + torch_dtype=torch.float32, + ).eval() + + torch.manual_seed(0) + config_hra = HRAConfig(r=8, init_weights=True, apply_GS=False) + model = get_peft_model(model, config_hra).eval() + + random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) + logits_hra = model(random_input).logits + + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + torch_dtype=torch.float32, + ) + torch.manual_seed(0) + config_hra_GS = HRAConfig(r=8, init_weights=True, apply_GS=True) + model = get_peft_model(model, config_hra_GS) + + logits_hra_GS = model(random_input).logits + + assert not torch.allclose(logits_hra, logits_hra_GS) + + @require_non_cpu + @pytest.mark.single_gpu_tests + def test_apply_GS_hra_conv2d_inference(self): + # check for different result with and without apply_GS + model_id = "microsoft/resnet-18" + image_processor = AutoImageProcessor.from_pretrained(model_id) + image = load_cat_image() + data = image_processor(image, return_tensors="pt") + + model = AutoModelForImageClassification.from_pretrained(model_id).eval() + 
torch.manual_seed(0) + config_hra = HRAConfig(r=8, init_weights=True, target_modules=["convolution"], apply_GS=False) + model = get_peft_model(model, config_hra).eval() + + logits_hra = model(**data).logits + + model = AutoModelForImageClassification.from_pretrained(model_id).eval() + torch.manual_seed(0) + config_hra_GS = HRAConfig(r=8, init_weights=True, target_modules=["convolution"], apply_GS=True) + model = get_peft_model(model, config_hra_GS) + + logits_hra_GS = model(**data).logits + + assert not torch.allclose(logits_hra, logits_hra_GS) + + @require_non_cpu + @pytest.mark.single_gpu_tests + def test_r_odd_hra_inference(self): + # check that an untrained HRA adapter can't be initialized as an identity tranformation + # when r is an odd number + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + torch_dtype=torch.float32, + ).eval() + + random_input = torch.LongTensor([[1, 0, 1, 0, 1, 0]]).to(model.device) + + torch.manual_seed(0) + logits = model(random_input).logits + + config_hra = HRAConfig(r=7, init_weights=True, apply_GS=False) + model = get_peft_model(model, config_hra).eval() + logits_hra = model(random_input).logits + + assert not torch.allclose(logits, logits_hra) + + +@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="test requires a GPU or XPU") +@pytest.mark.single_gpu_tests +class TestSameAdapterDifferentDevices: + device = infer_device() + + # 1639 + # The original issue comes down to the following problem: If the user has a base layer on CUDA, moves the adapter to + # CPU, then adds another adapter (which will automatically be moved to CUDA), then the first adapter will also be + # moved to CUDA. 
+ @pytest.fixture + def mlp(self): + class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.lin0 = nn.Linear(8, 32, bias=bias) + self.lin1 = nn.Linear(32, 2, bias=bias) + + return MLP() + + @pytest.fixture + def emb_conv1d(self): + class ModelEmbConv1D(nn.Module): + def __init__(self, emb_size=100): + super().__init__() + self.emb = nn.Embedding(emb_size, 5) + self.conv1d = Conv1D(1, 5) + + return ModelEmbConv1D() + + @pytest.fixture + def conv2d(self): + class ModelConv2D(nn.Module): + def __init__(self): + super().__init__() + self.conv2d = nn.Conv2d(5, 10, 3) + + return ModelConv2D() + + def test_lora_one_target_add_new_adapter_does_not_change_device(self, mlp): + config = LoraConfig(target_modules=["lin0"]) + model = get_peft_model(mlp, config) + model = model.to(self.device) + model.lin0.lora_A.cpu() + model.lin0.lora_B.cpu() + + # check that the adapter is indeed on CPU and the base model on GPU + assert model.lin0.lora_A.default.weight.device.type == "cpu" + assert model.lin0.lora_B.default.weight.device.type == "cpu" + assert model.lin0.base_layer.weight.device.type == self.device + + model.add_adapter("other", config) + # check that after adding a new adapter, the old adapter is still on CPU + assert model.lin0.lora_A.default.weight.device.type == "cpu" + assert model.lin0.lora_B.default.weight.device.type == "cpu" + # the rest should be on GPU + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.lora_A.other.weight.device.type == self.device + assert model.lin0.lora_B.other.weight.device.type == self.device + + def test_lora_multiple_targets_add_new_adapater_does_not_change_device(self, mlp): + # same as the previous test, but targeting multiple layers + config = LoraConfig(target_modules=["lin0", "lin1"]) + model = get_peft_model(mlp, config) + model = model.to(self.device) + # move lin1 to CPU but leave lin0 on GPU + model.lin1.lora_A.cpu() + model.lin1.lora_B.cpu() + + # check that the adapter 
is indeed on CPU and the base model on GPU + assert model.lin1.lora_A.default.weight.device.type == "cpu" + assert model.lin1.lora_B.default.weight.device.type == "cpu" + assert model.lin1.base_layer.weight.device.type == self.device + assert model.lin0.lora_A.default.weight.device.type == self.device + assert model.lin0.lora_B.default.weight.device.type == self.device + assert model.lin0.base_layer.weight.device.type == self.device + + model.add_adapter("other", config) + # check that after adding a new adapter, the old adapter is still on CPU + assert model.lin1.lora_A.default.weight.device.type == "cpu" + assert model.lin1.lora_B.default.weight.device.type == "cpu" + assert model.lin1.base_layer.weight.device.type == self.device + # the rest should be on GPU + assert model.lin0.lora_A.default.weight.device.type == self.device + assert model.lin0.lora_B.default.weight.device.type == self.device + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.lora_A.other.weight.device.type == self.device + assert model.lin0.lora_B.other.weight.device.type == self.device + assert model.lin1.lora_A.other.weight.device.type == self.device + assert model.lin1.lora_B.other.weight.device.type == self.device + + def test_lora_embedding_target_add_new_adapter_does_not_change_device(self, emb_conv1d): + # same as first test, but targeting the embedding layer + config = LoraConfig(target_modules=["emb"]) + model = get_peft_model(emb_conv1d, config) + model = model.to(self.device) + model.emb.lora_embedding_A.cpu() + model.emb.lora_embedding_B.cpu() + + # check that the adapter is indeed on CPU and the base model on GPU + assert model.emb.lora_embedding_A.default.device.type == "cpu" + assert model.emb.lora_embedding_B.default.device.type == "cpu" + assert model.emb.weight.device.type == self.device + + model.add_adapter("other", config) + # check that after adding a new adapter, the old adapter is still on CPU + assert 
model.emb.lora_embedding_A.default.device.type == "cpu" + assert model.emb.lora_embedding_B.default.device.type == "cpu" + # the rest should be on GPU + assert model.emb.weight.device.type == self.device + assert model.emb.lora_embedding_A.other.device.type == self.device + assert model.emb.lora_embedding_B.other.device.type == self.device + + def test_lora_conv1d_target_add_new_adapter_does_not_change_device(self, emb_conv1d): + # same as first test, but targeting the Conv1D layer + config = LoraConfig(target_modules=["conv1d"]) + model = get_peft_model(emb_conv1d, config) + model = model.to(self.device) + model.conv1d.lora_A.cpu() + model.conv1d.lora_B.cpu() + + # check that the adapter is indeed on CPU and the base model on GPU + assert model.conv1d.lora_A.default.weight.device.type == "cpu" + assert model.conv1d.lora_B.default.weight.device.type == "cpu" + assert model.conv1d.weight.device.type == self.device + + model.add_adapter("other", config) + # check that after adding a new adapter, the old adapter is still on CPU + assert model.conv1d.lora_A.default.weight.device.type == "cpu" + assert model.conv1d.lora_B.default.weight.device.type == "cpu" + # the rest should be on GPU + assert model.conv1d.weight.device.type == self.device + assert model.conv1d.lora_A.other.weight.device.type == self.device + assert model.conv1d.lora_B.other.weight.device.type == self.device + + def test_lora_dora_add_new_adapter_does_not_change_device(self, mlp): + # same as first test, but also using DoRA + config = LoraConfig(target_modules=["lin0"], use_dora=True) + model = get_peft_model(mlp, config) + model = model.to(self.device) + model.lin0.lora_A.cpu() + model.lin0.lora_B.cpu() + model.lin0.lora_magnitude_vector.cpu() + + # check that the adapter is indeed on CPU and the base model on GPU + assert model.lin0.lora_A.default.weight.device.type == "cpu" + assert model.lin0.lora_B.default.weight.device.type == "cpu" + assert 
model.lin0.lora_magnitude_vector.default.weight.device.type == "cpu" + assert model.lin0.base_layer.weight.device.type == self.device + + model.add_adapter("other", config) + # check that after adding a new adapter, the old adapter is still on CPU + assert model.lin0.lora_A.default.weight.device.type == "cpu" + assert model.lin0.lora_B.default.weight.device.type == "cpu" + assert model.lin0.lora_magnitude_vector.default.weight.device.type == "cpu" + # the rest should be on GPU + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.lora_A.other.weight.device.type == self.device + assert model.lin0.lora_B.other.weight.device.type == self.device + assert model.lin0.lora_magnitude_vector.other.weight.device.type == self.device + + def test_adalora_add_new_adapter_does_not_change_device(self, mlp): + # same as first test, but using AdaLORA + # AdaLora does not like multiple trainable adapters, hence inference_mode=True + config = AdaLoraConfig(target_modules=["lin0"], inference_mode=True, total_step=1) + model = get_peft_model(mlp, config) + model = model.to(self.device) + model.lin0.lora_A.cpu() + model.lin0.lora_E.cpu() + + # check that the adapter is indeed on CPU and the base model on GPU + assert model.lin0.lora_A.default.device.type == "cpu" + assert model.lin0.lora_E.default.device.type == "cpu" + assert model.lin0.base_layer.weight.device.type == self.device + + model.add_adapter("other", config) + # check that after adding a new adapter, the old adapter is still on CPU + assert model.lin0.lora_A.default.device.type == "cpu" + assert model.lin0.lora_E.default.device.type == "cpu" + # the rest should be on GPU + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.lora_A.other.device.type == self.device + assert model.lin0.lora_E.other.device.type == self.device + + def test_boft_add_new_adapter_does_not_change_device(self, mlp): + # same as first test, but using BoFT + config = 
BOFTConfig(target_modules=["lin0"]) + model = get_peft_model(mlp, config) + model = model.to(self.device) + model.lin0.boft_R.cpu() + model.lin0.boft_s.cpu() + + # check that the adapter is indeed on CPU and the base model on GPU + assert model.lin0.boft_R.default.device.type == "cpu" + assert model.lin0.boft_s.default.device.type == "cpu" + assert model.lin0.base_layer.weight.device.type == self.device + + model.add_adapter("other", config) + # check that after adding a new adapter, the old adapter is still on CPU + assert model.lin0.boft_R.default.device.type == "cpu" + assert model.lin0.boft_s.default.device.type == "cpu" + # the rest should be on GPU + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.boft_R.other.device.type == self.device + assert model.lin0.boft_s.other.device.type == self.device + + def test_ia3_add_new_adapter_does_not_change_device(self, mlp): + # same as first test, but using IA3 + config = IA3Config(target_modules=["lin0"], feedforward_modules=["lin0"]) + model = get_peft_model(mlp, config) + model = model.to(self.device) + model.lin0.ia3_l.cpu() + + # check that the adapter is indeed on CPU and the base model on GPU + assert model.lin0.ia3_l.default.device.type == "cpu" + assert model.lin0.base_layer.weight.device.type == self.device + + model.add_adapter("other", config) + # check that after adding a new adapter, the old adapter is still on CPU + assert model.lin0.ia3_l.default.device.type == "cpu" + # the rest should be on GPU + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.ia3_l.other.device.type == self.device + + @pytest.mark.xfail(reason="LN Tuning handling of multiple adapters may not be correct", strict=True) + def test_ln_tuning_add_new_adapter_does_not_change_device(self, mlp): + # same as first test, but using LN tuning + config = LNTuningConfig(target_modules=["lin0"]) + model = get_peft_model(mlp, config) + model = model.to(self.device) + 
model.lin0.ln_tuning_layers.cpu() + + # check that the adapter is indeed on CPU and the base model on GPU + assert model.lin0.ln_tuning_layers.default.weight.device.type == "cpu" + assert model.lin0.base_layer.weight.device.type == self.device + + model.add_adapter("other", config) + # check that after adding a new adapter, the old adapter is still on CPU + assert model.lin0.ln_tuning_layers.default.weight.device.type == "cpu" + # the rest should be on GPU + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.ln_tuning_layers.other.weight.device.type == self.device + + def test_loha_add_new_adapter_does_not_change_device(self, mlp): + # same as first test, but using LoHa + config = LoHaConfig(target_modules=["lin0"]) + model = get_peft_model(mlp, config) + model = model.to(self.device) + model.lin0.hada_w1_a.cpu() + model.lin0.hada_w2_b.cpu() + + # check that the adapter is indeed on CPU and the base model on GPU + assert model.lin0.hada_w1_a.default.device.type == "cpu" + assert model.lin0.hada_w2_b.default.device.type == "cpu" + assert model.lin0.base_layer.weight.device.type == self.device + + model.add_adapter("other", config) + # check that after adding a new adapter, the old adapter is still on CPU + assert model.lin0.hada_w1_a.default.device.type == "cpu" + assert model.lin0.hada_w2_b.default.device.type == "cpu" + # the rest should be on GPU + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.hada_w1_a.other.device.type == self.device + assert model.lin0.hada_w2_b.other.device.type == self.device + + def test_lokr_add_new_adapter_does_not_change_device(self, mlp): + # same as first test, but using LoKr + config = LoKrConfig(target_modules=["lin0"]) + model = get_peft_model(mlp, config) + model = model.to(self.device) + model.lin0.lokr_w1.cpu() + model.lin0.lokr_w2.cpu() + + # check that the adapter is indeed on CPU and the base model on GPU + assert model.lin0.lokr_w1.default.device.type == 
"cpu" + assert model.lin0.lokr_w2.default.device.type == "cpu" + assert model.lin0.base_layer.weight.device.type == self.device + + model.add_adapter("other", config) + # check that after adding a new adapter, the old adapter is still on CPU + assert model.lin0.lokr_w1.default.device.type == "cpu" + assert model.lin0.lokr_w2.default.device.type == "cpu" + # the rest should be on GPU + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.lokr_w1.other.device.type == self.device + assert model.lin0.lokr_w2.other.device.type == self.device + + def test_oft_add_new_adapter_does_not_change_device(self, mlp): + # same as first test, but using OFT + config = OFTConfig(target_modules=["lin0"]) + model = get_peft_model(mlp, config) + model = model.to(self.device) + model.lin0.oft_R.default.cpu() + + # check that the adapter is indeed on CPU and the base model on GPU + assert model.lin0.oft_R.default.weight.device.type == "cpu" + assert model.lin0.base_layer.weight.device.type == self.device + + model.add_adapter("other", config) + # check that after adding a new adapter, the old adapter is still on CPU + assert model.lin0.oft_R.default.weight.device.type == "cpu" + # the rest should be on GPU + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.oft_R.other.weight.device.type == self.device + + def test_vera_add_new_adapter_does_not_change_device(self, mlp): + # same as first test, but using VERA + config = VeraConfig(target_modules=["lin0"]) + model = get_peft_model(mlp, config) + model = model.to(self.device) + model.lin0.vera_A.cpu() + model.lin0.vera_lambda_d.cpu() + + # check that the adapter is indeed on CPU and the base model on GPU + assert model.lin0.vera_A.default.device.type == "cpu" + assert model.lin0.vera_lambda_d.default.device.type == "cpu" + assert model.lin0.base_layer.weight.device.type == self.device + + model.add_adapter("other", config) + # check that after adding a new adapter, the old 
adapter is still on CPU + assert model.lin0.vera_A.default.device.type == "cpu" + assert model.lin0.vera_lambda_d.default.device.type == "cpu" + # the rest should be on GPU + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.vera_A.other.device.type == self.device + assert model.lin0.vera_lambda_d.other.device.type == self.device + + def test_randlora_add_new_adapter_does_not_change_device(self, mlp): + # same as first test, but using RandLora + config = RandLoraConfig(target_modules=["lin0"]) + model = get_peft_model(mlp, config) + model = model.to(self.device) + model.lin0.randlora_A.cpu() + model.lin0.randlora_lambda.cpu() + + # check that the adapter is indeed on CPU and the base model on GPU + assert model.lin0.randlora_A.default.device.type == "cpu" + assert model.lin0.randlora_lambda.default.device.type == "cpu" + assert model.lin0.base_layer.weight.device.type == self.device + + model.add_adapter("other", config) + # check that after adding a new adapter, the old adapter is still on CPU + assert model.lin0.randlora_A.default.device.type == "cpu" + assert model.lin0.randlora_lambda.default.device.type == "cpu" + # the rest should be on GPU + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.randlora_A.other.device.type == self.device + assert model.lin0.randlora_lambda.other.device.type == self.device + + def test_vblora_add_new_adapter_does_not_change_device(self, mlp): + # same as first test, but using VBLoRA + config = VBLoRAConfig(target_modules=["lin0"], vector_length=2) + model = get_peft_model(mlp, config) + model = model.to(self.device) + model.lin0.vblora_logits_A.cpu() + model.lin0.vblora_logits_B.cpu() + model.lin0.vblora_vector_bank.cpu() + + # check that the adapter is indeed on CPU and the base model on GPU + assert model.lin0.vblora_logits_A.default.device.type == "cpu" + assert model.lin0.vblora_logits_B.default.device.type == "cpu" + assert 
model.lin0.vblora_vector_bank.default.device.type == "cpu" + assert model.lin0.base_layer.weight.device.type == self.device + + model.add_adapter("other", config) + # check that after adding a new adapter, the old adapter is still on CPU + assert model.lin0.vblora_logits_A.default.device.type == "cpu" + assert model.lin0.vblora_logits_B.default.device.type == "cpu" + assert model.lin0.vblora_vector_bank.default.device.type == "cpu" + # the rest should be on GPU + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.vblora_logits_A.other.device.type == self.device + assert model.lin0.vblora_logits_B.other.device.type == self.device + assert model.lin0.vblora_vector_bank.other.device.type == self.device + + def test_hra_add_new_adapter_does_not_change_device(self, mlp): + # same as first test, but using HRA + config = HRAConfig(target_modules=["lin0"]) + model = get_peft_model(mlp, config) + model = model.to(self.device) + model.lin0.hra_u.cpu() + + # check that the adapter is indeed on CPU and the base model on GPU + assert model.lin0.hra_u.default.device.type == "cpu" + assert model.lin0.base_layer.weight.device.type == self.device + + model.add_adapter("other", config) + # check that after adding a new adapter, the old adapter is still on CPU + assert model.lin0.hra_u.default.device.type == "cpu" + # the rest should be on GPU + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.hra_u.other.device.type == self.device + + def test_road_add_new_adapter_does_not_change_device(self, mlp): + # same as first test, but using HRA + config = RoadConfig(target_modules=["lin0"], group_size=2) + model = get_peft_model(mlp, config) + model = model.to(self.device) + model.lin0.road_theta.cpu() + + # check that the adapter is indeed on CPU and the base model on GPU + assert model.lin0.road_theta.default.device.type == "cpu" + assert model.lin0.base_layer.weight.device.type == self.device + + model.add_adapter("other", 
config) + # check that after adding a new adapter, the old adapter is still on CPU + assert model.lin0.road_theta.default.device.type == "cpu" + # the rest should be on GPU + assert model.lin0.base_layer.weight.device.type == self.device + assert model.lin0.road_theta.other.device.type == self.device diff --git a/peft/tests/test_config.py b/peft/tests/test_config.py new file mode 100644 index 0000000000000000000000000000000000000000..4a6d8cffbd5c37e12c31095f33bb0d22f8af07e2 --- /dev/null +++ b/peft/tests/test_config.py @@ -0,0 +1,592 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy +import json +import os +import pickle +import tempfile +import warnings + +import pytest + +from peft import ( + AdaLoraConfig, + AdaptionPromptConfig, + BOFTConfig, + BoneConfig, + C3AConfig, + FourierFTConfig, + HRAConfig, + IA3Config, + LNTuningConfig, + LoHaConfig, + LoKrConfig, + LoraConfig, + MissConfig, + MultitaskPromptTuningConfig, + OFTConfig, + PeftConfig, + PeftType, + PolyConfig, + PrefixTuningConfig, + PromptEncoder, + PromptEncoderConfig, + PromptTuningConfig, + RoadConfig, + ShiraConfig, + TaskType, + TrainableTokensConfig, + VBLoRAConfig, + VeraConfig, + XLoraConfig, +) + + +PEFT_MODELS_TO_TEST = [("peft-internal-testing/tiny-opt-lora-revision", "test")] + +# Config classes and their mandatory parameters +ALL_CONFIG_CLASSES = ( + (AdaLoraConfig, {"total_step": 1}), + (AdaptionPromptConfig, {}), + (BOFTConfig, {}), + (BoneConfig, {}), + (C3AConfig, {}), + (FourierFTConfig, {}), + (HRAConfig, {}), + (IA3Config, {}), + (LNTuningConfig, {}), + (LoHaConfig, {}), + (LoKrConfig, {}), + (LoraConfig, {}), + (MissConfig, {}), + (MultitaskPromptTuningConfig, {}), + (PolyConfig, {}), + (PrefixTuningConfig, {}), + (PromptEncoderConfig, {}), + (PromptTuningConfig, {}), + (RoadConfig, {}), + (ShiraConfig, {}), + (TrainableTokensConfig, {}), + (VeraConfig, {}), + (VBLoRAConfig, {}), + (XLoraConfig, {"hidden_size": 32, "adapters": {}}), +) + + +class TestPeftConfig: + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_methods(self, config_class, mandatory_kwargs): + r""" + Test if all configs have the expected methods. 
Here we test + - to_dict + - save_pretrained + - from_pretrained + - from_json_file + """ + # test if all configs have the expected methods + config = config_class(**mandatory_kwargs) + assert hasattr(config, "to_dict") + assert hasattr(config, "save_pretrained") + assert hasattr(config, "from_pretrained") + assert hasattr(config, "from_json_file") + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + @pytest.mark.parametrize("valid_task_type", list(TaskType) + [None]) + def test_valid_task_type(self, config_class, mandatory_kwargs, valid_task_type): + r""" + Test if all configs work correctly for all valid task types + """ + config_class(task_type=valid_task_type, **mandatory_kwargs) + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_invalid_task_type(self, config_class, mandatory_kwargs): + r""" + Test if all configs correctly raise the defined error message for invalid task types. + """ + invalid_task_type = "invalid-task-type" + with pytest.raises( + ValueError, + match=f"Invalid task type: '{invalid_task_type}'. 
Must be one of the following task types: {', '.join(TaskType)}.", + ): + config_class(task_type=invalid_task_type, **mandatory_kwargs) + + def test_from_peft_type(self): + r""" + Test if the config is correctly loaded using: + - from_peft_type + """ + from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING + + for peft_type in PeftType: + expected_cls = PEFT_TYPE_TO_CONFIG_MAPPING[peft_type] + mandatory_config_kwargs = {} + + if expected_cls == AdaLoraConfig: + mandatory_config_kwargs = {"total_step": 1} + + config = PeftConfig.from_peft_type(peft_type=peft_type, **mandatory_config_kwargs) + assert type(config) is expected_cls + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_from_pretrained(self, config_class, mandatory_kwargs): + r""" + Test if the config is correctly loaded using: + - from_pretrained + """ + for model_name, revision in PEFT_MODELS_TO_TEST: + # Test we can load config from delta + config_class.from_pretrained(model_name, revision=revision) + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_save_pretrained(self, config_class, mandatory_kwargs): + r""" + Test if the config is correctly saved and loaded using + - save_pretrained + """ + config = config_class(**mandatory_kwargs) + with tempfile.TemporaryDirectory() as tmp_dirname: + config.save_pretrained(tmp_dirname) + + config_from_pretrained = config_class.from_pretrained(tmp_dirname) + assert config.to_dict() == config_from_pretrained.to_dict() + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_from_json_file(self, config_class, mandatory_kwargs): + config = config_class(**mandatory_kwargs) + with tempfile.TemporaryDirectory() as tmp_dirname: + config.save_pretrained(tmp_dirname) + + config_path = os.path.join(tmp_dirname, "adapter_config.json") + config_from_json = config_class.from_json_file(config_path) + assert config.to_dict() == config_from_json + + # Also test with 
a runtime_config entry -- they should be ignored, even if they + # were accidentally saved to disk + config_from_json["runtime_config"] = {"ephemeral_gpu_offload": True} + json.dump(config_from_json, open(config_path, "w")) + + config_from_json = config_class.from_json_file(config_path) + assert config.to_dict() == config_from_json + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_to_dict(self, config_class, mandatory_kwargs): + r""" + Test if the config can be correctly converted to a dict using: + - to_dict + """ + config = config_class(**mandatory_kwargs) + assert isinstance(config.to_dict(), dict) + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_from_pretrained_cache_dir(self, config_class, mandatory_kwargs): + r""" + Test if the config is correctly loaded with extra kwargs + """ + with tempfile.TemporaryDirectory() as tmp_dirname: + for model_name, revision in PEFT_MODELS_TO_TEST: + # Test we can load config from delta + config_class.from_pretrained(model_name, revision=revision, cache_dir=tmp_dirname) + + def test_from_pretrained_cache_dir_remote(self): + r""" + Test if the config is correctly loaded with a checkpoint from the hub + """ + with tempfile.TemporaryDirectory() as tmp_dirname: + PeftConfig.from_pretrained("ybelkada/test-st-lora", cache_dir=tmp_dirname) + assert "models--ybelkada--test-st-lora" in os.listdir(tmp_dirname) + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_save_pretrained_with_runtime_config(self, config_class, mandatory_kwargs): + r""" + Test if the config correctly removes runtime config when saving + """ + with tempfile.TemporaryDirectory() as tmp_dirname: + for model_name, revision in PEFT_MODELS_TO_TEST: + cfg = config_class.from_pretrained(model_name, revision=revision) + # NOTE: cfg is always a LoraConfig here, because the configuration of the loaded model was a LoRA. 
+ # Hence we can expect a runtime_config to exist regardless of config_class. + cfg.runtime_config.ephemeral_gpu_offload = True + cfg.save_pretrained(tmp_dirname) + cfg = config_class.from_pretrained(tmp_dirname) + assert not cfg.runtime_config.ephemeral_gpu_offload + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_set_attributes(self, config_class, mandatory_kwargs): + # manually set attributes and check if they are correctly written + config = config_class(peft_type="test", **mandatory_kwargs) + + # save pretrained + with tempfile.TemporaryDirectory() as tmp_dirname: + config.save_pretrained(tmp_dirname) + + config_from_pretrained = config_class.from_pretrained(tmp_dirname) + assert config.to_dict() == config_from_pretrained.to_dict() + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_config_copy(self, config_class, mandatory_kwargs): + # see https://github.com/huggingface/peft/issues/424 + config = config_class(**mandatory_kwargs) + copied = copy.copy(config) + assert config.to_dict() == copied.to_dict() + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_config_deepcopy(self, config_class, mandatory_kwargs): + # see https://github.com/huggingface/peft/issues/424 + config = config_class(**mandatory_kwargs) + copied = copy.deepcopy(config) + assert config.to_dict() == copied.to_dict() + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_config_pickle_roundtrip(self, config_class, mandatory_kwargs): + # see https://github.com/huggingface/peft/issues/424 + config = config_class(**mandatory_kwargs) + copied = pickle.loads(pickle.dumps(config)) + assert config.to_dict() == copied.to_dict() + + def test_prompt_encoder_warning_num_layers(self): + # This test checks that if a prompt encoder config is created with an argument that is ignored, there should be + # warning. 
However, there should be no warning if the default value is used. + kwargs = { + "num_virtual_tokens": 20, + "num_transformer_submodules": 1, + "token_dim": 768, + "encoder_hidden_size": 768, + } + + # there should be no warning with just default argument for encoder_num_layer + config = PromptEncoderConfig(**kwargs) + with warnings.catch_warnings(): + PromptEncoder(config) + + # when changing encoder_num_layer, there should be a warning for MLP since that value is not used + config = PromptEncoderConfig(encoder_num_layers=123, **kwargs) + with pytest.warns(UserWarning) as record: + PromptEncoder(config) + expected_msg = "for MLP, the argument `encoder_num_layers` is ignored. Exactly 2 MLP layers are used." + assert str(record.list[0].message) == expected_msg + + @pytest.mark.parametrize( + "config_class", [LoHaConfig, LoraConfig, IA3Config, OFTConfig, BOFTConfig, HRAConfig, VBLoRAConfig] + ) + def test_save_pretrained_with_target_modules(self, config_class): + # See #1041, #1045 + config = config_class(target_modules=["a", "list"]) + with tempfile.TemporaryDirectory() as tmp_dirname: + config.save_pretrained(tmp_dirname) + + config_from_pretrained = config_class.from_pretrained(tmp_dirname) + assert config.to_dict() == config_from_pretrained.to_dict() + # explicit test that target_modules should be converted to set + assert isinstance(config_from_pretrained.target_modules, set) + + def test_regex_with_layer_indexing_lora(self): + # This test checks that an error is raised if `target_modules` is a regex expression and `layers_to_transform` or + # `layers_pattern` are not None + + invalid_config1 = {"target_modules": ".*foo", "layers_to_transform": [0]} + invalid_config2 = {"target_modules": ".*foo", "layers_pattern": ["bar"]} + + valid_config = {"target_modules": ["foo"], "layers_pattern": ["bar"], "layers_to_transform": [0]} + + with pytest.raises(ValueError, match="`layers_to_transform` cannot be used when `target_modules` is a str."): + 
LoraConfig(**invalid_config1) + + with pytest.raises(ValueError, match="`layers_pattern` cannot be used when `target_modules` is a str."): + LoraConfig(**invalid_config2) + + # should run without errors + LoraConfig(**valid_config) + + def test_ia3_is_feedforward_subset_invalid_config(self): + # This test checks that the IA3 config raises a value error if the feedforward_modules argument + # is not a subset of the target_modules argument + + # an example invalid config + invalid_config = {"target_modules": ["k", "v"], "feedforward_modules": ["q"]} + + with pytest.raises(ValueError, match="^`feedforward_modules` should be a subset of `target_modules`$"): + IA3Config(**invalid_config) + + def test_ia3_is_feedforward_subset_valid_config(self): + # This test checks that the IA3 config is created without errors with valid arguments. + # feedforward_modules should be a subset of target_modules if both are lists + + # an example valid config with regex expressions. + valid_config_regex_exp = { + "target_modules": ".*.(SelfAttention|EncDecAttention|DenseReluDense).*(q|v|wo)$", + "feedforward_modules": ".*.DenseReluDense.wo$", + } + # an example valid config with module lists. + valid_config_list = {"target_modules": ["k", "v", "wo"], "feedforward_modules": ["wo"]} + + # should run without errors + IA3Config(**valid_config_regex_exp) + IA3Config(**valid_config_list) + + def test_adalora_config_r_warning(self): + # This test checks that a warning is raised when r is set other than default in AdaLoraConfig + # No warning should be raised when initializing AdaLoraConfig with default values. 
+ kwargs = {"peft_type": "ADALORA", "task_type": "SEQ_2_SEQ_LM", "init_r": 12, "lora_alpha": 32, "total_step": 1} + # Test that no warning is raised with default initialization + with warnings.catch_warnings(): + warnings.simplefilter("error") + try: + AdaLoraConfig(**kwargs) + except Warning: + pytest.fail("AdaLoraConfig raised a warning with default initialization.") + # Test that a warning is raised when r != 8 in AdaLoraConfig + with pytest.warns(UserWarning, match="Note that `r` is not used in AdaLora and will be ignored."): + AdaLoraConfig(r=10, total_step=1) + + def test_adalora_config_correct_timing_still_works(self): + pass + + @pytest.mark.parametrize( + "timing_kwargs", + [ + {"total_step": 100, "tinit": 0, "tfinal": 0}, + {"total_step": 100, "tinit": 10, "tfinal": 10}, + {"total_step": 100, "tinit": 79, "tfinal": 20}, + {"total_step": 100, "tinit": 80, "tfinal": 19}, + ], + ) + def test_adalora_config_valid_timing_works(self, timing_kwargs): + # Make sure that passing correct timing values is not prevented by faulty config checks. + AdaLoraConfig(**timing_kwargs) # does not raise + + def test_adalora_config_invalid_total_step_raises(self): + with pytest.raises(ValueError) as e: + AdaLoraConfig(total_step=None) + assert "AdaLoRA does not work when `total_step` is None, supply a value > 0." in str(e) + + @pytest.mark.parametrize( + "timing_kwargs", + [ + {"total_step": 100, "tinit": 20, "tfinal": 80}, + {"total_step": 100, "tinit": 80, "tfinal": 20}, + {"total_step": 10, "tinit": 20, "tfinal": 0}, + {"total_step": 10, "tinit": 0, "tfinal": 10}, + {"total_step": 10, "tinit": 10, "tfinal": 0}, + {"total_step": 10, "tinit": 20, "tfinal": 0}, + {"total_step": 10, "tinit": 20, "tfinal": 20}, + {"total_step": 10, "tinit": 0, "tfinal": 20}, + ], + ) + def test_adalora_config_timing_bounds_error(self, timing_kwargs): + # Check if the user supplied timing values that will certainly fail because it breaks + # AdaLoRA assumptions. 
+ with pytest.raises(ValueError) as e: + AdaLoraConfig(**timing_kwargs) + + assert "The supplied schedule values don't allow for a budgeting phase" in str(e) + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_from_pretrained_forward_compatible(self, config_class, mandatory_kwargs, tmp_path, recwarn): + """ + Make it possible to load configs that contain unknown keys by ignoring them. + + The idea is to make PEFT configs forward-compatible with future versions of the library. + """ + config = config_class(**mandatory_kwargs) + config.save_pretrained(tmp_path) + # add a spurious key to the config + with open(tmp_path / "adapter_config.json") as f: + config_dict = json.load(f) + config_dict["foobar"] = "baz" + config_dict["spam"] = 123 + with open(tmp_path / "adapter_config.json", "w") as f: + json.dump(config_dict, f) + + msg = f"Unexpected keyword arguments ['foobar', 'spam'] for class {config_class.__name__}, these are ignored." + config_from_pretrained = config_class.from_pretrained(tmp_path) + + expected_num_warnings = 1 + # TODO: remove once Bone is removed in v0.19.0 + if config_class == BoneConfig: + expected_num_warnings = 2 # Bone has 1 more warning about it being deprecated + + assert len(recwarn) == expected_num_warnings + assert recwarn.list[-1].message.args[0].startswith(msg) + assert "foo" not in config_from_pretrained.to_dict() + assert "spam" not in config_from_pretrained.to_dict() + assert config.to_dict() == config_from_pretrained.to_dict() + assert isinstance(config_from_pretrained, config_class) + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_from_pretrained_forward_compatible_load_from_peft_config( + self, config_class, mandatory_kwargs, tmp_path, recwarn + ): + """Exact same test as before, but instead of using LoraConfig.from_pretrained, AdaLoraconfig.from_pretrained, + etc. use PeftConfig.from_pretrained. 
This covers a previously existing bug where only the known arguments from + PeftConfig would be used instead of the more specific config (which is known thanks to the peft_type + attribute). + + """ + config = config_class(**mandatory_kwargs) + config.save_pretrained(tmp_path) + # add a spurious key to the config + with open(tmp_path / "adapter_config.json") as f: + config_dict = json.load(f) + config_dict["foobar"] = "baz" + config_dict["spam"] = 123 + with open(tmp_path / "adapter_config.json", "w") as f: + json.dump(config_dict, f) + + msg = f"Unexpected keyword arguments ['foobar', 'spam'] for class {config_class.__name__}, these are ignored." + config_from_pretrained = PeftConfig.from_pretrained(tmp_path) # <== use PeftConfig here + + expected_num_warnings = 1 + # TODO: remove once Bone is removed in v0.19.0 + if config_class == BoneConfig: + expected_num_warnings = 2 # Bone has 1 more warning about it being deprecated + + assert len(recwarn) == expected_num_warnings + assert recwarn.list[-1].message.args[0].startswith(msg) + assert "foo" not in config_from_pretrained.to_dict() + assert "spam" not in config_from_pretrained.to_dict() + assert config.to_dict() == config_from_pretrained.to_dict() + assert isinstance(config_from_pretrained, config_class) + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_from_pretrained_sanity_check(self, config_class, mandatory_kwargs, tmp_path): + """Following up on the previous test about forward compatibility, we *don't* want any random json to be accepted as + a PEFT config. There should be a minimum set of required keys. + """ + non_peft_json = {"foo": "bar", "baz": 123} + with open(tmp_path / "adapter_config.json", "w") as f: + json.dump(non_peft_json, f) + + msg = f"The {config_class.__name__} config that is trying to be loaded is missing required keys: {{'peft_type'}}." 
+ with pytest.raises(TypeError, match=msg): + config_class.from_pretrained(tmp_path) + + def test_lora_config_layers_to_transform_validation(self): + """Test that specifying layers_pattern without layers_to_transform raises an error""" + with pytest.raises( + ValueError, match="When `layers_pattern` is specified, `layers_to_transform` must also be specified." + ): + LoraConfig(r=8, lora_alpha=16, target_modules=["query", "value"], layers_pattern="model.layers") + + # Test that specifying both layers_to_transform and layers_pattern works fine + config = LoraConfig( + r=8, + lora_alpha=16, + target_modules=["query", "value"], + layers_to_transform=[0, 1, 2], + layers_pattern="model.layers", + ) + assert config.layers_to_transform == [0, 1, 2] + assert config.layers_pattern == "model.layers" + + # Test that not specifying either works fine + config = LoraConfig( + r=8, + lora_alpha=16, + target_modules=["query", "value"], + ) + assert config.layers_to_transform is None + assert config.layers_pattern is None + + @pytest.mark.parametrize("version", ["0.10", "0.17.0", "1"]) + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_peft_version_is_stored(self, version, config_class, mandatory_kwargs, monkeypatch, tmp_path): + # Check that the PEFT version is automatically stored in/restored from the config file. 
+ from peft import config + + monkeypatch.setattr(config, "__version__", version) + + peft_config = config_class(**mandatory_kwargs) + assert peft_config.peft_version == version + + peft_config.save_pretrained(tmp_path) + with open(tmp_path / "adapter_config.json") as f: + config_dict = json.load(f) + assert config_dict["peft_version"] == version + + # ensure that the version from the config is being loaded, not just the current version + monkeypatch.setattr(config, "__version__", "0.1.another-version") + + # load from config + config_loaded = PeftConfig.from_pretrained(tmp_path) + assert config_loaded.peft_version == version + + # load from json + config_path = tmp_path / "adapter_config.json" + config_json = PeftConfig.from_json_file(str(config_path)) + assert config_json["peft_version"] == version + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_peft_version_is_dev_version(self, config_class, mandatory_kwargs, monkeypatch, tmp_path): + # When a dev version of PEFT is installed, the actual state of PEFT is ambiguous. Therefore, try to determine + # the commit hash too and store it as part of the version string. 
+ from peft import config + + version = "0.15.0.dev7" + monkeypatch.setattr(config, "__version__", version) + + def fake_commit_hash(pkg_name): + return "abcdef012345" + + monkeypatch.setattr(config, "_get_commit_hash", fake_commit_hash) + + peft_config = config_class(**mandatory_kwargs) + expected_version = f"{version}@{fake_commit_hash('peft')}" + assert peft_config.peft_version == expected_version + + peft_config.save_pretrained(tmp_path) + config_loaded = PeftConfig.from_pretrained(tmp_path) + assert config_loaded.peft_version == expected_version + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_peft_version_is_dev_version_but_commit_hash_cannot_be_determined( + self, config_class, mandatory_kwargs, monkeypatch, tmp_path + ): + # There can be cases where PEFT is using a dev version but the commit hash cannot be determined. In this case, + # just store the dev version string. + from peft import config + + version = "0.15.0.dev7" + monkeypatch.setattr(config, "__version__", version) + + def fake_commit_hash(pkg_name): + return None + + monkeypatch.setattr(config, "_get_commit_hash", fake_commit_hash) + + peft_config = config_class(**mandatory_kwargs) + assert peft_config.peft_version == version + "@UNKNOWN" + + peft_config.save_pretrained(tmp_path) + config_loaded = PeftConfig.from_pretrained(tmp_path) + assert config_loaded.peft_version == version + "@UNKNOWN" + + @pytest.mark.parametrize("config_class, mandatory_kwargs", ALL_CONFIG_CLASSES) + def test_peft_version_warn_when_commit_hash_errors(self, config_class, mandatory_kwargs, monkeypatch, tmp_path): + # We try to get the PEFT commit hash if a dev version is installed. But in case there is any kind of error + # there, we don't want user code to break. Instead, the code should run and a version without commit hash should + # be recorded. In addition, there should be a warning. 
+ from peft import config + + version = "0.15.0.dev7" + monkeypatch.setattr(config, "__version__", version) + + def fake_commit_hash_raises(pkg_name): + raise Exception("Error for testing purpose") + + monkeypatch.setattr(config, "_get_commit_hash", fake_commit_hash_raises) + + msg = "A dev version of PEFT is used but there was an error while trying to determine the commit hash" + with pytest.warns(UserWarning, match=msg): + peft_config = config_class(**mandatory_kwargs) + assert peft_config.peft_version == version + "@UNKNOWN" diff --git a/peft/tests/test_cpt.py b/peft/tests/test_cpt.py new file mode 100644 index 0000000000000000000000000000000000000000..6b747f8f41c7fd6416905f76f7bef67d8fdc7bb1 --- /dev/null +++ b/peft/tests/test_cpt.py @@ -0,0 +1,301 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Any, Union + +import pytest +import torch +from datasets import load_dataset +from torch.utils.data import Dataset +from tqdm import tqdm +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + DataCollatorForLanguageModeling, + Trainer, + TrainingArguments, +) + +from peft import CPTConfig, TaskType, get_peft_model + + +TEMPLATE = {"input": "input: {}", "intra_seperator": " ", "output": "output: {}", "inter_seperator": "\n"} + +MODEL_NAME = "hf-internal-testing/tiny-random-OPTForCausalLM" +MAX_INPUT_LENGTH = 1024 + + +@pytest.fixture(scope="module") +def global_tokenizer(): + """Load the tokenizer fixture for the model.""" + + return AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="right") + + +@pytest.fixture(scope="module") +def config_text(): + """Load the SST2 dataset and prepare it for testing.""" + config = CPTConfig( + cpt_token_ids=[0, 1, 2, 3, 4, 5, 6, 7], # Example token IDs for testing + cpt_mask=[1, 1, 1, 1, 1, 1, 1, 1], + cpt_tokens_type_mask=[1, 2, 2, 2, 3, 3, 3, 4], + opt_weighted_loss_type="decay", + opt_loss_decay_factor=0.95, + opt_projection_epsilon=0.2, + opt_projection_format_epsilon=0.1, + tokenizer_name_or_path=MODEL_NAME, + ) + return config + + +@pytest.fixture(scope="module") +def config_random(): + """Load the SST2 dataset and prepare it for testing.""" + config = CPTConfig( + opt_weighted_loss_type="decay", + opt_loss_decay_factor=0.95, + opt_projection_epsilon=0.2, + opt_projection_format_epsilon=0.1, + tokenizer_name_or_path=MODEL_NAME, + ) + return config + + +@pytest.fixture(scope="module") +def sst_data(): + """Load the SST2 dataset and prepare it for testing.""" + data = load_dataset("glue", "sst2") + + def add_string_labels(example): + if example["label"] == 0: + example["label_text"] = "negative" + elif example["label"] == 1: + example["label_text"] = "positive" + return example + + train_dataset = data["train"].select(range(4)).map(add_string_labels) + test_dataset = 
data["validation"].select(range(10)).map(add_string_labels) + + return {"train": train_dataset, "test": test_dataset} + + +@pytest.fixture(scope="module") +def collator(global_tokenizer): + class CPTDataCollatorForLanguageModeling(DataCollatorForLanguageModeling): + def __init__(self, tokenizer, training=True, mlm=False): + super().__init__(tokenizer, mlm=mlm) + self.training = training + self.tokenizer.add_special_tokens({"pad_token": "[PAD]"}) # mk check why needed + + def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]: + # Handle dict or lists with proper padding and conversion to tensor. + list_sample_mask = [] + for i in range(len(examples)): + if "sample_mask" in examples[i].keys(): + list_sample_mask.append(examples[i].pop("sample_mask")) + + max_len = max(len(ex["input_ids"]) for ex in examples) + + def pad_sequence(sequence, max_len, pad_value=0): + return sequence + [pad_value] * (max_len - len(sequence)) + + input_ids = torch.tensor([pad_sequence(ex["input_ids"], max_len) for ex in examples]) + attention_mask = torch.tensor([pad_sequence(ex["attention_mask"], max_len) for ex in examples]) + input_type_mask = torch.tensor([pad_sequence(ex["input_type_mask"], max_len) for ex in examples]) + + batch = {"input_ids": input_ids, "attention_mask": attention_mask, "input_type_mask": input_type_mask} + + tensor_sample_mask = batch["input_ids"].clone().long() + tensor_sample_mask[:, :] = 0 + for i in range(len(list_sample_mask)): + tensor_sample_mask[i, : len(list_sample_mask[i])] = list_sample_mask[i] + + batch["labels"] = batch["input_ids"].clone() + if not self.training: + batch["sample_mask"] = tensor_sample_mask + + return batch + + collator = CPTDataCollatorForLanguageModeling(global_tokenizer, training=True, mlm=False) + return collator + + +def dataset(data, tokenizer): + class CPTDataset(Dataset): + def __init__(self, samples, tokenizer, template, max_length=MAX_INPUT_LENGTH): + self.template = template + 
self.tokenizer = tokenizer + self.max_length = max_length + + self.attention_mask = [] + self.input_ids = [] + self.input_type_mask = [] + self.inter_seperator_ids = self._get_input_ids(template["inter_seperator"]) + + for sample_i in tqdm(samples): + input_text, label = sample_i["sentence"], sample_i["label_text"] + input_ids, attention_mask, input_type_mask = self.preprocess_sentence(input_text, label) + + self.input_ids.append(input_ids) + self.attention_mask.append(attention_mask) + self.input_type_mask.append(input_type_mask) + + def _get_input_ids(self, text): + return self.tokenizer(text, add_special_tokens=False)["input_ids"] + + def preprocess_sentence(self, input_text, label): + input_template_part_1_text, input_template_part_2_text = self.template["input"].split("{}") + input_template_tokenized_part1 = self._get_input_ids(input_template_part_1_text) + input_tokenized = self._get_input_ids(input_text) + input_template_tokenized_part2 = self._get_input_ids(input_template_part_2_text) + + sep_tokenized = self._get_input_ids(self.template["intra_seperator"]) + + label_template_part_1, label_template_part_2 = self.template["output"].split("{}") + label_template_part1_tokenized = self._get_input_ids(label_template_part_1) + label_tokenized = self._get_input_ids(label) + label_template_part2_tokenized = self._get_input_ids(label_template_part_2) + + eos = [self.tokenizer.eos_token_id] if self.tokenizer.eos_token_id is not None else [] + input_ids = ( + input_template_tokenized_part1 + + input_tokenized + + input_template_tokenized_part2 + + sep_tokenized + + label_template_part1_tokenized + + label_tokenized + + label_template_part2_tokenized + + eos + ) + + # determine label tokens, to calculate loss only over them when labels_loss == True + attention_mask = [1] * len(input_ids) + input_type_mask = ( + [1] * len(input_template_tokenized_part1) + + [2] * len(input_tokenized) + + [1] * len(input_template_tokenized_part2) + + [0] * len(sep_tokenized) + + [3] * 
len(label_template_part1_tokenized) + + [4] * len(label_tokenized) + + [3] * len(label_template_part2_tokenized) + + [0] * len(eos) + ) + + assert len(input_type_mask) == len(input_ids) == len(attention_mask) + + return input_ids, attention_mask, input_type_mask + + def __len__(self): + return len(self.input_ids) + + def __getitem__(self, idx): + return { + "input_ids": self.input_ids[idx], + "attention_mask": self.attention_mask[idx], + "input_type_mask": self.input_type_mask[idx], + } + + dataset = CPTDataset(data, tokenizer, TEMPLATE) + + return dataset + + +def test_model_initialization_text(global_tokenizer, config_text): + """Test model loading and PEFT model initialization.""" + base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME) + + model = get_peft_model(base_model, config_text) + assert model is not None, "PEFT model initialization failed" + + +def test_model_initialization_random(global_tokenizer, config_random): + """Test model loading and PEFT model initialization.""" + base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME) + + model = get_peft_model(base_model, config_random) + assert model is not None, "PEFT model initialization failed" + + +def test_model_initialization_wrong_task_type_warns(): + # TODO: adjust this test to check for an error with PEFT v0.18.0 + msg = "CPTConfig only supports task_type = CAUSAL_LM, setting it automatically" + with pytest.warns(FutureWarning, match=msg): + config = CPTConfig(task_type=TaskType.SEQ_CLS) + assert config.task_type == TaskType.CAUSAL_LM + + +def test_model_training_random(sst_data, global_tokenizer, collator, config_random): + """Perform a short training run to verify the model and data integration.""" + + base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME) + model = get_peft_model(base_model, config_random) + emb = model.prompt_encoder.default.embedding.weight.data.clone().detach() + training_args = TrainingArguments( + output_dir="./results", + per_device_train_batch_size=1, 
+ num_train_epochs=2, + remove_unused_columns=False, + save_strategy="no", + logging_steps=1, + ) + + train_dataset = dataset(sst_data["train"], global_tokenizer) + + trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, data_collator=collator) + + trainer.train() + # Verify that the embedding tensor remains unchanged (frozen) + assert torch.all(model.prompt_encoder.default.embedding.weight.data.clone().detach().cpu() == emb.cpu()) + + delta_emb = model.prompt_encoder.default.get_projection().clone().detach() + norm_delta = delta_emb.norm(dim=1).cpu() + epsilon = model.prompt_encoder.default.get_epsilon().cpu() + # Verify that the change in tokens is constrained to epsilon + assert torch.all(norm_delta <= epsilon) + + +def test_model_batch_training_text(sst_data, global_tokenizer, collator, config_text): + """Perform a short training run to verify the model and data integration.""" + + base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME) + model = get_peft_model(base_model, config_text) + emb = model.prompt_encoder.default.embedding.weight.data.clone().detach() + + training_args = TrainingArguments( + output_dir="./results", + per_device_train_batch_size=2, + num_train_epochs=2, + remove_unused_columns=False, + save_strategy="no", + logging_steps=1, + ) + + train_dataset = dataset(sst_data["train"], global_tokenizer) + + trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, data_collator=collator) + + trainer.train() + # Verify that the embedding tensor remains unchanged (frozen) + assert torch.all(model.prompt_encoder.default.embedding.weight.data.clone().detach().cpu() == emb.cpu()) + + cpt_tokens_type_mask = torch.Tensor(config_text.cpt_tokens_type_mask).long() + non_label_idx = (cpt_tokens_type_mask == 1) | (cpt_tokens_type_mask == 2) | (cpt_tokens_type_mask == 3) + + delta_emb = model.prompt_encoder.default.get_projection().clone().detach() + norm_delta = delta_emb.norm(dim=1).cpu() + epsilon = 
model.prompt_encoder.default.get_epsilon().cpu() + # Verify that the change in tokens is constrained to epsilon + assert torch.all(norm_delta <= epsilon) + # Ensure that label tokens remain unchanged + assert torch.all((norm_delta == 0) == (~non_label_idx)) diff --git a/peft/tests/test_custom_models.py b/peft/tests/test_custom_models.py new file mode 100644 index 0000000000000000000000000000000000000000..5116919978ae3c45183fe7063b15147d3f744851 --- /dev/null +++ b/peft/tests/test_custom_models.py @@ -0,0 +1,6226 @@ +#!/usr/bin/env python3 + +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy +import os +import platform +import re +import shutil +import tempfile +import time +from contextlib import contextmanager +from functools import partial + +import pytest +import torch +from safetensors.torch import load_file as safe_load_file +from torch import nn +from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification +from transformers.pytorch_utils import Conv1D + +from peft import ( + AdaLoraConfig, + BOFTConfig, + BoneConfig, + C3AConfig, + FourierFTConfig, + HRAConfig, + IA3Config, + LNTuningConfig, + LoHaConfig, + LoKrConfig, + LoraConfig, + MissConfig, + OFTConfig, + PeftModel, + PeftWarning, + RandLoraConfig, + RoadConfig, + ShiraConfig, + TaskType, + TrainableTokensConfig, + VBLoRAConfig, + VeraConfig, + WaveFTConfig, + get_peft_model, +) +from peft.tuners.tuners_utils import BaseTunerLayer +from peft.utils import AuxiliaryTrainingWrapper, infer_device + +from .testing_common import PeftCommonTester +from .testing_utils import get_state_dict, require_non_cpu, set_init_weights_false + + +# MLP is a vanilla FF network with only linear layers +# EmbConv1D has an embedding and a Conv1D layer +# Conv2D has a Conv2D layer +TEST_CASES = [ + ######## + # LoRA # + ######## + ("Vanilla MLP 1 LoRA", "MLP", LoraConfig, {"target_modules": "lin0"}), + ("Vanilla MLP 2 LoRA", "MLP", LoraConfig, {"target_modules": ["lin0"]}), + ("Vanilla MLP 3 LoRA", "MLP", LoraConfig, {"target_modules": ["lin1"]}), + ("Vanilla MLP 4 LoRA", "MLP", LoraConfig, {"target_modules": ["lin0", "lin1"]}), + ("Vanilla MLP 5 LoRA", "MLP", LoraConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}), + ( + "Vanilla MLP 6 LoRA", + "MLP", + LoraConfig, + { + "target_modules": ["lin0"], + "lora_alpha": 4, + "lora_dropout": 0.1, + }, + ), + ("Vanilla MLP 7 LoRA with DoRA", "MLP", LoraConfig, {"target_modules": ["lin0"], "use_dora": True}), + ("Vanilla MLP 8 LoRA with DoRA", "MLP", LoraConfig, {"target_modules": ["lin0", "lin1"], "use_dora": True}), + ( 
+ "Vanilla MLP 9 LoRA with DoRA", + "MLP", + LoraConfig, + {"target_modules": "lin1", "use_dora": True, "lora_alpha": 32}, + ), + ("Embedding + transformers Conv1D 1 LoRA", "EmbConv1D", LoraConfig, {"target_modules": ["conv1d"]}), + ("Embedding + transformers Conv1D 2 LoRA", "EmbConv1D", LoraConfig, {"target_modules": ["emb"]}), + ("Embedding + transformers Conv1D 3 LoRA", "EmbConv1D", LoraConfig, {"target_modules": ["emb", "conv1d"]}), + ( + "Embedding + transformers Conv1D 1 DoRA", + "EmbConv1D", + LoraConfig, + {"target_modules": ["conv1d"], "use_dora": True}, + ), + ("Embedding + transformers Conv1D 2 DoRA", "EmbConv1D", LoraConfig, {"target_modules": ["emb"], "use_dora": True}), + ( + "Embedding + transformers Conv1D 3 DoRA", + "EmbConv1D", + LoraConfig, + {"target_modules": ["emb", "conv1d"], "use_dora": True}, + ), + ( + "Embedding + transformers Conv1D 1 LoRA trainable_tokens", + "EmbConv1D", + LoraConfig, + {"target_modules": ["conv1d"], "trainable_token_indices": {"emb": [0, 10]}}, + ), + ("Conv1d LoRA", "Conv1d", LoraConfig, {"target_modules": ["conv1d"]}), + ("Conv1d LoRA with DoRA", "Conv1d", LoraConfig, {"target_modules": ["conv1d"], "use_dora": True}), + ("Conv2d 1 LoRA", "Conv2d", LoraConfig, {"target_modules": ["conv2d"]}), + ("Conv2d 2 LoRA", "Conv2d", LoraConfig, {"target_modules": ["conv2d", "lin0"]}), + ("Conv2d 1 LoRA with DoRA", "Conv2d", LoraConfig, {"target_modules": ["conv2d"], "use_dora": True}), + ("Conv2d 2 LoRA with DoRA", "Conv2d", LoraConfig, {"target_modules": ["conv2d", "lin0"], "use_dora": True}), + ("Conv2d Groups LoRA", "Conv2dGroups", LoraConfig, {"target_modules": ["conv2d"]}), + ("Conv2d Groups2 LoRA", "Conv2dGroups2", LoraConfig, {"target_modules": ["conv2d"]}), + ("Conv2d Groups LoRA with DoRA", "Conv2dGroups", LoraConfig, {"target_modules": ["conv2d"], "use_dora": True}), + ("Conv2d Groups2 LoRA with DoRA", "Conv2dGroups2", LoraConfig, {"target_modules": ["conv2d"], "use_dora": True}), + ("Conv3d 1 LoRA", "Conv3d", 
LoraConfig, {"target_modules": ["conv3d"]}), + ("Conv3d 2 LoRA", "Conv3d", LoraConfig, {"target_modules": ["conv3d", "lin0"]}), + ("Conv3d 1 LoRA with DoRA", "Conv3d", LoraConfig, {"target_modules": ["conv3d"], "use_dora": True}), + ("Conv3d 2 LoRA with DoRA", "Conv3d", LoraConfig, {"target_modules": ["conv3d", "lin0"], "use_dora": True}), + # LoRA with lora_B bias enabled (note: embedding is not supported) + # It's important to set lora_alpha != r to ensure that scaling is taken into account correctly + ( + "Vanilla MLP 1 LoRA with lora_b bias", + "MLP", + LoraConfig, + {"target_modules": ["lin0", "lin1"], "lora_bias": True, "lora_alpha": 32}, + ), + ( + "Conv2d 1 LoRA with lora_b bias", + "Conv2d", + LoraConfig, + {"target_modules": ["conv2d"], "lora_bias": True, "lora_alpha": 32}, + ), + ( + "Conv3d 1 LoRA with lora_b bias", + "Conv3d", + LoraConfig, + {"target_modules": ["conv3d"], "lora_bias": True, "lora_alpha": 32}, + ), + ("MHA 1 LoRA", "MHA", LoraConfig, {"target_modules": ["mha"]}), + ("MHA 2 LoRA", "MHA", LoraConfig, {"target_modules": ["mha", "lin0"]}), + # targeting parameters directly + ("MLP 1 using nn.Parameter LoRA", "MlpUsingParameters", LoraConfig, {"target_parameters": ["lin0.weight"]}), + ( + "MLP 2 using nn.Parameter LoRA", + "MLP", + LoraConfig, + {"target_modules": ["lin0"], "target_parameters": ["lin1.weight"]}, + ), + ####### + # IA³ # + ####### + ("Vanilla MLP 1 IA3", "MLP", IA3Config, {"target_modules": "lin0", "feedforward_modules": []}), + ("Vanilla MLP 2 IA3", "MLP", IA3Config, {"target_modules": "lin0", "feedforward_modules": "lin0"}), + ("Vanilla MLP 3 IA3", "MLP", IA3Config, {"target_modules": ["lin0"], "feedforward_modules": []}), + ("Vanilla MLP 4 IA3", "MLP", IA3Config, {"target_modules": ["lin0"], "feedforward_modules": ["lin0"]}), + ("Vanilla MLP 5 IA3", "MLP", IA3Config, {"target_modules": ["lin1"], "feedforward_modules": []}), + ("Vanilla MLP 6 IA3", "MLP", IA3Config, {"target_modules": ["lin1"], "feedforward_modules": 
["lin1"]}), + ( + "Vanilla MLP 7 IA3", + "MLP", + IA3Config, + {"target_modules": ["lin0", "lin1"], "feedforward_modules": []}, + ), + ( + "Vanilla MLP 8 IA3", + "MLP", + IA3Config, + {"target_modules": ["lin0", "lin1"], "feedforward_modules": ["lin0", "lin1"]}, + ), + ( + "Vanilla MLP 9 IA3", + "MLP", + IA3Config, + {"target_modules": ["lin0"], "modules_to_save": ["lin1"], "feedforward_modules": ["lin0"]}, + ), + ( + "transformers Conv1D 1 IA3", + "EmbConv1D", + IA3Config, + {"target_modules": ["conv1d"], "feedforward_modules": ["conv1d"]}, + ), + ( + "transformers Conv1D 2 IA3", + "EmbConv1D", + IA3Config, + {"target_modules": ["conv1d", "lin0"], "feedforward_modules": ["conv1d", "lin0"]}, + ), + ( + "transformers Conv1D 1 IA3", + "EmbConv1D", + IA3Config, + {"target_modules": ["conv1d"], "feedforward_modules": ["conv1d"], "modules_to_save": ["lin0"]}, + ), + ("Conv2d 1 IA3", "Conv2d", IA3Config, {"target_modules": ["conv2d"], "feedforward_modules": []}), + ("Conv2d 2 IA3", "Conv2d", IA3Config, {"target_modules": ["conv2d"], "feedforward_modules": ["conv2d"]}), + ( + "Conv2d 3 IA3", + "Conv2d", + IA3Config, + {"target_modules": ["conv2d", "lin0"], "feedforward_modules": []}, + ), + ( + "Conv2d 4 IA3", + "Conv2d", + IA3Config, + {"target_modules": ["conv2d", "lin0"], "feedforward_modules": ["conv2d"]}, + ), + ( + "Conv2d 5 IA3", + "Conv2d", + IA3Config, + {"target_modules": ["conv2d", "lin0"], "feedforward_modules": ["conv2d", "lin0"]}, + ), + ("Conv3d 1 IA3", "Conv3d", IA3Config, {"target_modules": ["conv3d"], "feedforward_modules": []}), + ("Conv3d 2 IA3", "Conv3d", IA3Config, {"target_modules": ["conv3d"], "feedforward_modules": ["conv3d"]}), + ( + "Conv3d 3 IA3", + "Conv3d", + IA3Config, + {"target_modules": ["conv3d", "lin0"], "feedforward_modules": []}, + ), + ( + "Conv3d 4 IA3", + "Conv3d", + IA3Config, + {"target_modules": ["conv3d", "lin0"], "feedforward_modules": ["conv3d"]}, + ), + ( + "Conv3d 5 IA3", + "Conv3d", + IA3Config, + {"target_modules": 
["conv3d", "lin0"], "feedforward_modules": ["conv3d", "lin0"]}, + ), + ######## + # LoHa # + ######## + ("Vanilla MLP 1 LOHA", "MLP", LoHaConfig, {"target_modules": "lin0"}), + ("Vanilla MLP 2 LOHA", "MLP", LoHaConfig, {"target_modules": ["lin0"]}), + ("Vanilla MLP 3 LOHA", "MLP", LoHaConfig, {"target_modules": ["lin1"]}), + ("Vanilla MLP 4 LOHA", "MLP", LoHaConfig, {"target_modules": ["lin0", "lin1"]}), + ("Vanilla MLP 5 LOHA", "MLP", LoHaConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}), + ( + "Vanilla MLP 6 LOHA", + "MLP", + LoHaConfig, + { + "target_modules": ["lin0"], + "alpha": 4, + "module_dropout": 0.1, + }, + ), + ("Vanilla MLP 7 LOHA", "MLP", LoHaConfig, {"target_modules": "lin0", "rank_dropout": 0.5}), + ("Conv2d 1 LOHA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d"]}), + ("Conv2d 2 LOHA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d", "lin0"]}), + ("Conv2d 3 LOHA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d"], "use_effective_conv2d": True}), + ("Conv2d 4 LOHA", "Conv2d", LoHaConfig, {"target_modules": ["conv2d", "lin0"], "use_effective_conv2d": True}), + ("Conv1d LOHA", "Conv1d", LoHaConfig, {"target_modules": ["conv1d"]}), + ("Conv1d LOHA 1", "Conv1d", LoHaConfig, {"target_modules": ["conv1d"]}), + ("Conv1d LOHA 2", "Conv1d", LoHaConfig, {"target_modules": ["conv1d"], "r": 2}), + ( + "Conv1d LOHA 3", + "Conv1dBigger", + LoHaConfig, + {"target_modules": ["conv1d"], "r": 2, "use_effective_conv2d": True}, + ), + ( + "Conv1d LOHA 4", + "Conv1dBigger", + LoHaConfig, + {"target_modules": ["conv1d"], "r": 2, "use_effective_conv2d": False}, + ), + ("Conv2d 1x1 LOHA", "Conv2d1x1", LoHaConfig, {"target_modules": ["conv2d"]}), + # LoKr + ("Vanilla MLP 1 LOKR", "MLP", LoKrConfig, {"target_modules": "lin0"}), + ("Vanilla MLP 2 LOKR", "MLP", LoKrConfig, {"target_modules": ["lin0"]}), + ("Vanilla MLP 3 LOKR", "MLP", LoKrConfig, {"target_modules": ["lin1"]}), + ("Vanilla MLP 4 LOKR", "MLP", LoKrConfig, {"target_modules": ["lin0", 
"lin1"]}), + ("Vanilla MLP 5 LOKR", "MLP", LoKrConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}), + ( + "Vanilla MLP 6 LOKR", + "MLP", + LoKrConfig, + { + "target_modules": ["lin0"], + "alpha": 4, + "module_dropout": 0.1, + }, + ), + ("Vanilla MLP 7 LOKR", "MLP", LoKrConfig, {"target_modules": "lin0", "rank_dropout": 0.5}), + ("Vanilla MLP 8 LOKR", "MLP", LoKrConfig, {"target_modules": "lin0", "decompose_both": True, "r": 1, "alpha": 1}), + ("Conv1d LOKR 1", "Conv1d", LoKrConfig, {"target_modules": ["conv1d"]}), + ("Conv1d LOKR 2", "Conv1d", LoKrConfig, {"target_modules": ["conv1d"], "r": 2}), + ( + "Conv1d LOKR 3", + "Conv1dBigger", + LoKrConfig, + {"target_modules": ["conv1d"], "r": 2, "use_effective_conv2d": True}, + ), + ( + "Conv1d LOKR 4", + "Conv1dBigger", + LoKrConfig, + {"target_modules": ["conv1d"], "r": 2, "use_effective_conv2d": False}, + ), + ("Conv2d 1 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d"]}), + ("Conv2d 2 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d", "lin0"]}), + ("Conv2d 3 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d"], "use_effective_conv2d": True}), + ("Conv2d 4 LOKR", "Conv2d", LoKrConfig, {"target_modules": ["conv2d", "lin0"], "use_effective_conv2d": True}), + ("Conv2d 1x1 LOKR", "Conv2d1x1", LoKrConfig, {"target_modules": ["conv2d"]}), + ( + "Conv2d 5 LOKR", + "Conv2d", + LoKrConfig, + {"target_modules": ["conv2d", "lin0"], "use_effective_conv2d": True, "decompose_both": True}, + ), + ( + "Conv2d 6 LOKR", + "Conv2d", + LoKrConfig, + {"target_modules": ["conv2d", "lin0"], "use_effective_conv2d": True, "decompose_factor": 4}, + ), + ( + "Conv2d 7 LOKR", + "Conv2d", + LoKrConfig, + { + "target_modules": ["conv2d", "lin0"], + "use_effective_conv2d": True, + "decompose_both": True, + "decompose_factor": 4, + }, + ), + ######## + # OFT # + ######## + ( + "Vanilla MLP 1 OFT", + "MLP", + OFTConfig, + {"r": 2, "oft_block_size": 0, "target_modules": "lin0", "use_cayley_neumann": False}, + ), + 
( + "Vanilla MLP 2 OFT", + "MLP", + OFTConfig, + {"r": 2, "oft_block_size": 0, "target_modules": ["lin0"], "use_cayley_neumann": False}, + ), + ( + "Vanilla MLP 5 OFT", + "MLP", + OFTConfig, + { + "r": 2, + "oft_block_size": 0, + "target_modules": ["lin0"], + "modules_to_save": ["lin1"], + "use_cayley_neumann": False, + }, + ), + ( + "Vanilla MLP 6 OFT", + "MLP", + OFTConfig, + { + "r": 2, + "oft_block_size": 0, + "target_modules": ["lin0"], + "module_dropout": 0.1, + "use_cayley_neumann": False, + }, + ), + ( + "Vanilla MLP 7 OFT", + "MLP", + OFTConfig, + {"r": 2, "oft_block_size": 0, "target_modules": ["lin0"], "coft": True, "eps": 1e-2}, + ), + ( + "Vanilla MLP 8 OFT", + "MLP", + OFTConfig, + {"r": 2, "oft_block_size": 0, "target_modules": ["lin0"], "block_share": True, "use_cayley_neumann": False}, + ), + ( + "Vanilla MLP 9 OFT", + "MLP", + OFTConfig, + {"r": 2, "oft_block_size": 0, "target_modules": ["lin0"], "coft": True, "eps": 1e-2, "block_share": True}, + ), + ( + "Vanilla MLP 10 OFT", + "MLP", + OFTConfig, + {"r": 0, "oft_block_size": 2, "target_modules": ["lin0"], "use_cayley_neumann": True}, + ), + ( + "Vanilla MLP 11 OFT", + "MLP", + OFTConfig, + {"r": 0, "oft_block_size": 2, "target_modules": ["lin0"], "use_cayley_neumann": False}, + ), + ( + "Vanilla MLP 12 OFT", + "MLP", + OFTConfig, + { + "r": 0, + "oft_block_size": 2, + "target_modules": ["lin0"], + "coft": True, + "eps": 1e-2, + "block_share": True, + "use_cayley_neumann": True, + }, + ), + ( + "Vanilla MLP 13 OFT", + "MLP", + OFTConfig, + { + "r": 0, + "oft_block_size": 2, + "target_modules": ["lin0"], + "coft": True, + "eps": 1e-2, + "block_share": True, + "use_cayley_neumann": False, + }, + ), + ("Conv2d 1 OFT", "Conv2d", OFTConfig, {"r": 5, "oft_block_size": 0, "target_modules": ["conv2d"]}), + ("Conv2d 3 OFT", "Conv2d", OFTConfig, {"r": 5, "oft_block_size": 0, "target_modules": ["conv2d"], "coft": True}), + ( + "Conv2d 4 OFT", + "Conv2d", + OFTConfig, + {"r": 5, "oft_block_size": 0, 
"target_modules": ["conv2d"], "block_share": True}, + ), + ( + "Conv2d 5 OFT", + "Conv2d", + OFTConfig, + {"r": 5, "oft_block_size": 0, "target_modules": ["conv2d"], "coft": True, "block_share": True}, + ), + ######## + # HRA # + ######## + ("Vanilla MLP 1 HRA", "MLP", HRAConfig, {"target_modules": "lin0"}), + ("Vanilla MLP 2 HRA", "MLP", HRAConfig, {"target_modules": ["lin0"]}), + ("Vanilla MLP 3 HRA", "MLP", HRAConfig, {"target_modules": ["lin0", "lin1"]}), + ("Vanilla MLP 5 HRA", "MLP", HRAConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}), + ("Conv2d 1 HRA", "Conv2d", HRAConfig, {"target_modules": ["conv2d"]}), + ######## + # Bone # + ######## + ("Vanilla MLP 1 Bone", "MLP", BoneConfig, {"target_modules": "lin0", "r": 2}), + ("Vanilla MLP 2 Bone", "MLP", BoneConfig, {"target_modules": ["lin0"], "r": 2}), + ("Vanilla MLP 3 Bone", "MLP", BoneConfig, {"target_modules": ["lin0", "lin1"], "r": 2}), + ("Vanilla MLP 5 Bone", "MLP", BoneConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"], "r": 2}), + ("Vanilla MLP 1 Bone", "MLP", BoneConfig, {"target_modules": "lin0", "r": 2, "init_weights": "bat"}), + ("Vanilla MLP 2 Bone", "MLP", BoneConfig, {"target_modules": ["lin0"], "r": 2, "init_weights": "bat"}), + ("Vanilla MLP 3 Bone", "MLP", BoneConfig, {"target_modules": ["lin0", "lin1"], "r": 2, "init_weights": "bat"}), + ( + "Vanilla MLP 5 Bone", + "MLP", + BoneConfig, + {"target_modules": ["lin0"], "modules_to_save": ["lin1"], "r": 2, "init_weights": "bat"}, + ), + ######## + # MiSS # + ######## + ("Vanilla MLP 1 MiSS", "MLP", MissConfig, {"target_modules": "lin0", "r": 2}), + ("Vanilla MLP 2 MiSS", "MLP", MissConfig, {"target_modules": ["lin0"], "r": 2}), + ("Vanilla MLP 3 MiSS", "MLP", MissConfig, {"target_modules": ["lin0", "lin1"], "r": 2}), + ("Vanilla MLP 5 MiSS", "MLP", MissConfig, {"target_modules": ["lin0"], "modules_to_save": ["lin1"], "r": 2}), + ("Vanilla MLP 1 MiSS", "MLP", MissConfig, {"target_modules": "lin0", "r": 2, 
"init_weights": "bat"}), + ("Vanilla MLP 2 MiSS", "MLP", MissConfig, {"target_modules": ["lin0"], "r": 2, "init_weights": "bat"}), + ("Vanilla MLP 3 MiSS", "MLP", MissConfig, {"target_modules": ["lin0", "lin1"], "r": 2, "init_weights": "bat"}), + ( + "Vanilla MLP 5 MiSS", + "MLP", + MissConfig, + {"target_modules": ["lin0"], "modules_to_save": ["lin1"], "r": 2, "init_weights": "bat"}, + ), + ############# + # LN Tuning # + ############# + ("LayerNorm 1 LNTuning", "MLP_LayerNorm", LNTuningConfig, {"target_modules": "layernorm0"}), + ("LayerNorm 2 LNTuning", "MLP_LayerNorm", LNTuningConfig, {"target_modules": ["layernorm0"]}), + ( + "LayerNorm 3 LNTuning", + "MLP_LayerNorm", + LNTuningConfig, + {"target_modules": ["layernorm0"], "modules_to_save": ["layernorm1"]}, + ), + ("Linear 4 LNTuning", "MLP_LayerNorm", LNTuningConfig, {"target_modules": "lin0"}), + ("Linear 5 LNTuning", "MLP_LayerNorm", LNTuningConfig, {"target_modules": ["lin0"]}), + ######## + # BOFT # + ######## + ("Vanilla MLP 1 BOFT", "MLP", BOFTConfig, {"target_modules": ["lin1"], "boft_block_size": 2}), + ( + "Vanilla MLP 2 BOFT", + "MLP", + BOFTConfig, + {"target_modules": ["lin1"], "modules_to_save": ["lin0"], "boft_block_size": 2}, + ), + ( + "Vanilla MLP 3 BOFT", + "MLP", + BOFTConfig, + { + "target_modules": ["lin1"], + "boft_block_size": 2, + "boft_dropout": 0.1, + }, + ), + ( + "Vanilla MLP 4 BOFT", + "MLP", + BOFTConfig, + {"target_modules": ["lin1"], "boft_block_size": 2, "boft_block_num": 0, "boft_n_butterfly_factor": 1}, + ), + ( + "Vanilla MLP 5 BOFT", + "MLP", + BOFTConfig, + {"target_modules": ["lin1"], "boft_block_size": 0, "boft_block_num": 2, "boft_n_butterfly_factor": 1}, + ), + ( + "Vanilla MLP 6 BOFT", + "MLP", + BOFTConfig, + {"target_modules": ["lin1"], "boft_block_size": 10, "boft_block_num": 0, "boft_n_butterfly_factor": 2}, + ), + ( + "Conv2d 1 BOFT", + "Conv2d", + BOFTConfig, + {"target_modules": ["conv2d"], "boft_block_size": 45, "boft_block_num": 0, "boft_n_butterfly_factor": 
1}, + ), + ( + "Conv2d 2 BOFT", + "Conv2d", + BOFTConfig, + {"target_modules": ["conv2d"], "boft_block_size": 0, "boft_block_num": 1, "boft_n_butterfly_factor": 1}, + ), + ( + "MLP2 1 BOFT", + "MLP2", + BOFTConfig, + {"target_modules": ["lin1"], "boft_block_size": 2, "boft_block_num": 0, "boft_n_butterfly_factor": 3}, + ), + ( + "MLP2 2 BOFT", + "MLP2", + BOFTConfig, + {"target_modules": ["lin1"], "boft_block_size": 0, "boft_block_num": 8, "boft_n_butterfly_factor": 3}, + ), + ( + "Conv2d2 1 BOFT", + "Conv2d2", + BOFTConfig, + {"target_modules": ["conv2d"], "boft_block_size": 2, "boft_block_num": 0, "boft_n_butterfly_factor": 2}, + ), + ( + "Conv2d2 1 BOFT", + "Conv2d2", + BOFTConfig, + {"target_modules": ["conv2d"], "boft_block_size": 2, "boft_block_num": 0, "boft_n_butterfly_factor": 3}, + ), + ######### + # SHiRA # + ######### + ("Vanilla MLP 1 SHiRA", "MLP", ShiraConfig, {"r": 1, "target_modules": "lin0"}), + ("Vanilla MLP 2 SHiRA", "MLP", ShiraConfig, {"r": 1, "target_modules": ["lin0"]}), + ("Vanilla MLP 3 SHiRA", "MLP", ShiraConfig, {"r": 1, "target_modules": ["lin1"]}), + ( + "Vanilla MLP 4 SHiRA", + "MLP", + ShiraConfig, + {"r": 1, "target_modules": ["lin0", "lin1"], "random_seed": 56}, + ), + ( + "Vanilla MLP 5 SHiRA", + "MLP", + ShiraConfig, + {"r": 1, "target_modules": ["lin0"]}, + ), + ######## + # VeRA # + ######## + ("Vanilla MLP 1 VeRA", "MLP", VeraConfig, {"target_modules": "lin0"}), + ("Vanilla MLP 2 VeRA", "MLP", VeraConfig, {"target_modules": ["lin0"]}), + ("Vanilla MLP 3 VeRA", "MLP", VeraConfig, {"target_modules": ["lin1"]}), + ("Vanilla MLP 4 VeRA", "MLP", VeraConfig, {"target_modules": ["lin0", "lin1"]}), + ( + "Vanilla MLP 5 VeRA", + "MLP", + VeraConfig, + {"target_modules": ["lin0"], "modules_to_save": ["lin1"]}, + ), + ( + "Embedding + transformers Conv1D 1 VeRA", + "EmbConv1D", + VeraConfig, + {"target_modules": ["conv1d"]}, + ), + ############# + # FourierFT # + ############# + # FourierFT is not initialized as an identity transform by 
default, hence set init_weights=True + ( + "Vanilla MLP 1 FourierFT", + "MLP", + FourierFTConfig, + {"n_frequency": 10, "target_modules": "lin0", "init_weights": True}, + ), + ( + "Vanilla MLP 2 FourierFT", + "MLP", + FourierFTConfig, + {"n_frequency": 10, "target_modules": ["lin0"], "init_weights": True}, + ), + ( + "Vanilla MLP 3 FourierFT", + "MLP", + FourierFTConfig, + {"n_frequency": 10, "target_modules": ["lin1"], "init_weights": True}, + ), + ( + "Vanilla MLP 5 FourierFT", + "MLP", + FourierFTConfig, + {"n_frequency": 10, "target_modules": ["lin0"], "modules_to_save": ["lin1"], "init_weights": True}, + ), + ( + "Vanilla MLP 6 FourierFT", + "MLP", + FourierFTConfig, + {"n_frequency": 10, "target_modules": ["lin0", "lin1"], "modules_to_save": ["lin1"], "init_weights": True}, + ), + ( + "Vanilla MLP 7 FourierFT", + "MLP", + FourierFTConfig, + { + "n_frequency_pattern": {"lin0": 5, "lin1": 10}, + "target_modules": ["lin0", "lin1"], + "modules_to_save": ["lin1"], + "init_weights": True, + }, + ), + ########## + # VBLoRA # + ########## + ("Vanilla MLP 1 VBLoRA", "MLP", VBLoRAConfig, {"target_modules": "lin0", "vector_length": 1, "num_vectors": 5}), + ("Vanilla MLP 2 VBLoRA", "MLP", VBLoRAConfig, {"target_modules": ["lin0"], "vector_length": 1, "num_vectors": 5}), + ("Vanilla MLP 3 VBLoRA", "MLP", VBLoRAConfig, {"target_modules": ["lin1"], "vector_length": 2, "num_vectors": 5}), + ( + "Vanilla MLP 4 VBLoRA", + "MLP", + VBLoRAConfig, + {"target_modules": ["lin0", "lin1"], "vector_length": 1, "num_vectors": 5}, + ), + ( + "Vanilla MLP 5 VBLoRA", + "MLP", + VBLoRAConfig, + {"target_modules": ["lin0"], "modules_to_save": ["lin1"], "vector_length": 1, "num_vectors": 5}, + ), + ( + "Embedding + transformers Conv1D 1 VBLoRA", + "EmbConv1D", + VBLoRAConfig, + {"target_modules": ["conv1d"], "vector_length": 1, "num_vectors": 2}, + ), + ################### + # TrainableTokens # + ################### + ( + "Embedding + transformers Conv1D 1 trainable_tokens", + "EmbConv1D", + 
TrainableTokensConfig, + {"target_modules": ["emb"], "token_indices": [0, 1, 3], "init_weights": False}, + ), + ############ + # RandLora # + ############ + # We have to reduce the default scaling parameter to avoid nans when using large learning rates + ("Vanilla MLP 1 RandLora", "MLP", RandLoraConfig, {"target_modules": "lin0", "randlora_alpha": 1}), + ("Vanilla MLP 2 RandLora", "MLP", RandLoraConfig, {"target_modules": ["lin0"], "randlora_alpha": 1}), + ("Vanilla MLP 3 RandLora", "MLP", RandLoraConfig, {"target_modules": ["lin1"], "randlora_alpha": 1}), + ("Vanilla MLP 4 RandLora", "MLP", RandLoraConfig, {"target_modules": ["lin0", "lin1"], "randlora_alpha": 1}), + ( + "Vanilla MLP 5 RandLora", + "MLP", + RandLoraConfig, + {"target_modules": ["lin0", "lin1"], "sparse": True, "randlora_alpha": 1}, + ), + ( + "Vanilla MLP 6 RandLora", + "MLP", + RandLoraConfig, + {"target_modules": ["lin0", "lin1"], "very_sparse": True, "randlora_alpha": 1}, + ), + ( + "Vanilla MLP 7 RandLora", + "MLP", + RandLoraConfig, + {"target_modules": ["lin0"], "modules_to_save": ["lin1"], "randlora_alpha": 1}, + ), + ####### + # C3A # + ####### + # note: C3A is not initialized as an identity transform by default, hence set init_weights=True + ("Vanilla MLP 1 C3A", "MLP", C3AConfig, {"block_size": 2, "target_modules": "lin0", "init_weights": True}), + ("Vanilla MLP 2 C3A", "MLP", C3AConfig, {"block_size": 2, "target_modules": ["lin0"], "init_weights": True}), + ("Vanilla MLP 3 C3A", "MLP", C3AConfig, {"block_size": 2, "target_modules": ["lin1"], "init_weights": True}), + ( + "Vanilla MLP 5 C3A", + "MLP", + C3AConfig, + {"block_size": 10, "target_modules": ["lin0"], "modules_to_save": ["lin1"], "init_weights": True}, + ), + ( + "Vanilla MLP 6 C3A", + "MLP", + C3AConfig, + {"block_size": 10, "target_modules": ["lin0", "lin1"], "modules_to_save": ["lin1"], "init_weights": True}, + ), + ( + "Vanilla MLP 7 C3A", + "MLP", + C3AConfig, + { + "block_size_pattern": {"lin0": 5, "lin1": 10}, + 
"target_modules": ["lin0", "lin1"], + "modules_to_save": ["lin1"], + "init_weights": True, + }, + ), + ########## + # WaveFT # + ########## + ("Vanilla MLP 1 WaveFT", "MLP", WaveFTConfig, {"target_modules": "lin0", "n_frequency": 8}), + ("Vanilla MLP 2 WaveFT", "MLP", WaveFTConfig, {"target_modules": ["lin0"], "n_frequency": 8}), + ("Vanilla MLP 3 WaveFT", "MLP", WaveFTConfig, {"target_modules": ["lin1"], "n_frequency": 8}), + ("Vanilla MLP 4 WaveFT", "MLP", WaveFTConfig, {"target_modules": ["lin0", "lin1"], "n_frequency": 8}), + ( + "Vanilla MLP 5 WaveFT", + "MLP", + WaveFTConfig, + {"target_modules": ["lin0"], "modules_to_save": ["lin1"], "n_frequency": 8}, + ), + ( + "Vanilla MLP 6 WaveFT", + "MLP", + WaveFTConfig, + { + "target_modules": ["lin0"], + "n_frequency": 8, + "scaling": 25.0, + "wavelet_family": "db1", + }, + ), + ("Vanilla MLP 7 WaveFT", "MLP", WaveFTConfig, {"target_modules": "lin1", "n_frequency": 8, "use_idwt": False}), + ( + "Vanilla MLP 8 WaveFT", + "MLP", + WaveFTConfig, + {"target_modules": "lin0", "n_frequency": 16, "wavelet_family": "sym2"}, + ), + ( + "Vanilla MLP 9 WaveFT", + "MLP", + WaveFTConfig, + {"target_modules": "lin0", "n_frequency": 16, "wavelet_family": "sym2", "use_idwt": False}, + ), + ( + "Vanilla MLP 10 WaveFT", + "MLP", + WaveFTConfig, + {"target_modules": "lin0", "n_frequency": 16, "wavelet_family": "db1", "proportional_parameters": True}, + ), + ######## + # RoAd # + ######## + ("Vanilla MLP 1 RoAd", "MLP", RoadConfig, {"target_modules": "lin0", "group_size": 2}), + ("Vanilla MLP 2 RoAd", "MLP", RoadConfig, {"target_modules": ["lin0"], "group_size": 2}), + ("Vanilla MLP 3 RoAd", "MLP", RoadConfig, {"target_modules": ["lin1"], "group_size": 2}), + ("Vanilla MLP 4 RoAd", "MLP", RoadConfig, {"target_modules": ["lin0", "lin1"], "group_size": 2}), + ("Vanilla MLP 5 RoAd", "MLP", RoadConfig, {"target_modules": ["lin0"], "variant": "road_2", "group_size": 2}), + ("Vanilla MLP 6 RoAd", "MLP", RoadConfig, {"target_modules": 
["lin0"], "variant": "road_4", "group_size": 2}), + ########## + # WaveFT # + ########## + ("Vanilla MLP 1 WaveFT", "MLP", WaveFTConfig, {"target_modules": "lin0", "n_frequency": 8}), + ("Vanilla MLP 2 WaveFT", "MLP", WaveFTConfig, {"target_modules": ["lin0"], "n_frequency": 8}), + ("Vanilla MLP 3 WaveFT", "MLP", WaveFTConfig, {"target_modules": ["lin1"], "n_frequency": 8}), + ("Vanilla MLP 4 WaveFT", "MLP", WaveFTConfig, {"target_modules": ["lin0", "lin1"], "n_frequency": 8}), + ( + "Vanilla MLP 5 WaveFT", + "MLP", + WaveFTConfig, + {"target_modules": ["lin0"], "modules_to_save": ["lin1"], "n_frequency": 8}, + ), + ( + "Vanilla MLP 6 WaveFT", + "MLP", + WaveFTConfig, + { + "target_modules": ["lin0"], + "n_frequency": 8, + "scaling": 25.0, + "wavelet_family": "db1", + }, + ), + ("Vanilla MLP 7 WaveFT", "MLP", WaveFTConfig, {"target_modules": "lin1", "n_frequency": 8, "use_idwt": False}), + ( + "Vanilla MLP 8 WaveFT", + "MLP", + WaveFTConfig, + {"target_modules": "lin0", "n_frequency": 16, "wavelet_family": "sym2"}, + ), + ( + "Vanilla MLP 9 WaveFT", + "MLP", + WaveFTConfig, + {"target_modules": "lin0", "n_frequency": 16, "wavelet_family": "sym2", "use_idwt": False}, + ), + ( + "Vanilla MLP 10 WaveFT", + "MLP", + WaveFTConfig, + {"target_modules": "lin0", "n_frequency": 16, "wavelet_family": "db1", "proportional_parameters": True}, + ), +] +ALL_PEFT_CONFIG_CLASSES = sorted({row[2] for row in TEST_CASES}, key=lambda cls: cls.__name__) + +# For this test matrix, each tuple consists of: +# - test name +# - tuner method +# - config_cls +# - 1st config kwargs +# - 2nd config kwargs +# The model used for this test is `MLP`, which uses linear layers `lin0` and `lin1` +MULTIPLE_ACTIVE_ADAPTERS_TEST_CASES = [ + ( + "LoRA Same", + "lora", + LoraConfig, + {"target_modules": ["lin0"], "init_lora_weights": False}, + {"target_modules": ["lin0"], "init_lora_weights": False}, + ), + ( + "LoRA Different", + "lora", + LoraConfig, + {"target_modules": ["lin0"], "init_lora_weights": 
False}, + {"target_modules": ["lin1"], "init_lora_weights": False}, + ), + ( + "LoRA + trainable tokens Same", + "lora+trainable_tokens", + LoraConfig, + {"target_modules": ["lin0"], "init_lora_weights": False, "trainable_token_indices": {"emb": [0, 1, 2]}}, + {"target_modules": ["lin0"], "init_lora_weights": False, "trainable_token_indices": {"emb": [3, 4, 5, 6]}}, + ), + ( + "LoRA + trainable tokens Different", + "lora+trainable_tokens", + LoraConfig, + {"target_modules": ["lin0"], "init_lora_weights": False, "trainable_token_indices": {"emb": [0, 1, 2]}}, + {"target_modules": ["lin1"], "init_lora_weights": False, "trainable_token_indices": {"emb": [3, 4, 5, 6]}}, + ), + ( + "LoRA targeting nn.Parameter Same", + "lora", + LoraConfig, + {"target_parameters": ["lin0.weight"], "init_lora_weights": False}, + {"target_parameters": ["lin0.weight"], "init_lora_weights": False}, + ), + ( + "LoRA targeting nn.Parameter Different", + "lora", + LoraConfig, + {"target_parameters": ["lin0.weight"], "init_lora_weights": False}, + {"target_parameters": ["lin1.weight"], "init_lora_weights": False}, + ), + ( + "IA3 Same", + "ia3", + IA3Config, + { + "target_modules": ["lin0"], + "feedforward_modules": ["lin0"], + "init_ia3_weights": False, + }, + { + "target_modules": ["lin0"], + "feedforward_modules": ["lin0"], + "init_ia3_weights": False, + }, + ), + ( + "IA3 Different", + "ia3", + IA3Config, + { + "target_modules": ["lin0"], + "feedforward_modules": ["lin0"], + "init_ia3_weights": False, + }, + { + "target_modules": ["lin1"], + "feedforward_modules": ["lin1"], + "init_ia3_weights": False, + }, + ), + ( + "AdaLora Same", + "adalora", + AdaLoraConfig, + {"target_modules": ["lin0"], "init_lora_weights": False, "inference_mode": True, "total_step": 1}, + {"target_modules": ["lin0"], "init_lora_weights": False, "inference_mode": True, "total_step": 1}, + ), + ( + "AdaLora Different", + "adalora", + AdaLoraConfig, + {"target_modules": ["lin0"], "init_lora_weights": False, 
"inference_mode": True, "total_step": 1}, + {"target_modules": ["lin1"], "init_lora_weights": False, "inference_mode": True, "total_step": 1}, + ), + ( + "FourierFT Same", + "fourierft", + FourierFTConfig, + {"n_frequency": 10, "target_modules": ["lin0"]}, + {"n_frequency": 10, "target_modules": ["lin0"]}, + ), + ( + "FourierFT Different", + "fourierft", + FourierFTConfig, + {"n_frequency": 10, "target_modules": ["lin0"]}, + {"n_frequency": 10, "target_modules": ["lin1"]}, + ), + ( + "SHiRA Same", + "shira", + ShiraConfig, + {"r": 1, "target_modules": ["lin0"], "init_weights": False}, + {"r": 1, "target_modules": ["lin0"], "init_weights": False}, + ), + ( + "SHiRA Different", + "shira", + ShiraConfig, + {"r": 1, "target_modules": ["lin0"], "init_weights": False}, + {"r": 1, "target_modules": ["lin1"], "init_weights": False}, + ), + # Note: Currently, we cannot target lin0 and lin1 with different adapters when using VeRA. The reason is that the + # first adapter being created will result in a vera_A or vera_B shape that is too small for the next adapter + # (remember that VeRA shares these parameters across all layers), which results in an error. + ( + "VeRA Same", + "vera", + VeraConfig, + {"target_modules": ["lin0"], "init_weights": False}, + {"target_modules": ["lin0"], "init_weights": False}, + ), + # Note: RandLora may present the same problem mentioned above for Vera. 
+ ( + "RandLora Same", + "randlora", + RandLoraConfig, + {"target_modules": ["lin0"], "init_weights": False}, + {"target_modules": ["lin0"], "init_weights": False}, + ), + ( + "HRA Same", + "hra", + HRAConfig, + {"target_modules": ["lin0"], "init_weights": False}, + {"target_modules": ["lin0"], "init_weights": False}, + ), + ( + "HRA Different", + "hra", + HRAConfig, + {"target_modules": ["lin0"], "init_weights": False}, + {"target_modules": ["lin1"], "init_weights": False}, + ), + ( + "Bone Same", + "bone", + BoneConfig, + {"target_modules": ["lin0"], "init_weights": False, "r": 2}, + {"target_modules": ["lin0"], "init_weights": False, "r": 2}, + ), + ( + "Bone Different", + "bone", + BoneConfig, + {"target_modules": ["lin0"], "init_weights": False, "r": 2}, + {"target_modules": ["lin1"], "init_weights": False, "r": 2}, + ), + ( + "MiSS Same", + "miss", + MissConfig, + {"target_modules": ["lin0"], "init_weights": False, "r": 2}, + {"target_modules": ["lin0"], "init_weights": False, "r": 2}, + ), + ( + "MiSS Different", + "miss", + MissConfig, + {"target_modules": ["lin0"], "init_weights": False, "r": 2}, + {"target_modules": ["lin1"], "init_weights": False, "r": 2}, + ), + # Not testing "mini" initialization targeting the same layer, because the matrix is initialized to all zeros in MiSS-mini mode. 
+ ( + "VBLoRA Same", + "vblora", + VBLoRAConfig, + {"target_modules": ["lin0"], "vector_length": 2, "init_vector_bank_bound": 0.1}, + {"target_modules": ["lin0"], "vector_length": 2, "init_vector_bank_bound": 0.1}, + ), + ( + "VBLoRA Different", + "vblora", + VBLoRAConfig, + {"target_modules": ["lin0"], "vector_length": 2, "init_vector_bank_bound": 0.1}, + {"target_modules": ["lin1"], "vector_length": 2, "init_vector_bank_bound": 0.1}, + ), + ( + "BOFT Same", + "boft", + BOFTConfig, + {"target_modules": ["lin0"], "init_weights": False, "boft_block_size": 2}, + {"target_modules": ["lin0"], "init_weights": False, "boft_block_size": 2}, + ), + ( + "BOFT Different", + "boft", + BOFTConfig, + {"target_modules": ["lin0"], "init_weights": False, "boft_block_size": 2}, + {"target_modules": ["lin1"], "init_weights": False, "boft_block_size": 2}, + ), + ( + "WaveFT Same", + "waveft", + WaveFTConfig, + {"target_modules": ["lin0"], "init_weights": False, "n_frequency": 8}, + {"target_modules": ["lin0"], "init_weights": False, "n_frequency": 8}, + ), + ( + "WaveFT Different", + "waveft", + WaveFTConfig, + {"target_modules": ["lin0"], "init_weights": False, "n_frequency": 8}, + {"target_modules": ["lin1"], "init_weights": False, "n_frequency": 8}, + ), + ( + "RoAd Same", + "road", + RoadConfig, + {"target_modules": ["lin0"], "init_weights": False, "group_size": 2}, + {"target_modules": ["lin0"], "init_weights": False, "group_size": 2}, + ), + ( + "RoAd Different", + "road", + RoadConfig, + {"target_modules": ["lin0"], "init_weights": False, "group_size": 2}, + {"target_modules": ["lin1"], "init_weights": False, "group_size": 2}, + ), + ( + "RoAd 2 Different", + "road", + RoadConfig, + {"target_modules": ["lin0"], "init_weights": False, "variant": "road_1", "group_size": 2}, + {"target_modules": ["lin1"], "init_weights": False, "variant": "road_2", "group_size": 2}, + ), + ( + "RoAd 4 Different", + "road", + RoadConfig, + {"target_modules": ["lin0"], "init_weights": False, 
"variant": "road_1", "group_size": 2}, + {"target_modules": ["lin1"], "init_weights": False, "variant": "road_4", "group_size": 2}, + ), + ( + "WaveFT Same", + "waveft", + WaveFTConfig, + {"target_modules": ["lin0"], "init_weights": False, "n_frequency": 8}, + {"target_modules": ["lin0"], "init_weights": False, "n_frequency": 8}, + ), + ( + "WaveFT Different", + "waveft", + WaveFTConfig, + {"target_modules": ["lin0"], "init_weights": False, "n_frequency": 8}, + {"target_modules": ["lin1"], "init_weights": False, "n_frequency": 8}, + ), +] + +PREFIXES = { + IA3Config: "ia3_", + LoraConfig: "lora_", + LoHaConfig: "hada_", + LoKrConfig: "lokr_", + OFTConfig: "oft_", + BOFTConfig: "boft_", + LNTuningConfig: "ln_tuning_", + VeraConfig: "vera_lambda_", + RandLoraConfig: "randlora_", + FourierFTConfig: "fourierft_", + C3AConfig: "c3a_", + HRAConfig: "hra_", + ShiraConfig: "shira_", + VBLoRAConfig: "vblora_", + BoneConfig: "bone_", + RoadConfig: "road_", + MissConfig: "miss_", + TrainableTokensConfig: "trainable_tokens_", + WaveFTConfig: "waveft_", +} + + +def _skip_tests_with_multiple_adapters_with_target_parameters(config_cls, config_kwargs): + if (config_cls == LoraConfig) and config_kwargs.get("target_parameters"): + pytest.skip("LoRA with multiple adapters with target_parameters is not supported") + + +class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.relu = nn.ReLU() + self.drop = nn.Dropout(0.5) + self.lin1 = nn.Linear(20, 2, bias=bias) + self.sm = nn.LogSoftmax(dim=-1) + self.dtype = torch.float + + def forward(self, X): + X = X.to(self.dtype) + X = self.lin0(X) + X = self.relu(X) + X = self.drop(X) + X = self.lin1(X) + X = self.sm(X) + return X + + +class MLPWithGRU(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.relu = nn.ReLU() + self.drop = nn.Dropout(0.5) + self.gru = nn.GRU(input_size=20, hidden_size=20, num_layers=1, 
class Block(nn.Module):
    """Feature-size preserving block: Linear -> ReLU -> Dropout(0.5) -> Linear.

    Args:
        bias: Whether both linear layers carry a bias term.
        size: Width of the input, hidden and output features.
    """

    def __init__(self, bias=True, size=10):
        super().__init__()
        self.lin0 = nn.Linear(size, size, bias=bias)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(0.5)
        self.lin1 = nn.Linear(size, size, bias=bias)

    def forward(self, X):
        # inputs may arrive as integer tensors, so cast before the first matmul
        X = X.float()
        X = self.lin0(X)
        X = self.relu(X)
        X = self.drop(X)
        X = self.lin1(X)
        return X


class DeepMLP(nn.Module):
    """Stack of ``Block`` modules followed by a 2-class log-softmax head.

    Args:
        bias: Whether the linear layers carry a bias term.
        num_hidden_layers: Number of ``Block`` modules in ``self.layers``.
        size: Feature width of the input and of every block. The output head is sized from this value too, so
            widths other than the default of 10 work.
    """

    def __init__(self, bias=True, num_hidden_layers=12, size=10):
        super().__init__()
        self.layers = nn.ModuleList([Block(bias=bias, size=size) for _ in range(num_hidden_layers)])
        # the head must accept the block width; it used to be hard-coded to 10 regardless of `size`
        self.out = nn.Linear(size, 2, bias=bias)
        self.sm = nn.LogSoftmax(dim=-1)

    def forward(self, X):
        # `Tensor.float()` takes no tensor argument; the previous `X.float(X)` raised a TypeError on every call
        X = X.float()
        for layer in self.layers:
            X = layer(X)
        X = self.out(X)
        X = self.sm(X)
        return X
class ModelConv2D(nn.Module):
    """Tiny Conv2d classifier: Conv2d(5 -> 10, kernel 3) -> ReLU -> Flatten -> Linear(10 -> 2) -> LogSoftmax.

    The input is cast to ``self.dtype`` and reshaped to ``(-1, 5, 3, 3)``, so any tensor whose element count is a
    multiple of 45 is accepted.

    Args:
        bias: Whether the convolution carries a bias term.
    """

    def __init__(self, bias=True):
        super().__init__()
        self.conv2d = nn.Conv2d(in_channels=5, out_channels=10, kernel_size=3, bias=bias)
        self.relu = nn.ReLU()
        self.flat = nn.Flatten()
        self.lin0 = nn.Linear(in_features=10, out_features=2)
        self.sm = nn.LogSoftmax(dim=-1)
        self.dtype = torch.float

    def forward(self, X):
        images = X.to(self.dtype).reshape(-1, 5, 3, 3)
        # a 3x3 kernel on a 3x3 image leaves a single spatial position, hence 10 features after flattening
        features = self.flat(self.relu(self.conv2d(images)))
        return self.sm(self.lin0(features))
class ModelConv3D(nn.Module):
    """Tiny Conv3d classifier: Conv3d(5 -> 10, kernel 3) -> ReLU -> Flatten -> Linear(10 -> 2) -> LogSoftmax.

    2D inputs are turned into volumes by stacking three copies along a new last axis; every input is then
    reshaped to ``(-1, 5, 3, 3, 3)``.
    """

    def __init__(self):
        super().__init__()
        self.conv3d = nn.Conv3d(in_channels=5, out_channels=10, kernel_size=3)
        self.relu = nn.ReLU()
        self.flat = nn.Flatten()
        self.lin0 = nn.Linear(in_features=10, out_features=2)
        self.sm = nn.LogSoftmax(dim=-1)
        self.dtype = torch.float

    def forward(self, X):
        volume = X.to(self.dtype)
        if volume.dim() == 2:
            # 2D image batch -> 3D volume by repeating the data along a new depth axis
            volume = torch.stack((volume, volume, volume), dim=-1)
        volume = volume.reshape(-1, 5, 3, 3, 3)
        hidden = self.flat(self.relu(self.conv3d(volume)))
        return self.sm(self.lin0(hidden))
self.sm = nn.LogSoftmax(dim=-1) + self.dtype = torch.float + + def forward(self, X): + X = X.to(self.dtype) + X, _ = self.mha(X, X, X) + X = self.lin0(X) + X = self.sm(X) + return X + + +class _LinearUsingParameter(nn.Module): + # Linear layer equivalent + def __init__(self, in_features, out_features, bias=None): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.weight = nn.Parameter(torch.randn(in_features, out_features)) + if bias: + self.bias = nn.Parameter(torch.ones(out_features)) + + def forward(self, x): + return x @ self.weight + self.bias + + +class MlpUsingParameters(nn.Module): + # MLP that uses layers whose parameters need to be targeted with target_parameters + def __init__(self, bias=True): + super().__init__() + + self.lin0 = _LinearUsingParameter(10, 20, bias=bias) + self.relu = nn.ReLU() + self.drop = nn.Dropout(0.5) + self.lin1 = _LinearUsingParameter(20, 2, bias=bias) + self.sm = nn.LogSoftmax(dim=-1) + self.dtype = torch.float + + def forward(self, X): + X = X.to(self.dtype) + X = self.lin0(X) + X = self.relu(X) + X = self.drop(X) + X = self.lin1(X) + X = self.sm(X) + return X + + +class MockTransformerWrapper: + """Mock class to behave like a transformers model. + + This is needed because the tests initialize the model by calling transformers_class.from_pretrained. 
+ + """ + + @classmethod + def from_pretrained(cls, model_id, torch_dtype=None): + # set the seed so that from_pretrained always returns the same model + torch.manual_seed(0) + + if torch_dtype is None: + torch_dtype = torch.float32 + + if model_id == "MLP": + return MLP().to(torch_dtype) + + if model_id == "EmbConv1D": + return ModelEmbConv1D().to(torch_dtype) + + if model_id == "Conv1d": + return ModelConv1D().to(torch_dtype) + + if model_id == "Conv1dBigger": + return ModelConv1DBigger().to(torch_dtype) + + if model_id == "Conv2d": + return ModelConv2D().to(torch_dtype) + + if model_id == "Conv2d1x1": + return ModelConv2D1x1().to(torch_dtype) + + if model_id == "Conv1dKernel1": + return ModelConv1DKernel1().to(torch_dtype) + + if model_id == "Conv2dGroups": + return ModelConv2DGroups().to(torch_dtype) + + if model_id == "Conv2dGroups2": + return ModelConv2DGroups2().to(torch_dtype) + + if model_id == "Conv3d": + return ModelConv3D().to(torch_dtype) + + if model_id == "MLP_LayerNorm": + return MLP_LayerNorm().to(torch_dtype) + + if model_id == "MLP2": + return MLP2().to(torch_dtype) + + if model_id == "Conv2d2": + return ModelConv2D2().to(torch_dtype) + + if model_id == "MHA": + return ModelMha().to(torch_dtype) + + if model_id == "MlpUsingParameters": + return MlpUsingParameters().to(torch_dtype) + + raise ValueError(f"model_id {model_id} not implemented") + + +class TestPeftCustomModel(PeftCommonTester): + """ + Implements the tests for custom models. + + Most tests should just call the parent class, e.g. test_save_pretrained calls self._test_save_pretrained. Override + this if custom models don't work with the parent test method. 
+ + """ + + transformers_class = MockTransformerWrapper + + def prepare_inputs_for_testing(self): + X = torch.arange(90).view(9, 10).to(self.torch_device) + return {"X": X} + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_attributes_parametrized(self, test_name, model_id, config_cls, config_kwargs): + self._test_model_attr(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_adapter_name(self, test_name, model_id, config_cls, config_kwargs): + self._test_adapter_name(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_prepare_for_training_parametrized(self, test_name, model_id, config_cls, config_kwargs): + # This test does not work with custom models because it assumes that + # there is always a method get_input_embeddings that returns a layer + # which does not need updates. Instead, a new test is added below that + # checks that LoRA works as expected. 
+ pass + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_save_pretrained(self, test_name, model_id, config_cls, config_kwargs): + self._test_save_pretrained(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_save_pretrained_pickle(self, test_name, model_id, config_cls, config_kwargs): + self._test_save_pretrained(model_id, config_cls, config_kwargs, safe_serialization=False) + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_load_model_low_cpu_mem_usage(self, test_name, model_id, config_cls, config_kwargs): + _skip_tests_with_multiple_adapters_with_target_parameters(config_cls, config_kwargs) + self._test_load_model_low_cpu_mem_usage(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_from_pretrained_config_construction(self, test_name, model_id, config_cls, config_kwargs): + self._test_from_pretrained_config_construction(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_load_multiple_adapters(self, test_name, model_id, config_cls, config_kwargs): + _skip_tests_with_multiple_adapters_with_target_parameters(config_cls, config_kwargs) + self._test_load_multiple_adapters(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_merge_layers(self, test_name, model_id, config_cls, config_kwargs): + # https://github.com/huggingface/peft/pull/2403 + if model_id in ["Conv2dGroups", "Conv2dGroups2"]: + pytest.skip( + f"Skipping test for {model_id} as merging is not supported. 
(See https://github.com/huggingface/peft/pull/2403 for details)" + ) + + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_merge_layers(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_merge_layers_fp16(self, test_name, model_id, config_cls, config_kwargs): + # https://github.com/huggingface/peft/pull/2403 + if model_id in ["Conv2dGroups", "Conv2dGroups2"]: + pytest.skip( + f"Skipping test for {model_id} as merging is not supported. (See https://github.com/huggingface/peft/pull/2403 for details)" + ) + + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_merge_layers_fp16(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_merge_layers_is_idempotent(self, test_name, model_id, config_cls, config_kwargs): + # calling merge twice with the same arguments should not change the output + + # https://github.com/huggingface/peft/pull/2403 + if model_id in ["Conv2dGroups", "Conv2dGroups2"]: + pytest.skip( + f"Skipping test for {model_id} as merging is not supported. (See https://github.com/huggingface/peft/pull/2403 for details)" + ) + + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_merge_layers_is_idempotent(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_safe_merge(self, test_name, model_id, config_cls, config_kwargs): + # https://github.com/huggingface/peft/pull/2403 + if model_id in ["Conv2dGroups", "Conv2dGroups2"]: + pytest.skip( + f"Skipping test for {model_id} as merging is not supported. 
@pytest.mark.parametrize("safe_merge", [False, True])
@pytest.mark.parametrize("module_type", ["linear", "conv2d"])
def test_merge_with_lora_bias_when_base_layer_has_no_bias_warns_and_raises(self, safe_merge, module_type):
    """The lora_B bias cannot be merged when the base layer has no bias: expect a warning, then an error."""
    if module_type not in ("linear", "conv2d"):
        raise ValueError(f"Wrong module_type passed, expected 'linear' or 'conv2d', got {module_type}")

    if module_type == "linear":
        base_model = MLP(bias=False)
        target_modules = ["lin0", "lin1"]
        expected_warning = "`lora_bias=True` was passed but the targeted layer of type Linear has no bias"
    else:
        base_model = ModelConv2D(bias=False)
        target_modules = ["conv2d"]
        expected_warning = "`lora_bias=True` was passed but the targeted layer of type Conv2d has no bias"
    lora_config = LoraConfig(target_modules=target_modules, lora_bias=True)

    # wrapping the bias-free model must already warn ...
    with pytest.warns(PeftWarning, match=re.escape(expected_warning)):
        peft_model = get_peft_model(base_model, lora_config)

    # ... and the merge itself must fail, for both the safe and the regular code path
    expected_error = "Impossible to merge LoRA with `lora_bias=True` because the base layer has no bias"
    with pytest.raises(RuntimeError, match=expected_error):
        peft_model.merge_adapter(safe_merge=safe_merge)
@pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES)
def test_in_features_out_features_exposed(self, test_name, model_id, config_cls, config_kwargs):
    """Every PEFT tuner layer should expose the ``in_features`` and ``out_features`` attributes."""
    base_model = self.transformers_class.from_pretrained(model_id).to(self.torch_device)
    peft_config = config_cls(base_model_name_or_path=model_id, **config_kwargs)
    peft_model = get_peft_model(base_model, peft_config)
    for layer in peft_model.modules():
        if not isinstance(layer, BaseTunerLayer):
            continue
        assert hasattr(layer, "in_features")
        assert hasattr(layer, "out_features")
@pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES)
def test_forward_bfloat16(self, test_name, model_id, config_cls, config_kwargs):
    # The user manually sets the dtype of the base model to bf16 precision. This should not cause an error for the
    # different PEFT methods. Only the absence of errors is checked, no output values are compared.
    try:
        torch.zeros(1, dtype=torch.bfloat16)
    except Exception:
        # skip this test if bfloat16 is not supported on this machine
        pytest.skip(reason="Test requires bfloat16 support")

    # skip on MacOS
    if platform.system() == "Darwin":
        pytest.skip(reason="MacOS does not support multiple ops in bfloat16")

    X = self.prepare_inputs_for_testing()
    model = self.transformers_class.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(self.torch_device)
    # the custom models that define `self.dtype` cast their input to it in forward, so keep it in sync with the
    # dtype of the weights
    model.dtype = torch.bfloat16
    config = config_cls(
        base_model_name_or_path=model_id,
        **config_kwargs,
    )
    model = get_peft_model(model, config)
    model.eval()

    # check that none of this raises an error
    model(**X)

    if model_id in ["Conv2dGroups", "Conv2dGroups2"]:
        # this model does not support merging
        return

    # run a forward pass after each merge/unmerge step, for the regular and the safe merge, and finally on the
    # unloaded base model
    model.merge_adapter(safe_merge=False)
    model(**X)
    model.unmerge_adapter()
    model(**X)
    model.merge_adapter(safe_merge=True)
    model(**X)
    model.unmerge_adapter()
    model(**X)
    model = model.merge_and_unload()
    model(**X)
@pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES)
def test_forward_bfloat16_no_autocast(self, test_name, model_id, config_cls, config_kwargs):
    # Same as test_forward_bfloat16 but don't autocast adapter weights to float32 automatically
    # (autocast_adapter_dtype=False). Only the absence of errors is checked.
    try:
        torch.zeros(1, dtype=torch.bfloat16)
    except Exception:
        # skip this test if bfloat16 is not supported on this machine
        pytest.skip(reason="Test requires bfloat16 support")

    # skip on MacOS
    if platform.system() == "Darwin":
        pytest.skip(reason="MacOS does not support multiple ops in bfloat16")

    X = self.prepare_inputs_for_testing()
    model = self.transformers_class.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(self.torch_device)
    # the custom models that define `self.dtype` cast their input to it in forward, so keep it in sync with the
    # dtype of the weights
    model.dtype = torch.bfloat16
    config = config_cls(
        base_model_name_or_path=model_id,
        **config_kwargs,
    )
    model = get_peft_model(model, config, autocast_adapter_dtype=False)
    model.eval()

    # check that none of this raises an error
    model(**X)

    if model_id in ["Conv2dGroups", "Conv2dGroups2"]:
        # this model does not support merging
        return

    # run a forward pass after each merge/unmerge step, for the regular and the safe merge, and finally on the
    # unloaded base model
    model.merge_adapter(safe_merge=False)
    model(**X)
    model.unmerge_adapter()
    model(**X)
    model.merge_adapter(safe_merge=True)
    model(**X)
    model.unmerge_adapter()
    model(**X)
    model = model.merge_and_unload()
    model(**X)
test that when using an adapter on a custom model, only the adapter parameters are updated during + # training + X = self.prepare_inputs_for_testing() + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + model_before = copy.deepcopy(model) + + model.train() + lr = 0.5 + if (config_kwargs.get("use_dora") and model_id == "EmbConv1D") or issubclass(config_cls, VBLoRAConfig): + # this high learning rate was found through testing to be necessary to avoid flakiness + lr = 100 + elif "mha" in model_id.lower(): + # we get exploding gradients with MHA when learning rate is too high + lr = 1e-3 + optimizer = torch.optim.SGD(model.parameters(), lr=lr) + + # train at least 3 steps for all parameters to be updated (probably this is required because of symmetry + # breaking of some LoRA layers that are initialized with constants) + for _ in range(3): + optimizer.zero_grad() + y_pred = model(**X) + loss = y_pred.sum() + loss.backward() + optimizer.step() + + tol = 1e-4 + params_before = dict(model_before.named_parameters()) + params_after = dict(model.named_parameters()) + assert params_before.keys() == params_after.keys() + + prefix = PREFIXES[config_cls] + for name, param_before in params_before.items(): + param_after = params_after[name] + if (prefix in name) or ("modules_to_save" in name) or ("token_adapter.trainable_tokens" in name): + # target_modules, modules_to_save and modules of `NewTokensWrapper` _are_ updated + assert not torch.allclose(param_before, param_after, atol=tol, rtol=tol) + else: + assert torch.allclose(param_before, param_after, atol=tol, rtol=tol) + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_parameters_after_loading_model(self, test_name, model_id, config_cls, config_kwargs): + # An explicit test that when loading a trained model, the parameters are 
@pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES)
def test_disable_adapters(self, test_name, model_id, config_cls, config_kwargs):
    # Test that it's possible to disable the adapter, in which case the model output should be identical to that of
    # the base model.
    #
    # Outline: (1) record the base model output, (2) wrap with PEFT and check the untrained adapter is a no-op,
    # (3) train for a few steps so the adapter changes the output, (4) check that `disable_adapter` restores the
    # pre-training output and that the adapter is active again after leaving the context.
    X = self.prepare_inputs_for_testing()
    model = self.transformers_class.from_pretrained(model_id).to(self.torch_device).eval()
    outputs_base = model(**X)

    if issubclass(config_cls, (TrainableTokensConfig,)):
        # copy first so that the shared parametrized kwargs are not mutated
        config_kwargs = config_kwargs.copy()
        # override the default value and make PEFT operation a no-op
        config_kwargs["init_weights"] = True
    config = config_cls(
        base_model_name_or_path=model_id,
        **config_kwargs,
    )
    model = get_peft_model(model, config)
    if issubclass(config_cls, VBLoRAConfig):
        # Manually set the `vblora_vector_bank` to zero so that VB-LoRA functions as an identity operation.
        torch.nn.init.zeros_(model.vblora_vector_bank["default"])
    model.eval()
    outputs_before = model(**X)
    # sanity check: the freshly initialized adapter must not change the output of the base model
    assert torch.allclose(outputs_base, outputs_before)

    if issubclass(config_cls, VBLoRAConfig):
        # initialize `vblora_vector_bank` so it can be trained
        model._init_vblora_vector_bank(config, "default")
    model.train()
    # EmbConv1D is slow to learn for some reason
    lr = 0.01 if model_id != "EmbConv1D" else 1.0
    if isinstance(config, TrainableTokensConfig):
        # TrainableTokens is only changing a small subset, so we need a higher lr to see the difference
        lr = 2.0
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    # train at least 3 steps for all parameters to be updated (probably this is required because of symmetry
    # breaking of some LoRA layers that are initialized with constants)
    for _ in range(3):
        optimizer.zero_grad()
        y_pred = model(**X)
        # alternating 0/1 class labels, one per output row
        y = torch.arange(len(y_pred)).to(self.torch_device) % 2
        loss = nn.functional.nll_loss(y_pred, y)
        loss.backward()
        optimizer.step()

    model.eval()
    outputs_after = model(**X)

    with model.disable_adapter():
        outputs_disabled = model(**X)

    # check that after leaving the disable_adapter context, everything is enabled again
    outputs_enabled_after_disable = model(**X)

    if self.torch_device == "cpu":
        # LayerNorm is running in float32 on cpu, so the differences in the outputs are smaller
        rtol, atol = 1e-8, 1e-8
    else:
        rtol, atol = 1e-5, 1e-8
    # training must have changed the output, disabling must undo that change, re-enabling must restore it
    assert not torch.allclose(outputs_before, outputs_after, rtol=rtol, atol=atol)
    assert torch.allclose(outputs_before, outputs_disabled)
    assert torch.allclose(outputs_after, outputs_enabled_after_disable)
+ torch.nn.init.zeros_(model.vblora_vector_bank["default"]) + model.eval() + outputs_before = model(**X) + + if issubclass(config_cls, VBLoRAConfig): + # initialize `vblora_vector_bank` so it can be trained + model._init_vblora_vector_bank(config, "default") + model.train() + if issubclass(config_cls, LNTuningConfig): + # LayerNorm tuning is slow to learn + lr = 1.0 + optimizer = torch.optim.SGD(model.parameters(), lr=lr) + else: + # Adam optimizer since SGD isn't great for small models with IA3 + Conv1D + lr = 0.01 + optimizer = torch.optim.Adam(model.parameters(), lr=lr) + + # train at least 3 steps for all parameters to be updated (probably this is required because of symmetry + # breaking of some LoRA layers that are initialized with constants) + for _ in range(3): + optimizer.zero_grad() + y_pred = model(**X) + y = torch.arange(len(y_pred)).to(self.torch_device) % 2 + loss = nn.functional.nll_loss(y_pred, y) + loss.backward() + optimizer.step() + + model.eval() + outputs_unmerged = model(**X) + model.merge_adapter() + outputs_after = model(**X) + + with model.disable_adapter(): + outputs_disabled = model(**X) + + # check that after leaving the disable_adapter context, everything is enabled again + outputs_enabled_after_disable = model(**X) + + atol, rtol = 1e-5, 1e-5 # tolerances higher than defaults since merging introduces some numerical instability + + conv_ids = ["Conv2d", "Conv3d", "Conv2d2"] + if issubclass(config_cls, (IA3Config, LoraConfig)) and model_id in conv_ids: # more instability with Conv + atol, rtol = 1e-3, 1e-3 + + if issubclass(config_cls, OFTConfig): + atol, rtol = 1e-4, 1e-4 + + if config_kwargs.get("use_dora") and model_id == "EmbConv1D": + atol, rtol = 1e-4, 1e-4 + + # check that there is a difference in results after training + assert not torch.allclose(outputs_before, outputs_after, atol=atol, rtol=rtol) + + if self.torch_device in ["mlu"] and model_id in conv_ids: + atol, rtol = 1e-3, 1e-2 # MLU + + # unmerged or merged should make no
difference + assert torch.allclose(outputs_after, outputs_unmerged, atol=atol, rtol=rtol) + + # check that disabling adapters gives the same results as before training + assert torch.allclose(outputs_before, outputs_disabled, atol=atol, rtol=rtol) + + # check that enabling + disabling adapters does not change the results + assert torch.allclose(outputs_after, outputs_enabled_after_disable, atol=atol, rtol=rtol) + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_disable_adapter_with_bias_warns(self, test_name, model_id, config_cls, config_kwargs): + # When training biases in lora, disabling adapters does not reset the biases, so the output is not what users + # might expect. Therefore, a warning should be given. + + # Note: We test only with custom models since they run really fast. There is really no point in testing the same + # thing with decoder, encoder_decoder, etc. + if config_cls != LoraConfig or config_cls != BOFTConfig: + # skip this test for other configs as bias is specific to Lora + pytest.skip("Testing bias warnings only for LoraConfig or BOFTConfig") + if not issubclass(config_cls, (LoraConfig, BOFTConfig)): + pytest.skip("Bias argument is only supported for LoRA or BOFT models") + + def run_with_disable(config_kwargs, bias): + config_kwargs = config_kwargs.copy() + config_kwargs["bias"] = bias + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + peft_model = get_peft_model(model, config) + with peft_model.disable_adapter(): + pass # there is nothing to be done + + if config_cls == LoraConfig: + # check that bias=all and bias=lora_only give a warning with the correct message + msg_start = "Careful, disabling adapter layers with bias configured to be" + with pytest.warns(UserWarning, match=msg_start): + run_with_disable(config_kwargs, bias="lora_only") + with pytest.warns(UserWarning, 
match=msg_start): + run_with_disable(config_kwargs, bias="all") + + if config_cls == BOFTConfig: + # check that bias=all and bias=boft_only give a warning with the correct message + msg_start = "Careful, disabling adapter layers with bias configured to be" + with pytest.warns(UserWarning, match=msg_start): + run_with_disable(config_kwargs, bias="boft_only") + with pytest.warns(UserWarning, match=msg_start): + run_with_disable(config_kwargs, bias="all") + + # For bias=none, there is no warning. Unfortunately, AFAIK unittest has no option to assert that no warning is + # given, therefore, we check that the unittest gives us an AssertionError if we check for a warning + bias_warning_was_given = False + try: + with pytest.warns(UserWarning) as cm: + run_with_disable(config_kwargs, bias="none") + # if we get here, it means there was no AssertionError, i.e. there are warnings -- let's check that they + # are not related to the bias setting + if any(warning.message.args[0].startswith(msg_start) for warning in cm.warnings): + bias_warning_was_given = True + except AssertionError: + # This is good, there was an AssertionError, i.e. there was no warning + pass + if bias_warning_was_given: + # This is bad, there was a warning about the bias when there should not have been any. 
+ self.fail("There should be no warning when bias is set to 'none'") + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_active_adapter(self, test_name, model_id, config_cls, config_kwargs): + _skip_tests_with_multiple_adapters_with_target_parameters(config_cls, config_kwargs) + if config_kwargs.get("modules_to_save", []) or config_kwargs.get("trainable_token_indices", []): + pytest.skip("Multiple active adapters with modules_to_save/trainable_token_indices is not supported.") + + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + assert model.active_adapters == ["default"] + assert model.active_adapter == "default" + + # at this stage, "default" is still the activate adapter, "other" is disabled + model.add_adapter("other", config) + assert model.active_adapters == ["default"] + assert model.active_adapter == "default" + + # set "other" as the active adapter + model.set_adapter("other") + assert model.active_adapters == ["other"] + assert model.active_adapter == "other" + + # set both adapters as active + # Note: On the PeftModel, there cannot be multiple active adapters, so we have to go through model.base_model + # instead. + model.base_model.set_adapter(["default", "other"]) + # model.active_adapters works, as it delegates to the base_model + assert model.active_adapters == ["default", "other"] + # model.active_adapter would not work, thus we have to check the base_model directly + assert model.base_model.active_adapter == ["default", "other"] + + @pytest.mark.parametrize("config_cls", ALL_PEFT_CONFIG_CLASSES) + def test_set_adapter_non_overlapping_modules(self, config_cls): + # Ensure that when setting multiple adapters, the active adapters are correctly being set, even if + # target_modules that only overlap partially. 
Apart from checking model.set_adapter, also check + # model.base_model.set_adapter. Normally, users wouldn't call this, but there are situations where this is + # required, e.g. activating multiple adapters at once cannot be done on the PeftModel but works on LoraModel + # etc. + if config_cls == TrainableTokensConfig: + pytest.skip(reason="Model has no embedding layer, skipping TrainableTokensConfig.") + + model = DeepMLP(size=256) # a size that works with all adapters + extra_kwargs = {} + if config_cls == IA3Config: + extra_kwargs["feedforward_modules"] = [] + # target_modules overlap partially + config0 = config_cls(target_modules=["layers.0.lin0", "layers.1.lin0"], **extra_kwargs) + config1 = config_cls(target_modules=["layers.1.lin0", "layers.2.lin0"], **extra_kwargs) + model = get_peft_model(model, config0, adapter_name="default") + model.add_adapter("other", config1) + + # at this point, 'default' is active + assert model.base_model.active_adapters == ["default"] + # general note: for adapter layers like LoRA, the active_adapters can be "default" even if that layer has no + # adapter called "default", it will be simply ignored in that case + assert model.base_model.layers[0].lin0.active_adapters == ["default"] + assert model.base_model.layers[1].lin0.active_adapters == ["default"] + assert model.base_model.layers[2].lin0.active_adapters == ["default"] + + # activate 'other' + model.set_adapter("other") + assert model.base_model.active_adapters == ["other"] + assert model.base_model.layers[0].lin0.active_adapters == ["other"] + assert model.base_model.layers[1].lin0.active_adapters == ["other"] + assert model.base_model.layers[2].lin0.active_adapters == ["other"] + + # go back to 'default' + model.set_adapter("default") + assert model.base_model.active_adapters == ["default"] + assert model.base_model.layers[0].lin0.active_adapters == ["default"] + assert model.base_model.layers[1].lin0.active_adapters == ["default"] + assert 
model.base_model.layers[2].lin0.active_adapters == ["default"] + + # also ensure that model.base_model.set_adapter works as expected + # activate 'other' + model.base_model.set_adapter(["other"]) + assert model.base_model.active_adapters == ["other"] + assert model.base_model.layers[0].lin0.active_adapters == ["other"] + assert model.base_model.layers[1].lin0.active_adapters == ["other"] + assert model.base_model.layers[2].lin0.active_adapters == ["other"] + + # go back to 'default' + model.base_model.set_adapter(["default"]) + assert model.base_model.active_adapters == ["default"] + assert model.base_model.layers[0].lin0.active_adapters == ["default"] + assert model.base_model.layers[1].lin0.active_adapters == ["default"] + assert model.base_model.layers[2].lin0.active_adapters == ["default"] + + @pytest.mark.parametrize("config_cls", ALL_PEFT_CONFIG_CLASSES) + def test_set_adapter_non_overlapping_modules_to_save(self, config_cls): + # This is similar to the previous test, but includes modules_to_save. 
Specifically, there was a bug where adding + # config1 would automatically activate the modules_to_save for 'other' + if config_cls == TrainableTokensConfig: + pytest.skip(reason="Trainable tokens does not support modules_to_save") + + model = DeepMLP(size=256) # a size that works with all adapters + extra_kwargs = {} + if config_cls == IA3Config: + extra_kwargs["feedforward_modules"] = [] + # targeting the same modules with modules_to_save: + config0 = config_cls(target_modules=["layers.0.lin0"], **extra_kwargs) + config1 = config_cls(target_modules=["layers.0.lin0"], modules_to_save=["layers.0.lin1"], **extra_kwargs) + model = get_peft_model(model, config0, adapter_name="default") + model.add_adapter("other", config1) + + # at this point, 'default' is active + assert model.base_model.active_adapters == ["default"] + assert model.base_model.layers[0].lin0.active_adapters == ["default"] + assert model.base_model.layers[0].lin1.active_adapters == [] + assert model.base_model.layers[0].lin1.modules_to_save.other.weight.requires_grad is False + + # activate 'other' + model.set_adapter("other") + assert model.base_model.active_adapters == ["other"] + assert model.base_model.layers[0].lin0.active_adapters == ["other"] + assert model.base_model.layers[0].lin1.active_adapters == ["other"] + assert model.base_model.layers[0].lin1.modules_to_save.other.weight.requires_grad is True + + # go back to 'default' + model.set_adapter("default") + assert model.base_model.active_adapters == ["default"] + assert model.base_model.layers[0].lin0.active_adapters == ["default"] + assert model.base_model.layers[0].lin1.active_adapters == [] + assert model.base_model.layers[0].lin1.modules_to_save.other.weight.requires_grad is False + + # also ensure that model.base_model.set_adapter works as expected + # activate 'other' + model.base_model.set_adapter(["other"]) + assert model.base_model.active_adapters == ["other"] + assert model.base_model.layers[0].lin0.active_adapters == ["other"] + 
assert model.base_model.layers[0].lin1.active_adapters == ["other"] + assert model.base_model.layers[0].lin1.modules_to_save.other.weight.requires_grad is True + + # go back to 'default' + model.base_model.set_adapter(["default"]) + assert model.base_model.active_adapters == ["default"] + assert model.base_model.layers[0].lin0.active_adapters == ["default"] + assert model.base_model.layers[0].lin1.active_adapters == [] + assert model.base_model.layers[0].lin1.modules_to_save.other.weight.requires_grad is False + + def test_set_adapter_non_overlapping_trainable_token_indices(self): + # Same test as the previous one, but using trainable_token_indices instead of modules_to_save + model = ModelEmbConv1D() + # targeting the same modules with modules_to_save: + config0 = LoraConfig(target_modules=["lin0"]) + config1 = LoraConfig(target_modules=["lin0"], trainable_token_indices={"emb": [0]}) + + model = get_peft_model(model, config0, adapter_name="default") + model.add_adapter("other", config1) + + # at this point, 'default' is active + assert model.base_model.active_adapters == ["default"] + assert model.base_model.lin0.active_adapters == ["default"] + assert model.base_model.emb.active_adapters == [] + assert model.base_model.model.emb.token_adapter.trainable_tokens_delta.other.requires_grad is False + + # activate 'other' + model.set_adapter("other") + assert model.base_model.active_adapters == ["other"] + assert model.base_model.lin0.active_adapters == ["other"] + assert model.base_model.emb.active_adapters == ["other"] + assert model.base_model.model.emb.token_adapter.trainable_tokens_delta.other.requires_grad is True + + # go back to 'default' + model.set_adapter("default") + assert model.base_model.active_adapters == ["default"] + assert model.base_model.lin0.active_adapters == ["default"] + assert model.base_model.emb.active_adapters == [] + assert model.base_model.model.emb.token_adapter.trainable_tokens_delta.other.requires_grad is False + + # also ensure that 
model.base_model.set_adapter works as expected + # activate 'other' + model.base_model.set_adapter(["other"]) + assert model.base_model.active_adapters == ["other"] + assert model.base_model.lin0.active_adapters == ["other"] + assert model.base_model.emb.active_adapters == ["other"] + assert model.base_model.model.emb.token_adapter.trainable_tokens_delta.other.requires_grad is True + + # go back to 'default' + model.base_model.set_adapter(["default"]) + assert model.base_model.active_adapters == ["default"] + assert model.base_model.lin0.active_adapters == ["default"] + assert model.base_model.emb.active_adapters == [] + assert model.base_model.model.emb.token_adapter.trainable_tokens_delta.other.requires_grad is False + + @pytest.mark.parametrize("config_cls", ALL_PEFT_CONFIG_CLASSES) + def test_multiple_active_adapters_with_same_modules_to_save_raises(self, config_cls): + # When we have multiple adapters each with modules_to_save, we don't allow those to target the same layer, as + # module_to_save (unlike LoRA etc) is not additive. 
+ if config_cls == TrainableTokensConfig: + pytest.skip(reason="Trainable tokens does not support modules_to_save") + + model = DeepMLP(size=256) # a size that works with all adapters + extra_kwargs = {} + if config_cls == IA3Config: + extra_kwargs["feedforward_modules"] = [] + # targeting the same modules with modules_to_save: + config0 = config_cls(target_modules=["layers.0.lin0"], modules_to_save=["layers.0.lin1"], **extra_kwargs) + config1 = config_cls(target_modules=["layers.0.lin0"], modules_to_save=["layers.0.lin1"], **extra_kwargs) + model = get_peft_model(model, config0, adapter_name="default") + # adding the adapter is fine + model.add_adapter("other", config1) + + msg = "Only one adapter can be set at a time for ModulesToSaveWrapper" + with pytest.raises(ValueError, match=msg): + model.base_model.set_adapter(["default", "other"]) + + @pytest.mark.parametrize("config_cls", ALL_PEFT_CONFIG_CLASSES) + def test_multiple_active_adapters_with_overlapping_modules_to_save_raises(self, config_cls): + # same test as the previous one, but targeting multiple modules_to_save, some of which overlap + if config_cls == TrainableTokensConfig: + pytest.skip(reason="Trainable tokens does not support modules_to_save") + + model = DeepMLP(size=256) # a size that works with all adapters + extra_kwargs = {} + if config_cls == IA3Config: + extra_kwargs["feedforward_modules"] = [] + # targeting the overlapping modules with modules_to_save: + config0 = config_cls( + target_modules=["layers.0.lin0"], modules_to_save=["layers.0.lin1", "layers.1.lin1"], **extra_kwargs + ) + config1 = config_cls( + target_modules=["layers.0.lin0"], modules_to_save=["layers.2.lin1", "layers.1.lin1"], **extra_kwargs + ) + model = get_peft_model(model, config0, adapter_name="default") + # adding the adapter is fine + model.add_adapter("other", config1) + + msg = "Only one adapter can be set at a time for ModulesToSaveWrapper" + with pytest.raises(ValueError, match=msg): +
model.base_model.set_adapter(["default", "other"]) + + @pytest.mark.parametrize("config_cls", ALL_PEFT_CONFIG_CLASSES) + def test_multiple_active_adapters_with_different_modules_to_save_works(self, config_cls): + # same test as the previous one but targeting distinct modules_to_save; this is fine + if config_cls == TrainableTokensConfig: + pytest.skip(reason="Trainable tokens does not support modules_to_save") + + model = DeepMLP(size=256) # a size that works with all adapters + extra_kwargs = {} + if config_cls == IA3Config: + extra_kwargs["feedforward_modules"] = [] + # targeting the different modules with modules_to_save: + config0 = config_cls(target_modules=["layers.0.lin0"], modules_to_save=["layers.0.lin1"], **extra_kwargs) + config1 = config_cls(target_modules=["layers.0.lin0"], modules_to_save=["layers.1.lin1"], **extra_kwargs) + model = get_peft_model(model, config0, adapter_name="default") + # adding the adapter is fine + model.add_adapter("other", config1) + model.base_model.set_adapter(["default", "other"]) # does not raise + + assert model.base_model.model.layers[0].lin1.active_adapters == ["default"] + assert model.base_model.model.layers[1].lin1.active_adapters == ["other"] + + def test_multiple_active_adapters_with_same_trainable_token_indices_raises(self): + # Same test as test_multiple_active_adapters_with_same_modules_to_save_raises but with trainable_token_indices + # instead of modules_to_save. 
+ model = ModelEmbConv1D() + # targeting the same modules with modules_to_save: + config0 = LoraConfig(target_modules=["lin0"], trainable_token_indices={"emb": [0]}) + config1 = LoraConfig(target_modules=["lin0"], trainable_token_indices={"emb": [0]}) + model = get_peft_model(model, config0, adapter_name="default") + # adding the adapter is fine + model.add_adapter("other", config1) + + msg = "Only one adapter can be set at a time for TrainableTokensWrapper" + with pytest.raises(ValueError, match=msg): + model.base_model.set_adapter(["default", "other"]) + + def test_multiple_active_adapters_with_different_trainable_token_indices_works(self): + # Same test as the previous one but targeting different embedding layers should work + class MyModel(nn.Module): + def __init__(self): + super().__init__() + self.emb0 = nn.Embedding(10, 10) + self.emb1 = nn.Embedding(10, 10) + self.lin0 = nn.Linear(10, 10) + + model = MyModel() + # targeting the same modules with modules_to_save: + config0 = LoraConfig(target_modules=["lin0"], trainable_token_indices={"emb0": [0]}) + config1 = LoraConfig(target_modules=["lin0"], trainable_token_indices={"emb1": [0]}) + model = get_peft_model(model, config0, adapter_name="default") + # adding the adapter is fine + model.add_adapter("other", config1) + model.base_model.set_adapter(["default", "other"]) # does not raise + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_disable_adapters_exiting_context_restores_previous_state( + self, test_name, model_id, config_cls, config_kwargs + ): + # Test that when we exit the disable_adapter context, we correctly restore the enabled state of the modules as + # they were before the context. 
+ model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + tuner_modules = [module for module in model.modules() if isinstance(module, BaseTunerLayer)] + + # all layers should be enabled + assert all(not module.disable_adapters for module in tuner_modules) + with model.disable_adapter(): + pass + # this should not change after exiting the context + assert all(not module.disable_adapters for module in tuner_modules) + + # now disable all layers + model.disable_adapter_layers() + assert all(module.disable_adapters for module in tuner_modules) + with model.disable_adapter(): + pass + assert all(module.disable_adapters for module in tuner_modules) + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_disable_adapters_exiting_context_irregular_state(self, test_name, model_id, config_cls, config_kwargs): + # When we have a model where some adapters are enabled and others are disabled, we should get a warning when + # entering the disable_adapter context because we cannot correctly restore the state of the adapters from + # before the context. After exiting the context, all adapters will be enabled, which is the status quo of how + # we deal with this. 
+ model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + tuner_modules = [module for module in model.modules() if isinstance(module, BaseTunerLayer)] + + # now we mix the states, some enabled some not + if len(tuner_modules) < 2: + # next check only works with more than 1 tuner module + return + + # disable a single layer + tuner_modules[0].enable_adapters(False) + # sanity check that we have both enabled and disabled layers + assert {module.disable_adapters for module in tuner_modules} == {True, False} + # check that we get a warning with irregular states + msg = "The model contains some adapter layers that are enabled and others that are disabled" + with pytest.warns(UserWarning, match=msg): + with model.disable_adapter(): + pass + + # when encountering irregular adapters, we enable all adapters at the end of the context + assert all(not module.disable_adapters for module in tuner_modules) + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_delete_adapter(self, test_name, model_id, config_cls, config_kwargs): + _skip_tests_with_multiple_adapters_with_target_parameters(config_cls, config_kwargs) + self._test_delete_adapter(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_delete_inactive_adapter(self, test_name, model_id, config_cls, config_kwargs): + _skip_tests_with_multiple_adapters_with_target_parameters(config_cls, config_kwargs) + self._test_delete_inactive_adapter(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_delete_unknown_adapter_raises(self, test_name, model_id, config_cls, config_kwargs): + self._test_delete_unknown_adapter_raises(model_id, config_cls, config_kwargs) + + def 
test_delete_adapter_with_multiple_adapters_works(self): + # Add 3 adapters, delete the active one, the next one should be active, delete the inactive one, the active one + # should stay the same. + config0 = LoraConfig(target_modules=["lin0"]) + config1 = LoraConfig(target_modules=["lin0"]) + config2 = LoraConfig(target_modules=["lin0"]) + model = get_peft_model(MLP(), config0, adapter_name="adapter0").to(self.torch_device) + model.add_adapter("adapter1", config1) + model.add_adapter("adapter2", config2) + + inputs = self.prepare_inputs_for_testing() + assert model.active_adapters == ["adapter0"] + model(**inputs) # does not raise + + # delete the active adapter, next one should become active + model.delete_adapter("adapter0") + assert model.active_adapters == ["adapter1"] + model(**inputs) # does not raise + + # delete an inactive adapter, should not affect the active adapter + model.delete_adapter("adapter2") + assert model.active_adapters == ["adapter1"] + model(**inputs) # does not raise + + def test_delete_adapter_multiple_adapters_with_modules_to_save(self): + # There are 3 adapters. Adapter 0 has modules_to_save. Delete it, we should switch to adapter 1, which does not + # have modules_to_save. Then, we delete it too, switching to adapter 2, which has modules_to_save. Finally, we + # delete the last adapter (state is updated but forward is no longer possible). 
+ model = MLP() + inputs = self.prepare_inputs_for_testing() + + config0 = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + config1 = LoraConfig(target_modules=["lin0"]) + config2 = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + model = get_peft_model(model, config0, adapter_name="adapter0").to(self.torch_device) + model.add_adapter("adapter1", config1) + model.add_adapter("adapter2", config2) + + assert model.active_adapters == ["adapter0"] + assert model.modules_to_save == {"lin1"} + assert set(model.base_model.model.lin1.modules_to_save) == {"adapter0", "adapter2"} + model(**inputs) # does not raise + + # delete active adapter, should switch to the next adapter (which does not have modules_to_save) + model.delete_adapter("adapter0") + assert model.active_adapters == ["adapter1"] + assert model.modules_to_save == {"lin1"} + assert set(model.base_model.model.lin1.modules_to_save) == {"adapter2"} + model(**inputs) # does not raise + + # delete active adapter, should switch to the next adapter (which *does* have modules_to_save) + model.delete_adapter("adapter1") + assert model.active_adapters == ["adapter2"] + assert model.modules_to_save == {"lin1"} + assert set(model.base_model.model.lin1.modules_to_save) == {"adapter2"} + model(**inputs) # does not raise + + # delete last adapter + model.delete_adapter("adapter2") + assert model.active_adapters == [] + assert model.modules_to_save is None + assert set(model.base_model.model.lin1.modules_to_save) == set() + + def test_delete_adapter_multiple_adapters_with_trainable_token_indices(self): + # Same as the previous test, just using trainable_token_indices instead of modules_to_save + # Note that we need to use a transformers model for trainable_token_indices + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-OPTForCausalLM") + inputs = {"input_ids": torch.arange(10).view(-1, 1).to(self.torch_device)} + + config0 = LoraConfig(target_modules=["q_proj"], 
trainable_token_indices=[0, 1]) + config1 = LoraConfig(target_modules=["q_proj"]) + config2 = LoraConfig(target_modules=["q_proj"], trainable_token_indices=[1, 3]) + model = get_peft_model(model, config0, adapter_name="adapter0").to(self.torch_device) + model.add_adapter("adapter1", config1) + model.add_adapter("adapter2", config2) + + embed_tokens = model.base_model.model.model.decoder.embed_tokens + lm_head = model.base_model.model.lm_head + + assert model.active_adapters == ["adapter0"] + assert set(embed_tokens.token_adapter.trainable_tokens_delta) == {"adapter0", "adapter2"} + assert set(embed_tokens.token_adapter.trainable_tokens_original) == {"adapter0", "adapter2"} + assert set(lm_head.token_adapter.trainable_tokens_delta) == {"adapter0", "adapter2"} + assert set(lm_head.token_adapter.trainable_tokens_original) == {"adapter0", "adapter2"} + model(**inputs) # does not raise + + # delete active adapter, should switch to the next adapter (which does not have modules_to_save) + model.delete_adapter("adapter0") + assert model.active_adapters == ["adapter1"] + assert set(embed_tokens.token_adapter.trainable_tokens_delta) == {"adapter2"} + assert set(embed_tokens.token_adapter.trainable_tokens_original) == {"adapter2"} + assert set(lm_head.token_adapter.trainable_tokens_delta) == {"adapter2"} + assert set(lm_head.token_adapter.trainable_tokens_original) == {"adapter2"} + model(**inputs) # does not raise + + # delete active adapter, should switch to the next adapter (which *does* have modules_to_save) + model.delete_adapter("adapter1") + assert model.active_adapters == ["adapter2"] + assert set(embed_tokens.token_adapter.trainable_tokens_delta) == {"adapter2"} + assert set(embed_tokens.token_adapter.trainable_tokens_original) == {"adapter2"} + assert set(lm_head.token_adapter.trainable_tokens_delta) == {"adapter2"} + assert set(lm_head.token_adapter.trainable_tokens_original) == {"adapter2"} + model(**inputs) # does not raise + + # delete last adapter + 
model.delete_adapter("adapter2") + assert model.active_adapters == [] + assert set(embed_tokens.token_adapter.trainable_tokens_delta) == set() + assert set(embed_tokens.token_adapter.trainable_tokens_original) == set() + assert set(lm_head.token_adapter.trainable_tokens_delta) == set() + assert set(lm_head.token_adapter.trainable_tokens_original) == set() + + @pytest.mark.parametrize("test_name, model_id, config_cls, config_kwargs", TEST_CASES) + def test_adding_multiple_adapters_with_bias_raises(self, test_name, model_id, config_cls, config_kwargs): + self._test_adding_multiple_adapters_with_bias_raises(model_id, config_cls, config_kwargs) + + @staticmethod + def _check_requires_grad(module, adapter_name, requires_grad): + # a bit of a clumsy way to test requires_grad on the PEFT parameters + for name in module.adapter_layer_names: + module_dict = getattr(module, name) + if adapter_name not in module_dict: + continue + attr = module_dict[adapter_name] + if isinstance(attr, nn.Module): + for param in attr.parameters(): + assert param.requires_grad == requires_grad + else: # it's an nn.Parameter + assert attr.requires_grad == requires_grad + + @pytest.mark.parametrize("config_cls", ALL_PEFT_CONFIG_CLASSES) + def test_set_requires_grad(self, config_cls): + # checks that the model.set_requires_grad method works as expected + if config_cls == TrainableTokensConfig: + pytest.skip( + "TrainableTokensConfig has a separate test for set_requires_grad, as it needs a different model." 
+ ) + + config_kwargs = {"target_modules": ["layers.0.lin0"]} + if config_cls == IA3Config: + config_kwargs["feedforward_modules"] = [] + config0 = config_cls(**config_kwargs) + model = DeepMLP(size=256) # a size that works with all adapters + model = get_peft_model(model, config0, adapter_name="adapter0").eval() + + # check that it works with a single adapter + self._check_requires_grad(model.base_model.model.layers[0].lin0, adapter_name="adapter0", requires_grad=True) + + # add another adapter with two target modules and with modules_to_save + config_kwargs["target_modules"] = ["layers.0.lin0", "layers.1.lin0"] + config_kwargs["modules_to_save"] = ["layers.2.lin0"] + config1 = config_cls(**config_kwargs) + model.add_adapter("adapter1", config1) + + # adapter0 still has requires_grad=True, adapter1 has requires_grad=False + self._check_requires_grad(model.base_model.model.layers[0].lin0, adapter_name="adapter0", requires_grad=True) + self._check_requires_grad(model.base_model.model.layers[0].lin0, adapter_name="adapter1", requires_grad=False) + self._check_requires_grad(model.base_model.model.layers[1].lin0, adapter_name="adapter1", requires_grad=False) + self._check_requires_grad(model.base_model.model.layers[2].lin0, adapter_name="adapter1", requires_grad=False) + + # enable grad for adapter1; adapter0 is unaffected + model.set_requires_grad(adapter_names="adapter1") + self._check_requires_grad(model.base_model.model.layers[0].lin0, adapter_name="adapter0", requires_grad=True) + self._check_requires_grad(model.base_model.model.layers[0].lin0, adapter_name="adapter1", requires_grad=True) + self._check_requires_grad(model.base_model.model.layers[1].lin0, adapter_name="adapter1", requires_grad=True) + self._check_requires_grad(model.base_model.model.layers[2].lin0, adapter_name="adapter1", requires_grad=True) + + # disable adapter for both + model.set_requires_grad(adapter_names=["adapter0", "adapter1"], requires_grad=False) + 
self._check_requires_grad(model.base_model.model.layers[0].lin0, adapter_name="adapter0", requires_grad=False) + self._check_requires_grad(model.base_model.model.layers[0].lin0, adapter_name="adapter1", requires_grad=False) + self._check_requires_grad(model.base_model.model.layers[1].lin0, adapter_name="adapter1", requires_grad=False) + + def test_set_requires_grad_trainable_tokens(self): + # same as test_set_requires_grad for trainable tokens + class EmbModel(nn.Module): + def __init__(self): + super().__init__() + self.emb0 = nn.Embedding(10, 10) + self.emb1 = nn.Embedding(10, 10) + + config_kwargs = {"target_modules": ["emb0"], "token_indices": [0, 2, 4]} + config0 = TrainableTokensConfig(**config_kwargs) + model = EmbModel() + model = get_peft_model(model, config0, adapter_name="adapter0").eval() + + # check that it works with a single adapter + self._check_requires_grad(model.base_model.model.emb0, adapter_name="adapter0", requires_grad=True) + + # add another adapter which targets 2 embedding layers + config_kwargs["target_modules"] = ["emb0", "emb1"] + config1 = TrainableTokensConfig(**config_kwargs) + model.add_adapter("adapter1", config1) + + # adapter0 still has requires_grad=True, adapter1 has requires_grad=False + self._check_requires_grad(model.base_model.model.emb0, adapter_name="adapter0", requires_grad=True) + self._check_requires_grad(model.base_model.model.emb0, adapter_name="adapter1", requires_grad=False) + self._check_requires_grad(model.base_model.model.emb1, adapter_name="adapter1", requires_grad=False) + + # enable grad for adapter1; adapter0 is unaffected + model.set_requires_grad(adapter_names="adapter1") + self._check_requires_grad(model.base_model.model.emb0, adapter_name="adapter0", requires_grad=True) + self._check_requires_grad(model.base_model.model.emb0, adapter_name="adapter1", requires_grad=True) + self._check_requires_grad(model.base_model.model.emb1, adapter_name="adapter1", requires_grad=True) + + # disable adapter for both + 
model.set_requires_grad(adapter_names=["adapter0", "adapter1"], requires_grad=False) + self._check_requires_grad(model.base_model.model.emb0, adapter_name="adapter0", requires_grad=False) + self._check_requires_grad(model.base_model.model.emb0, adapter_name="adapter1", requires_grad=False) + self._check_requires_grad(model.base_model.model.emb1, adapter_name="adapter1", requires_grad=False) + + def test_weight_bias_attributes(self): + model = MLP() + config = LoraConfig(target_modules=["lin0"]) + model = get_peft_model(model, config) + assert hasattr(model.base_model.model.lin0, "weight") + assert hasattr(model.base_model.model.lin0, "bias") + + def test_multiple_adapters_automatic_modules_to_save(self): + # See issue 1574 + # When we use certain task types, PeftModel.modules_to_save is automatically updated to include some extra + # layers not specified in the PeftConfig. This attribute should be honored for all adapters, not just for + # the default adapter. + config0 = LoraConfig(task_type=TaskType.SEQ_CLS) + config1 = LoraConfig(task_type=TaskType.SEQ_CLS) + model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased") + model = get_peft_model(model, config0) + # sanity check + assert model.modules_to_save + + model.add_adapter("other", config1) + assert "default" in model.base_model.classifier.modules_to_save + assert "other" in model.base_model.classifier.modules_to_save + + @pytest.mark.parametrize( + "config_cls", [IA3Config, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig, BoneConfig, ShiraConfig, MissConfig] + ) + def test_multiple_adapters_mixed_modules_to_save(self, config_cls): + # See issue 1574 + # Check that we can have a model where one adapter has modules_to_save and the other doesn't. It should be + # possible to switch between those adapters and to use them. 
+ if hasattr(config_cls, "feedforward_modules"): # IA³ + config_cls = partial(config_cls, feedforward_modules=["lin0"]) + + if config_cls == BoneConfig or config_cls == MissConfig: + config_cls = partial(config_cls, r=2) + if config_cls == ShiraConfig: + config_cls = partial(config_cls, r=1) + + config0 = config_cls(target_modules=["lin0"], modules_to_save=["lin1"]) + config1 = config_cls(target_modules=["lin0"]) + model = MLP() + model = get_peft_model(model, config0).to(self.torch_device) + model.add_adapter("other", config1) + + assert "default" in model.base_model.lin1.modules_to_save + assert "other" not in model.base_model.lin1.modules_to_save + + # check that switching adapters and predicting does not raise + inputs = self.prepare_inputs_for_testing() + # "default" adapter is active + model(**inputs) + # switch to "other" adapter + model.set_adapter("other") + model(**inputs) + + @pytest.mark.parametrize( + "config_cls", [IA3Config, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig, BoneConfig, ShiraConfig] + ) + def test_multiple_adapters_mixed_modules_to_save_order_switched(self, config_cls): + # See issue 1574 + # Same test as test_multiple_adapters_mixed_modules_to_save, but this time the 2nd adapter has modules_to_save. 
+ if hasattr(config_cls, "feedforward_modules"): # IA³ + config_cls = partial(config_cls, feedforward_modules=["lin0"]) + + if config_cls == BoneConfig or config_cls == MissConfig: + config_cls = partial(config_cls, r=2) + if config_cls == ShiraConfig: + config_cls = partial(config_cls, r=1) + + config0 = config_cls(target_modules=["lin0"]) + config1 = config_cls(target_modules=["lin0"], modules_to_save=["lin1"]) + model = MLP() + model = get_peft_model(model, config0).to(self.torch_device) + model.add_adapter("other", config1) + + assert "default" not in model.base_model.lin1.modules_to_save + assert "other" in model.base_model.lin1.modules_to_save + + # check that switching adapters and predicting does not raise + inputs = self.prepare_inputs_for_testing() + # "default" adapter is active + model(**inputs) + # switch to "other" adapter + model.set_adapter("other") + model(**inputs) + + def test_multiple_adapters_mixed_modules_to_save_merging_adapters(self): + # See issue 1574 + # This test is similar to test_multiple_adapters_mixed_modules_to_save, but it also checks that merging adapter + # weights works when one adapter has a modules_to_save and the other hasn't + config0 = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + config1 = LoraConfig(target_modules=["lin0"]) + model = MLP() + model = get_peft_model(model, config0).to(self.torch_device) + model.add_adapter("other", config1) + + # check that this does not raise + model.add_weighted_adapter(["default", "other"], weights=[1.0, 1.0], adapter_name="merged") + + # since one of the adapters that was merged has a modules_to_save, that one should be used for the merged + # adapter + assert "default" in model.base_model.model.lin1.modules_to_save + assert "other" not in model.base_model.model.lin1.modules_to_save + assert "merged" in model.base_model.model.lin1.modules_to_save + + # check that using the merged adapter does not raise + model.set_adapter("merged") + inputs = 
self.prepare_inputs_for_testing() + model(**inputs) + + def test_multiple_adapters_same_modules_to_save_merging_adapters_raises(self): + # See issue 1574 + # This test is similar to test_multiple_adapters_mixed_modules_to_save_merging_adapters but here the two + # adapters target the same module with modules_to_save. In this case, trying to merge the adapter weights + # should raise an error. + config0 = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + config1 = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + model = MLP() + model = get_peft_model(model, config0).to(self.torch_device) + model.add_adapter("other", config1) + + msg = re.escape( + "Cannot add weighted adapters if they target the same module with modules_to_save, but found 1 such " + "instance(s)." + ) + with pytest.raises(ValueError, match=msg): + model.add_weighted_adapter(["default", "other"], weights=[1.0, 1.0], adapter_name="merged") + + def test_multiple_adapters_seq_cls_mixed_modules_to_save_merging_adapters(self): + # See issue 1574 + # This test is similar to test_multiple_adapters_mixed_modules_to_save_merging_adapters but uses a SEQ_CLS + # model like in test_multiple_adapters_automatic_modules_to_save. This should raise an error because the same + # module is implicitly targeted by modules_to_save twice. + config0 = LoraConfig(task_type=TaskType.SEQ_CLS) + config1 = LoraConfig(task_type=TaskType.SEQ_CLS) + model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased") + model = get_peft_model(model, config0) + model.add_adapter("other", config1) + + msg = re.escape( + "Cannot add weighted adapters if they target the same module with modules_to_save, but found 1 such " + "instance(s)." 
+ ) + with pytest.raises(ValueError, match=msg): + model.add_weighted_adapter(["default", "other"], weights=[1.0, 1.0], adapter_name="merged") + + @pytest.mark.parametrize( + "config_cls", [IA3Config, LoHaConfig, LoKrConfig, LoraConfig, HRAConfig, BoneConfig, MissConfig] + ) + def test_add_weighted_adapter_cat_with_rank_pattern(self, config_cls): + # Fixes a bug described in #2512, which resulted from the rank_pattern not being taken into account + config0 = LoraConfig(target_modules=["lin0", "lin1"], r=8, rank_pattern={"lin0": 2}) + config1 = LoraConfig(target_modules=["lin0", "lin1"], r=8, rank_pattern={"lin0": 16}) + model = MLP() + model = get_peft_model(model, config0).to(self.torch_device) + model.add_adapter("other", config1) + model.add_weighted_adapter( + ["default", "other"], weights=[1.0, 1.0], adapter_name="merged", combination_type="cat" + ) + + def test_add_weighted_adapter_negative_weight_negates_adapter(self): + # Test that weight=-1.0 properly negates an adapter + torch.manual_seed(42) + model = MLP() + config = LoraConfig(target_modules=["lin0"], init_lora_weights=False) + model = get_peft_model(model, config, adapter_name="adapter1") + + # Create merged adapter with weight=1.0 + model.add_weighted_adapter( + adapters=["adapter1"], + weights=[1.0], + adapter_name="merged_positive", + combination_type="linear", + ) + + # Create merged adapter with weight=-1.0 + model.add_weighted_adapter( + adapters=["adapter1"], + weights=[-1.0], + adapter_name="merged_negative", + combination_type="linear", + ) + + # Get the LoRA weights for comparison + for name, module in model.named_modules(): + if hasattr(module, "lora_A") and "merged_positive" in module.lora_A: + pos_A = module.lora_A["merged_positive"].weight.data + neg_A = module.lora_A["merged_negative"].weight.data + pos_B = module.lora_B["merged_positive"].weight.data + neg_B = module.lora_B["merged_negative"].weight.data + + # Check that negative adapter is negation of positive + # Since we apply sign 
to both A and B: sign * sqrt(|w|) + # For w=1: sqrt(1) = 1, for w=-1: -sqrt(1) = -1 + assert torch.allclose(neg_A, -pos_A, atol=1e-6), "A matrices should be negated" + assert torch.allclose(neg_B, -pos_B, atol=1e-6), "B matrices should be negated" + + def test_add_weighted_adapter_subtraction_with_negative_weights(self): + # Test that merging two identical adapters with weights [1.0, -1.0] results in approximately zero weights + model = MLP() + config = LoraConfig(target_modules=["lin0"], init_lora_weights=False) + + # Create two identical adapters by using the same seed + torch.manual_seed(42) + model = get_peft_model(model, config, adapter_name="adapter1") + + torch.manual_seed(42) + model.add_adapter("adapter2", config) + + # Merge with weights [1.0, -1.0] - should cancel out exactly + model.add_weighted_adapter( + adapters=["adapter1", "adapter2"], + weights=[1.0, -1.0], + adapter_name="cancelled", + combination_type="linear", + ) + + # Verify the merged adapter has weights of approximately 0 + for name, module in model.named_modules(): + if hasattr(module, "lora_A") and "cancelled" in module.lora_A: + cancelled_A = module.lora_A["cancelled"].weight.data + cancelled_B = module.lora_B["cancelled"].weight.data + + # The weights should be approximately zero (they cancel out) + assert torch.allclose(cancelled_A, torch.zeros_like(cancelled_A), atol=1e-5), ( + f"Cancelled A should be ~0, got max abs value {cancelled_A.abs().max()}" + ) + assert torch.allclose(cancelled_B, torch.zeros_like(cancelled_B), atol=1e-5), ( + f"Cancelled B should be ~0, got max abs value {cancelled_B.abs().max()}" + ) + + def test_add_weighted_adapter_negative_weight_with_different_scaling(self): + # Test negative weights with different scaling factors (lora_alpha) + # This edge case ensures negative weights work correctly with different scaling values + torch.manual_seed(42) + model = MLP() + + # Create two configs with different lora_alpha (different scaling factors) + config1 = 
LoraConfig( + r=8, + lora_alpha=16, # scaling = 16/8 = 2 + target_modules=["lin0"], + lora_dropout=0.0, + bias="none", + init_lora_weights=False, + ) + config2 = LoraConfig( + r=8, + lora_alpha=32, # scaling = 32/8 = 4 + target_modules=["lin0"], + lora_dropout=0.0, + bias="none", + init_lora_weights=False, + ) + + model = get_peft_model(model, config1, adapter_name="adapter1") + model.add_adapter("adapter2", config2) + + # Merge with negative weight - should handle different scalings correctly + model.add_weighted_adapter( + adapters=["adapter1", "adapter2"], + weights=[0.5, -0.3], + adapter_name="merged_diff_scaling", + combination_type="linear", + ) + + # Verify the merged adapter can run forward pass + model.set_adapter("merged_diff_scaling") + dummy_input = torch.randn(2, 10) + output = model(dummy_input) + assert output is not None + + def test_multiple_adapters_no_needless_copy_modules_to_save(self): + # See 2206 + # The problem was that we keep a "global" modules_to_save on the model which contains all possible + # modules_to_save for each adapter. When the first adapter targets embed_tokens with modules_to_save and the + # second adapter targets lm_head, then embed_tokens will create a copy of the original module for the second + # adapter, even though it's not needed. The copy still acts as expected but uses unnecessary memory. 
+ model_id = "hf-internal-testing/tiny-random-OPTForCausalLM" + model = AutoModelForCausalLM.from_pretrained(model_id).to(self.torch_device) + config0 = LoraConfig(modules_to_save=["embed_tokens"]) + config1 = LoraConfig(modules_to_save=["lm_head"]) + model = get_peft_model(model, config0) + model.add_adapter("other", config1) + + lm_head_keys = list(model.base_model.model.lm_head.modules_to_save.keys()) + assert lm_head_keys == ["other"] + + embed_token_keys = list(model.base_model.model.model.decoder.embed_tokens.modules_to_save.keys()) + # before the fix, this would be: ['default', 'other'] + assert embed_token_keys == ["default"] + + def test_existing_model_card(self): + # ensure that if there is already a model card, it is not overwritten + model = MLP() + config = LoraConfig(target_modules=["lin0"]) + model = get_peft_model(model, config) + + with tempfile.TemporaryDirectory() as tmp_dirname: + # create a model card + text = "---\nmeta: hello\n---\nThis is a model card\n" + with open(os.path.join(tmp_dirname, "README.md"), "w") as f: + f.write(text) + + model.save_pretrained(tmp_dirname) + with open(os.path.join(tmp_dirname, "README.md")) as f: + model_card = f.read() + + assert "library_name: peft" in model_card + assert "meta: hello" in model_card + assert "This is a model card" in model_card + + def test_non_existing_model_card(self): + # ensure that if there is already a model card, it is not overwritten + model = MLP() + config = LoraConfig(target_modules=["lin0"]) + model = get_peft_model(model, config) + + with tempfile.TemporaryDirectory() as tmp_dirname: + model.save_pretrained(tmp_dirname) + with open(os.path.join(tmp_dirname, "README.md")) as f: + model_card = f.read() + + assert "library_name: peft" in model_card + # rough check that the model card is pre-filled + assert len(model_card) > 1000 + + @pytest.mark.parametrize("save_embedding_layers", ["auto", True, False]) + @pytest.mark.parametrize( + "peft_config", + [ + 
(LoraConfig(target_modules=["lin0", "embed_tokens"], init_lora_weights=False)), + (LoraConfig(target_modules=r"^embed_tokens", init_lora_weights=False)), + ], + ) + def test_save_pretrained_targeting_lora_to_embedding_layer(self, save_embedding_layers, tmp_path, peft_config): + model = ModelEmbWithEmbeddingUtils() + model = get_peft_model(model, peft_config) + + if save_embedding_layers == "auto": + # assert warning + msg_start = "Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`." + with pytest.warns(UserWarning, match=msg_start): + model.save_pretrained(tmp_path, save_embedding_layers=save_embedding_layers) + else: + model.save_pretrained(tmp_path, save_embedding_layers=save_embedding_layers) + + state_dict = safe_load_file(tmp_path / "adapter_model.safetensors") + contains_embedding = "base_model.model.embed_tokens.base_layer.weight" in state_dict + + if save_embedding_layers in ["auto", True]: + assert contains_embedding + assert torch.allclose( + model.base_model.model.embed_tokens.base_layer.weight, + state_dict["base_model.model.embed_tokens.base_layer.weight"], + ) + else: + assert not contains_embedding + + @pytest.mark.parametrize("save_embedding_layers", ["auto", True, False]) + @pytest.mark.parametrize( + "peft_config", + [ + (LoraConfig(target_modules=["lin0", "emb"], init_lora_weights=False)), + (LoraConfig(target_modules=r"^emb", init_lora_weights=False)), + ], + ) + def test_save_pretrained_targeting_lora_to_embedding_layer_non_transformers( + self, save_embedding_layers, tmp_path, peft_config + ): + model = ModelEmbConv1D() + model = get_peft_model(model, peft_config) + + if save_embedding_layers is True: + with pytest.warns( + UserWarning, + match=r"Could not identify embedding layer\(s\) because the model is not a 🤗 transformers model\.", + ): + model.save_pretrained(tmp_path, save_embedding_layers=save_embedding_layers) + else: + model.save_pretrained(tmp_path, save_embedding_layers=save_embedding_layers) + 
+ state_dict = safe_load_file(tmp_path / "adapter_model.safetensors") + assert "base_model.model.emb.base_layer.weight" not in state_dict + + def test_load_resized_embedding_ignore_mismatched_sizes(self): + # issue #1605 + # Make it possible to load a LoRA layer that targets an embedding layer even if the sizes mismatch by passing + # ignore_mismatched_sizes=True + model = ModelEmbConv1D(emb_size=100) + config = LoraConfig(target_modules=["emb", "lin0"], init_lora_weights=False) + model = get_peft_model(model, config) + + # note: not using the context manager here because it fails on Windows CI for some reason + tmp_dirname = tempfile.mkdtemp() + try: + model.save_pretrained(tmp_dirname) + model = ModelEmbConv1D(emb_size=105) + + # first check that this raises + with pytest.raises(RuntimeError) as exc: + PeftModel.from_pretrained(model, tmp_dirname) + msg = exc.value.args[0] + assert "size mismatch" in msg and "100" in msg and "105" in msg + + # does not raise + PeftModel.from_pretrained(model, tmp_dirname, ignore_mismatched_sizes=True) + finally: + try: + shutil.rmtree(tmp_dirname) + except PermissionError: + # windows error + pass + + @pytest.mark.parametrize( + "config0", + [ + LoraConfig(target_modules=["lin0"], init_lora_weights=False), + LoKrConfig(target_modules=["lin0"], init_weights=False), + LoHaConfig(target_modules=["lin0"], init_weights=False), + AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False, total_step=1), + IA3Config(target_modules=["lin0"], feedforward_modules=["lin0"], init_ia3_weights=False), + OFTConfig(target_modules=["lin0"], init_weights=False, r=2, oft_block_size=0), + BOFTConfig(target_modules=["lin0"], init_weights=False, boft_block_size=2), + HRAConfig(target_modules=["lin0"], init_weights=False), + BoneConfig(target_modules=["lin0"], init_weights=False, r=2), + MissConfig(target_modules=["lin0"], init_weights=False, r=2), + ], + ) + def test_adapter_name_makes_no_difference(self, config0): + # It should not matter whether 
we use the default adapter name or a custom one + model_cls = MLP + input = torch.arange(90).reshape(9, 10).to(self.torch_device) + + # base model + torch.manual_seed(0) + base_model = model_cls().eval().to(self.torch_device) + output_base = base_model(input) + + # default name + torch.manual_seed(0) + base_model = model_cls().eval().to(self.torch_device) + torch.manual_seed(0) + peft_model_default = get_peft_model(base_model, config0, adapter_name="default").eval().to(self.torch_device) + output_default = peft_model_default(input) + sd_default = peft_model_default.state_dict() + + # custom name 1 + torch.manual_seed(0) + base_model = model_cls().eval().to(self.torch_device) + torch.manual_seed(0) + peft_model_custom1 = get_peft_model(base_model, config0, adapter_name="adapter").eval().to(self.torch_device) + output_custom1 = peft_model_custom1(input) + sd_custom1 = peft_model_custom1.state_dict() + + # custom name 2 + torch.manual_seed(0) + base_model = model_cls().eval().to(self.torch_device) + torch.manual_seed(0) + peft_model_custom2 = ( + get_peft_model(base_model, config0, adapter_name="other-name").eval().to(self.torch_device) + ) + output_custom2 = peft_model_custom2(input) + sd_custom2 = peft_model_custom2.state_dict() + + assert len(sd_default) == len(sd_custom1) == len(sd_custom2) + for key in sd_default: + key1 = key.replace("default", "adapter") + key2 = key.replace("default", "other-name") + assert key1 in sd_custom1 + assert key2 in sd_custom2 + for k0, k1, k2 in zip(sd_default, sd_custom1, sd_custom2): + assert torch.allclose(sd_default[k0], sd_custom1[k1]) + assert torch.allclose(sd_default[k0], sd_custom2[k2]) + + assert not torch.allclose(output_base, output_default) + assert not torch.allclose(output_base, output_custom1) + assert not torch.allclose(output_base, output_custom2) + assert torch.allclose(output_custom1, output_custom2) + assert torch.allclose(output_default, output_custom1) + + def test_gpt2_dora_merge_and_unload(self): + # see 
https://github.com/huggingface/peft/pull/1588#discussion_r1537914207 + model = AutoModelForCausalLM.from_pretrained("gpt2") + config = LoraConfig(task_type="CAUSAL_LM", use_dora=True) + model = get_peft_model(model, config) + # should not raise an error + model.merge_and_unload() + + def test_gpt2_dora_merge_and_unload_safe_merge(self): + # see https://github.com/huggingface/peft/pull/1588#discussion_r1537914207 + model = AutoModelForCausalLM.from_pretrained("gpt2") + config = LoraConfig(task_type="CAUSAL_LM", use_dora=True) + model = get_peft_model(model, config) + # should not raise an error + model.merge_and_unload(safe_merge=True) + + def test_unload_adapter_multihead_attention(self): + # MultiheadAttention has special logic for unloading, that logic is covered by this test + self._test_unload_adapter( + model_id="MHA", + config_cls=LoraConfig, + config_kwargs={"target_modules": ["mha"], "init_lora_weights": False}, + ) + + def test_dora_save_and_load_remapping(self): + # Here we test the refactor of DoRA which changed lora_magnitude_vector from a ParameterDict to a ModuleDict + # with a DoraLayer instance. The old parameter is now the "weight" attribute of that layer. Since we want the + # state_dict format not to change, we ensure that the ".weight" part of the key is removed. 
+ model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") + config = LoraConfig(task_type="CAUSAL_LM", use_dora=True) + model = get_peft_model(model, config) + state_dict = model.state_dict() + + # sanity check: state dict contains "lora_magnitude_vector.default.weight" keys + assert any("lora_magnitude_vector.default.weight" in k for k in state_dict) + + # save the model, check the state dict + # note: not using the context manager here because it fails on Windows CI for some reason + tmp_dirname = tempfile.mkdtemp() + try: + model.save_pretrained(tmp_dirname) + state_dict_adapter = safe_load_file(os.path.join(tmp_dirname, "adapter_model.safetensors")) + # note that in the state dict, the "default" part of the key is removed + assert not any("lora_magnitude_vector.weight" in k for k in state_dict_adapter) + + del model + loaded = PeftModel.from_pretrained(AutoModelForCausalLM.from_pretrained("facebook/opt-125m"), tmp_dirname) + finally: + try: + shutil.rmtree(tmp_dirname) + except PermissionError: + # windows error + pass + + state_dict_loaded = loaded.state_dict() + assert state_dict.keys() == state_dict_loaded.keys() + for k in state_dict: + assert torch.allclose(state_dict[k], state_dict_loaded[k]) + + @pytest.mark.parametrize("with_forward_call", [False, True]) + def test_mha_gradients_set_correctly(self, with_forward_call): + # check for this bug: https://github.com/huggingface/peft/issues/761#issuecomment-1893804738 + base_model = ModelMha() + config = LoraConfig(target_modules=["mha"]) + model = get_peft_model(base_model, config) + model = model.to(self.torch_device) + + if with_forward_call: + # after the merge-unmerge roundtrip happening in forward of lora MHA, the base weights should be set to + # requires_grad=False + inputs = self.prepare_inputs_for_testing() + model(**inputs) + + assert model.base_model.model.mha.base_layer.out_proj.base_layer.weight.requires_grad is False + assert 
model.base_model.model.mha.base_layer.in_proj_weight.requires_grad is False + + # _restore_weights used to ignore the gradient, this checks that it is indeed considered + model.base_model.model.mha._restore_weights() + assert model.base_model.model.mha.base_layer.out_proj.base_layer.weight.requires_grad is False + assert model.base_model.model.mha.base_layer.in_proj_weight.requires_grad is False + + model.base_model.model.mha.base_layer.out_proj.base_layer.weight.requires_grad = True + model.base_model.model.mha.base_layer.in_proj_weight.requires_grad = True + assert model.base_model.model.mha.base_layer.out_proj.base_layer.weight.requires_grad is True + assert model.base_model.model.mha.base_layer.in_proj_weight.requires_grad is True + + model.base_model.model.mha._restore_weights() + assert model.base_model.model.mha.base_layer.out_proj.base_layer.weight.requires_grad is True + assert model.base_model.model.mha.base_layer.in_proj_weight.requires_grad is True + + +class TestMultiRankAdapter: + """Tests related to multirank LoRA adapters""" + + def test_multirank(self): + config_1 = LoraConfig( + r=8, + lora_alpha=8, + init_lora_weights=False, + target_modules=["lin0", "lin1"], + ) + config_2 = LoraConfig( + r=8, + lora_alpha=8, + init_lora_weights=False, + target_modules=["lin0", "lin1"], + rank_pattern={"lin0": 4}, + alpha_pattern={"lin0": 4}, + ) + + # Add first adapter + model = get_peft_model(MLP(), config_1, adapter_name="first") + + # Add second adapter + model.add_adapter("second", config_2) + + # Extract current and expected ranks + rank_current = model.lin0.lora_A["second"].weight.shape[0] + rank_expected = config_2.rank_pattern["lin0"] + + assert rank_current == rank_expected, f"Rank {rank_current} is not equal to expected {rank_expected}" + + def test_multirank_2(self): + rank_pattern = {} + alpha_pattern = {} + r = 4 + lora_alpha = 8 + + for i in range(10): + rank = 64 // (i + 1) + for j in range(2): + rank_pattern[f"layers.{i}.lin{j}"] = rank + 
alpha_pattern[f"layers.{i}.lin{j}"] = 2 * rank + + config = LoraConfig( + r=r, + lora_alpha=lora_alpha, + init_lora_weights=False, + target_modules=["lin0", "lin1"], + rank_pattern=rank_pattern, + alpha_pattern=alpha_pattern, + ) + + # Add first adapter + model = get_peft_model(DeepMLP(), config, adapter_name="first") + + # Add second adapter + model.add_adapter("second", config) + + for adapter in ["first", "second"]: + for key, module in model.base_model.model.named_modules(): + if isinstance(module, BaseTunerLayer): + rank_expected = rank_pattern.get(key, r) + rank_current = module.lora_A[adapter].weight.shape[0] + assert rank_current == rank_expected, ( + f"Rank {rank_current} is not equal to expected {rank_expected}" + ) + + +class TestLayerRepr: + """Tests related to the repr of adapted models""" + + def test_repr_lora_linear(self): + config = LoraConfig(target_modules=["lin0"]) + model = get_peft_model(MLP(), config) + print_output = repr(model.model.lin0) + assert print_output.startswith("lora.Linear") + assert "in_features=10" in print_output + assert "out_features=20" in print_output + assert "lora_A" in print_output + assert "lora_B" in print_output + assert "default" in print_output + + def test_repr_lora_embedding(self): + config = LoraConfig(target_modules=["emb"]) + model = get_peft_model(ModelEmbConv1D(), config) + print_output = repr(model.model.emb) + assert print_output.startswith("lora.Embedding") + assert "100, 5" in print_output + assert "lora_embedding_A" in print_output + assert "lora_embedding_B" in print_output + assert "default" in print_output + + def test_repr_lora_conv1d(self): + config = LoraConfig(target_modules=["conv1d"]) + model = get_peft_model(ModelEmbConv1D(), config) + print_output = repr(model.model.conv1d) + assert print_output.startswith("lora.Linear") + assert "in_features=5" in print_output + assert "out_features=1" in print_output + assert "lora_A" in print_output + assert "lora_B" in print_output + assert "default" in 
print_output + + def test_repr_lora_conv2d(self): + config = LoraConfig(target_modules=["conv2d"]) + model = get_peft_model(ModelConv2D(), config) + print_output = repr(model.model.conv2d) + assert print_output.startswith("lora.Conv2d") + assert "5, 10" in print_output + assert "kernel_size=(3, 3)" in print_output + assert "stride=(1, 1)" in print_output + assert "lora_A" in print_output + assert "lora_B" in print_output + assert "default" in print_output + + def test_repr_lora_paramwrapper(self): + config = LoraConfig(target_parameters=["lin0.weight"]) + model = get_peft_model(MLP(), config) + print_output = repr(model.model.lin0) + assert print_output.startswith("lora.ParamWrapper") + # important: targeted parameter should be contained: + assert "parameter_name='weight'" in print_output + assert "in_features=10" in print_output + assert "out_features=20" in print_output + assert "lora_A" in print_output + assert "lora_B" in print_output + assert "default" in print_output + + +class TestMultipleActiveAdapters: + """ + A test class to test the functionality of multiple active adapters. + + This is not specifically tied to custom models, it's just easy to test here and testing it on all types of models + would be overkill. 
+ """ + + torch_device = infer_device() + + def prepare_inputs_for_testing(self): + X = torch.arange(90).view(9, 10).to(self.torch_device) + return {"X": X} + + def set_multiple_active_adapters(self, model, adapter_names): + for module in model.modules(): + if isinstance(module, (BaseTunerLayer, AuxiliaryTrainingWrapper)): + module.set_adapter(adapter_names) + + def resolve_model_cls(self, tuner_method): + if tuner_method == "lora+trainable_tokens": + # for this method we need an Embedding layer to target + return ModelEmbConv1D() + if tuner_method == "ia3": + return MLP(bias=False) + return MLP(bias=True) + + @pytest.mark.parametrize( + "test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2", MULTIPLE_ACTIVE_ADAPTERS_TEST_CASES + ) + def test_multiple_active_adapters_forward( + self, test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2 + ): + _skip_tests_with_multiple_adapters_with_target_parameters(config_cls, config_kwargs_2) + + torch.manual_seed(0) + + model = self.resolve_model_cls(tuner_method) + model = model.to(self.torch_device).eval() + + X = self.prepare_inputs_for_testing() + + config_1 = config_cls(**config_kwargs_1) + config_2 = config_cls(**config_kwargs_2) + + peft_model = get_peft_model(model, config_1, adapter_name="adapter_1") + peft_model.add_adapter("adapter_2", config_2) + + # the assumption that the output of the combined output of two adapters is != to the output of one + # adapter is not true for unmodified trainable tokens as they just mimic the existing embedding matrix. + # therefore, we modify the weights so that the adapter weights differs from the embedding weights. + # + # We do it this way because we have no way to pass something like `init_weights=False` to the token adapter. 
+ if "trainable_tokens" in tuner_method: + peft_model.emb.token_adapter.trainable_tokens_delta["adapter_1"].data = torch.rand_like( + peft_model.emb.token_adapter.trainable_tokens_delta["adapter_1"].data + ) + peft_model.emb.token_adapter.trainable_tokens_delta["adapter_2"].data = torch.rand_like( + peft_model.emb.token_adapter.trainable_tokens_delta["adapter_2"].data + ) + + # set adapter_1 + peft_model.set_adapter("adapter_1") + adapter_1_output = peft_model(**X) + + # set adapter_2 + peft_model.set_adapter("adapter_2") + adapter_2_output = peft_model(**X) + + # set ["adapter_1", "adapter_2"] + self.set_multiple_active_adapters(peft_model, ["adapter_1", "adapter_2"]) + combined_output = peft_model(**X) + + assert not torch.allclose(adapter_1_output, adapter_2_output, atol=1e-5) + assert not torch.allclose(adapter_1_output, combined_output, atol=1e-5) + assert not torch.allclose(adapter_2_output, combined_output, atol=1e-5) + + if (tuner_method == "lora") and not (config_1.target_parameters or config_2.target_parameters): + # Create a weighted adapter combining both adapters and check that its output is same as setting multiple + # active adapters. `target_parameters` is not supported. 
+ peft_model.add_weighted_adapter( + ["adapter_1", "adapter_2"], [1.0, 1.0], "new_combined_adapter", combination_type="cat" + ) + peft_model.set_adapter("new_combined_adapter") + new_combined_output = peft_model(**X) + assert torch.allclose(new_combined_output, combined_output, atol=1e-5) + + @pytest.mark.parametrize( + "test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2", MULTIPLE_ACTIVE_ADAPTERS_TEST_CASES + ) + def test_multiple_active_adapters_merge_and_unmerge( + self, test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2 + ): + _skip_tests_with_multiple_adapters_with_target_parameters(config_cls, config_kwargs_2) + + torch.manual_seed(0) + + model = self.resolve_model_cls(tuner_method) + model = model.to(self.torch_device).eval() + + X = self.prepare_inputs_for_testing() + base_output = model(**X) + + config_1 = config_cls(**config_kwargs_1) + config_2 = config_cls(**config_kwargs_2) + + peft_model = get_peft_model(model, config_1, adapter_name="adapter_1") + peft_model.add_adapter("adapter_2", config_2) + + # set ["adapter_1", "adapter_2"] + self.set_multiple_active_adapters(peft_model, ["adapter_1", "adapter_2"]) + combined_output = peft_model(**X) + + peft_model.merge_adapter() + merged_combined_output = peft_model(**X) + assert torch.allclose(merged_combined_output, combined_output, atol=1e-4) + + peft_model.unmerge_adapter() + + with peft_model.disable_adapter(): + disabled_adapter_output = peft_model(**X) + + assert torch.allclose(disabled_adapter_output, base_output, atol=1e-4) + + @pytest.mark.parametrize( + "test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2", MULTIPLE_ACTIVE_ADAPTERS_TEST_CASES + ) + def test_merge_layers_multi(self, test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2): + _skip_tests_with_multiple_adapters_with_target_parameters(config_cls, config_kwargs_2) + + torch.manual_seed(0) + + model = self.resolve_model_cls(tuner_method) + model = 
model.to(self.torch_device).eval() + + config_1 = config_cls(**config_kwargs_1) + config_2 = config_cls(**config_kwargs_2) + + model = get_peft_model(model, config_1) + + # the assumption that the output of the combined output of two adapters is != to the output of one + # adapter is not true for unmodified trainable tokens as they just mimic the existing embedding matrix. + # therefore, we modify the weights so that the adapter weights differs from the embedding weights. in this + # case we even use 20*rand to be very distinct to adapter 2 since we're comparing outputs and not embeddings + # with rather high tolerance values. this is also the reason why `init_weights` is not sufficient here and + # when using `.trainable_token_indices` we do not have the utility of `init_weights` anyway. + if "trainable_tokens" in tuner_method: + model.emb.token_adapter.trainable_tokens_delta["default"].data = 20 * torch.rand_like( + model.emb.token_adapter.trainable_tokens_delta["default"].data + ) + + dummy_input = self.prepare_inputs_for_testing() + model.eval() + + with torch.inference_mode(): + logits_adapter_1 = model(**dummy_input)[0] + + model.add_adapter("adapter-2", config_2) + model.set_adapter("adapter-2") + + # same as above but for adapter 2 + if "trainable_tokens" in tuner_method: + model.emb.token_adapter.trainable_tokens_delta["adapter-2"].data = 2 * torch.rand_like( + model.emb.token_adapter.trainable_tokens_delta["adapter-2"].data + ) + + model.eval() + + with torch.inference_mode(): + logits_adapter_2 = model(**dummy_input)[0] + + assert not torch.allclose(logits_adapter_1, logits_adapter_2, atol=1e-3, rtol=1e-3) + + model.set_adapter("default") + + with torch.inference_mode(): + logits_adapter_1_after_set = model(**dummy_input)[0] + + assert torch.allclose(logits_adapter_1_after_set, logits_adapter_1, atol=1e-3, rtol=1e-3) + + model_copy = copy.deepcopy(model) + model_copy_2 = copy.deepcopy(model) + model_merged_all = 
model.merge_and_unload(adapter_names=["adapter-2", "default"]) + + with torch.inference_mode(): + logits_merged_all = model_merged_all(**dummy_input)[0] + + assert not torch.allclose(logits_merged_all, logits_adapter_2, atol=1e-3, rtol=1e-3) + assert not torch.allclose(logits_merged_all, logits_adapter_1, atol=1e-3, rtol=1e-3) + + model_merged_adapter_2 = model_copy.merge_and_unload(adapter_names=["adapter-2"]) + + with torch.inference_mode(): + logits_merged_adapter_2 = model_merged_adapter_2(**dummy_input)[0] + + assert torch.allclose(logits_merged_adapter_2, logits_adapter_2, atol=1e-3, rtol=1e-3) + + model_merged_adapter_default = model_copy_2.merge_and_unload(adapter_names=["default"]) + + with torch.inference_mode(): + logits_merged_adapter_default = model_merged_adapter_default(**dummy_input)[0] + + assert torch.allclose(logits_merged_adapter_default, logits_adapter_1, atol=1e-3, rtol=1e-3) + + +class TestRequiresGrad: + """Test that requires_grad is set correctly in specific circumstances + + # See issue #899. + + This is not specifically tied to custom models, it's just easy to test here and testing it on all types of models + would be overkill. + + """ + + def check_requires_grad(self, model, *params_expected: str): + # Check that only the given parameters have requires_grad=True, and all others have requires_grad=False. + # Calling without arguments besides the model means that all parameters should have requires_grad=False. 
+ params_with_requires_grad = [name for name, param in model.named_parameters() if param.requires_grad] + diff = set(params_expected).symmetric_difference(set(params_with_requires_grad)) + msg = f"Expected {params_expected} to require gradients, got {params_with_requires_grad}" + assert len(diff) == 0, msg + + def test_requires_grad_modules_to_save_default(self): + config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + peft_model = get_peft_model(MLP(), config) + + self.check_requires_grad( + peft_model, + "base_model.model.lin1.modules_to_save.default.weight", + "base_model.model.lin1.modules_to_save.default.bias", + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) + + def test_requires_grad_modules_to_save_disabling(self): + config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + peft_model = get_peft_model(MLP(), config) + + # when disabling the adapter, the original module's grad should be enabled and vice versa + peft_model.disable_adapter_layers() + self.check_requires_grad( + peft_model, + "base_model.model.lin1.original_module.weight", + "base_model.model.lin1.original_module.bias", + ) + + # when re-enabling the adapter, the original module's grad should be disabled and vice versa + peft_model.enable_adapter_layers() + self.check_requires_grad( + peft_model, + "base_model.model.lin1.modules_to_save.default.weight", + "base_model.model.lin1.modules_to_save.default.bias", + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) + + # when using the disable_adapter context, the original module's grad should be enabled and vice versa + with peft_model.disable_adapter(): + self.check_requires_grad( + peft_model, + "base_model.model.lin1.original_module.weight", + "base_model.model.lin1.original_module.bias", + ) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + 
"base_model.model.lin1.modules_to_save.default.weight", + "base_model.model.lin1.modules_to_save.default.bias", + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) + + def test_requires_grad_modules_to_save_multiple_adapters(self): + config0 = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin1.modules_to_save.default.weight", + "base_model.model.lin1.modules_to_save.default.bias", + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.modules_to_save.default.weight", + "base_model.model.lin1.modules_to_save.default.bias", + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) + + # set config1 as active, should lead to adapter1 requiring grad + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.modules_to_save.adapter1.weight", + "base_model.model.lin1.modules_to_save.adapter1.bias", + "base_model.model.lin0.lora_A.adapter1.weight", + "base_model.model.lin0.lora_B.adapter1.weight", + ) + + def test_requires_grad_lora_different_targets(self): + # test two different LoRA adapters that target different modules + config0 = LoraConfig(target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = LoraConfig(target_modules=["lin1"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.default.weight", + 
"base_model.model.lin0.lora_B.default.weight", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.lora_A.adapter1.weight", + "base_model.model.lin1.lora_B.adapter1.weight", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin1.lora_A.adapter1.weight", + "base_model.model.lin1.lora_B.adapter1.weight", + ) + + def test_requires_grad_lora_same_targets(self): + # same as previous test, except that LoRA adapters target the same layer + config0 = LoraConfig(target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = LoraConfig(target_modules=["lin0"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.default.weight", + "base_model.model.lin0.lora_B.default.weight", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.adapter1.weight", + "base_model.model.lin0.lora_B.adapter1.weight", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + 
"base_model.model.lin0.lora_A.adapter1.weight", + "base_model.model.lin0.lora_B.adapter1.weight", + ) + + def test_requires_grad_ia3_different_targets(self): + # test two different IA3 adapters that target different modules + config0 = IA3Config(target_modules=["lin0"], feedforward_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = IA3Config(target_modules=["lin1"], feedforward_modules=["lin1"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.default", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.ia3_l.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin1.ia3_l.adapter1", + ) + + def test_requires_grad_ia3_same_targets(self): + # same as previous test, except that IA3 adapters target the same layer + config0 = IA3Config(target_modules=["lin0"], feedforward_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = IA3Config(target_modules=["lin0"], feedforward_modules=["lin0"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.default", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + 
"base_model.model.lin0.ia3_l.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.adapter1", + ) + + @pytest.mark.xfail(strict=True) + def test_requires_grad_adalora_different_targets(self): + # test two different AdaLora adapters that target different modules + + # Note: This test is expected to fail because first loading one adapter, then the next adapter with + # inference_mode=True incorrectly leads to the requires_grad of the first adapter being turned to False. This is + # of course not desired but has yet to be fixed. In practice, it's unlikely that a user would pass + # inference_mode=True for add_adapter, this flag is mostly being used when calling PeftModel.from_pretrained, so + # we accept this issue for now. Note that only for AdaLoRA do we even need to pass inference_mode=True here, + # other PEFT methods don't require this. 
+ config0 = AdaLoraConfig(target_modules=["lin0"], total_step=1) + peft_model = get_peft_model(MLP(), config0) + + # note: AdaLoRA cannot have more than 1 trainable active adapter, hence enable inference_mode + config1 = AdaLoraConfig(target_modules=["lin1"], total_step=1, inference_mode=True) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.default", + "base_model.model.lin0.lora_B.default", + "base_model.model.lin0.lora_E.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.default", + "base_model.model.lin0.lora_B.default", + "base_model.model.lin0.lora_E.default", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.lora_A.adapter1", + "base_model.model.lin1.lora_B.adapter1", + "base_model.model.lin1.lora_E.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin1.lora_A.adapter1", + "base_model.model.lin1.lora_B.adapter1", + "base_model.model.lin1.lora_E.adapter1", + ) + + @pytest.mark.xfail(strict=True) + def test_requires_grad_adalora_same_targets(self): + # same as previous test, except that AdaLora adapters target the same layer + + # Note: This test is expected to fail because first loading one adapter, then the next adapter with + # inference_mode=True incorrectly leads to the requires_grad of the first adapter being turned to False. This is + # of course not desired but has yet to be fixed. 
In practice, it's unlikely that a user would pass + # inference_mode=True for add_adapter, this flag is mostly being used when calling PeftModel.from_pretrained, so + # we accept this issue for now. Note that only for AdaLoRA do we even need to pass inference_mode=True here, + # other PEFT methods don't require this. + config0 = AdaLoraConfig(target_modules=["lin0"], total_step=1) + peft_model = get_peft_model(MLP(), config0) + + # note: AdaLoRA cannot have more than 1 trainable active adapter, hence enable inference_mode + config1 = AdaLoraConfig(target_modules=["lin0"], total_step=1, inference_mode=True) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.default", + "base_model.model.lin0.lora_B.default", + "base_model.model.lin0.lora_E.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.default", + "base_model.model.lin0.lora_B.default", + "base_model.model.lin0.lora_E.default", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.adapter1", + "base_model.model.lin0.lora_B.adapter1", + "base_model.model.lin0.lora_E.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.adapter1", + "base_model.model.lin0.lora_B.adapter1", + "base_model.model.lin0.lora_E.adapter1", + ) + + def test_requires_grad_lora_conv2d(self): + # test two different LoRA adapters that target different modules + config0 = LoraConfig(target_modules=["conv2d"]) + peft_model = get_peft_model(ModelConv2D(), config0) + + config1 = 
LoraConfig(target_modules=["lin0"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.conv2d.lora_A.default.weight", + "base_model.model.conv2d.lora_B.default.weight", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.conv2d.lora_A.default.weight", + "base_model.model.conv2d.lora_B.default.weight", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.adapter1.weight", + "base_model.model.lin0.lora_B.adapter1.weight", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lora_A.adapter1.weight", + "base_model.model.lin0.lora_B.adapter1.weight", + ) + + def test_requires_grad_lora_emb_conv1d(self): + # test two different LoRA adapters that target different modules + config0 = LoraConfig(target_modules=["conv1d"]) + peft_model = get_peft_model(ModelEmbConv1D(), config0) + + config1 = LoraConfig(target_modules=["emb"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.conv1d.lora_A.default.weight", + "base_model.model.conv1d.lora_B.default.weight", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.conv1d.lora_A.default.weight", + "base_model.model.conv1d.lora_B.default.weight", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.emb.lora_embedding_A.adapter1", + 
"base_model.model.emb.lora_embedding_B.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.emb.lora_embedding_A.adapter1", + "base_model.model.emb.lora_embedding_B.adapter1", + ) + + def test_requires_grad_ia3_conv1d(self): + # test two different IA3 adapters that target different modules + config0 = IA3Config(target_modules=["conv1d"], feedforward_modules=[]) + peft_model = get_peft_model(ModelEmbConv1D(), config0) + + config1 = IA3Config(target_modules=["lin0"], feedforward_modules=["lin0"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.conv1d.ia3_l.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.conv1d.ia3_l.default", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.adapter1", + ) + + def test_requires_grad_ia3_conv2d(self): + # test two different IA3 adapters that target different modules + config0 = IA3Config(target_modules=["conv2d"], feedforward_modules=["conv2d"]) + peft_model = get_peft_model(ModelConv2D(), config0) + + config1 = IA3Config(target_modules=["lin0"], feedforward_modules=[]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.conv2d.ia3_l.default", + ) + + # set config0 as active, should not 
change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.conv2d.ia3_l.default", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.ia3_l.adapter1", + ) + + def test_requires_grad_loha_different_targets(self): + # test two different LoHa adapters that target different modules + config0 = LoHaConfig(target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = LoHaConfig(target_modules=["lin1"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hada_w1_a.default", + "base_model.model.lin0.hada_w1_b.default", + "base_model.model.lin0.hada_w2_a.default", + "base_model.model.lin0.hada_w2_b.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hada_w1_a.default", + "base_model.model.lin0.hada_w1_b.default", + "base_model.model.lin0.hada_w2_a.default", + "base_model.model.lin0.hada_w2_b.default", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.hada_w1_a.adapter1", + "base_model.model.lin1.hada_w1_b.adapter1", + "base_model.model.lin1.hada_w2_a.adapter1", + "base_model.model.lin1.hada_w2_b.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, 
+ "base_model.model.lin1.hada_w1_a.adapter1", + "base_model.model.lin1.hada_w1_b.adapter1", + "base_model.model.lin1.hada_w2_a.adapter1", + "base_model.model.lin1.hada_w2_b.adapter1", + ) + + def test_requires_grad_loha_same_targets(self): + # same as previous test, except that LoHa adapters target the same layer + config0 = LoHaConfig(target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = LoHaConfig(target_modules=["lin0"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hada_w1_a.default", + "base_model.model.lin0.hada_w1_b.default", + "base_model.model.lin0.hada_w2_a.default", + "base_model.model.lin0.hada_w2_b.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hada_w1_a.default", + "base_model.model.lin0.hada_w1_b.default", + "base_model.model.lin0.hada_w2_a.default", + "base_model.model.lin0.hada_w2_b.default", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hada_w1_a.adapter1", + "base_model.model.lin0.hada_w1_b.adapter1", + "base_model.model.lin0.hada_w2_a.adapter1", + "base_model.model.lin0.hada_w2_b.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hada_w1_a.adapter1", + "base_model.model.lin0.hada_w1_b.adapter1", + "base_model.model.lin0.hada_w2_a.adapter1", + "base_model.model.lin0.hada_w2_b.adapter1", + ) + + def test_requires_grad_lokr_different_targets(self): + # test two different LoKr adapters that target different modules + config0 = LoKrConfig(target_modules=["lin0"]) 
+ peft_model = get_peft_model(MLP(), config0) + + config1 = LoKrConfig(target_modules=["lin1"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lokr_w1.default", + "base_model.model.lin0.lokr_w2.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lokr_w1.default", + "base_model.model.lin0.lokr_w2.default", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.lokr_w1.adapter1", + "base_model.model.lin1.lokr_w2.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin1.lokr_w1.adapter1", + "base_model.model.lin1.lokr_w2.adapter1", + ) + + def test_requires_grad_lokr_same_targets(self): + # same as previous test, except that LoKr adapters target the same layer + config0 = LoKrConfig(target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = LoKrConfig(target_modules=["lin0"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lokr_w1.default", + "base_model.model.lin0.lokr_w2.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lokr_w1.default", + "base_model.model.lin0.lokr_w2.default", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lokr_w1.adapter1", + "base_model.model.lin0.lokr_w2.adapter1", + ) + + # disable all adapters + with 
peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.lokr_w1.adapter1", + "base_model.model.lin0.lokr_w2.adapter1", + ) + + def test_requires_grad_oft_different_targets(self): + # test two different OFT adapters that target different modules + config0 = OFTConfig(target_modules=["lin0"], r=2, oft_block_size=0) + peft_model = get_peft_model(MLP(), config0) + + config1 = OFTConfig(target_modules=["lin1"], r=2, oft_block_size=0) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_R.default.weight", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_R.default.weight", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.oft_R.adapter1.weight", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin1.oft_R.adapter1.weight", + ) + + def test_requires_grad_oft_same_targets(self): + # same as previous test, except that OFT adapters target the same layer + config0 = OFTConfig(target_modules=["lin0"], r=2, oft_block_size=0) + peft_model = get_peft_model(MLP(), config0) + + config1 = OFTConfig(target_modules=["lin0"], r=2, oft_block_size=0) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_R.default.weight", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + 
self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_R.default.weight", + ) + + # change activate adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_R.adapter1.weight", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.oft_R.adapter1.weight", + ) + + def test_requires_grad_hra_different_targets(self): + # test two different HRA adapters that target different modules + config0 = HRAConfig(target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = HRAConfig(target_modules=["lin1"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hra_u.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hra_u.default", + ) + + # change activate pter to pter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.hra_u.adapter1", + ) + + # disable all pters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin1.hra_u.adapter1", + ) + + def test_requires_grad_hra_same_targets(self): + # same as previous test, except that HRA adapters target the same layer + config0 = HRAConfig(target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = HRAConfig(target_modules=["lin0"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + 
"base_model.model.lin0.hra_u.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hra_u.default", + ) + + # change the active adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hra_u.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.hra_u.adapter1", + ) + + def test_requires_grad_bone_different_targets(self): + # test two different Bone adapters that target different modules + config0 = BoneConfig(target_modules=["lin0"], r=2) + peft_model = get_peft_model(MLP(), config0) + + config1 = BoneConfig(target_modules=["lin1"], r=2) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.bone_block.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.bone_block.default", + ) + + # change the active adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.bone_block.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin1.bone_block.adapter1", + ) + + def test_requires_grad_bone_same_targets(self): + # same as previous test, except that Bone adapters target the same layer + config0 = BoneConfig(target_modules=["lin0"], r=2) + peft_model = get_peft_model(MLP(), config0) + + config1 =
BoneConfig(target_modules=["lin0"], r=2) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.bone_block.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.bone_block.default", + ) + + # change the active adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.bone_block.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.bone_block.adapter1", + ) + + def test_requires_grad_miss_different_targets(self): + # test two different MiSS adapters that target different modules + config0 = MissConfig(target_modules=["lin0"], r=2) + peft_model = get_peft_model(MLP(), config0) + + config1 = MissConfig(target_modules=["lin1"], r=2) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.miss_block.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.miss_block.default", + ) + + # change the active adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.miss_block.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin1.miss_block.adapter1", + ) + + def test_requires_grad_miss_same_targets(self): + # same as previous
test, except that HRA adapters target the same layer + config0 = MissConfig(target_modules=["lin0"], r=2) + peft_model = get_peft_model(MLP(), config0) + + config1 = MissConfig(target_modules=["lin0"], r=2) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.miss_block.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.miss_block.default", + ) + + # change the active adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.miss_block.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.miss_block.adapter1", + ) + + def test_requires_grad_boft_different_targets(self): + # test two different BOFT adapters that target different modules + config0 = BOFTConfig(target_modules=["lin0"], boft_block_size=2) + peft_model = get_peft_model(MLP2(), config0) + + config1 = BOFTConfig(target_modules=["lin1"], boft_block_size=2) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.boft_R.default", + "base_model.model.lin0.boft_s.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.boft_R.default", + "base_model.model.lin0.boft_s.default", + ) + + # change the active adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.boft_R.adapter1", + "base_model.model.lin1.boft_s.adapter1", + ) + + # disable all
pters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin1.boft_R.adapter1", + "base_model.model.lin1.boft_s.adapter1", + ) + + def test_requires_grad_boft_same_targets(self): + # same as previous test, except that BOFT adapters target the same layer + config0 = BOFTConfig(target_modules=["lin1"], boft_block_size=2) + peft_model = get_peft_model(MLP(), config0) + + config1 = BOFTConfig(target_modules=["lin1"], boft_block_size=2) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin1.boft_R.default", + "base_model.model.lin1.boft_s.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.boft_R.default", + "base_model.model.lin1.boft_s.default", + ) + + # change the active adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.boft_R.adapter1", + "base_model.model.lin1.boft_s.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.boft_R.adapter1", + "base_model.model.lin1.boft_s.adapter1", + ) + + def test_requires_grad_lntuning_different_targets(self): + config0 = LNTuningConfig( + target_modules=["layernorm0"], + ) + peft_model = get_peft_model(MLP_LayerNorm(), config0) + + config1 = LNTuningConfig(target_modules=["layernorm1"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.layernorm0.ln_tuning_layers.default.weight",
+ "base_model.model.layernorm0.ln_tuning_layers.default.bias", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.layernorm0.ln_tuning_layers.default.weight", + "base_model.model.layernorm0.ln_tuning_layers.default.bias", + ) + + # change the active adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.layernorm1.ln_tuning_layers.adapter1.weight", + "base_model.model.layernorm1.ln_tuning_layers.adapter1.bias", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.layernorm1.ln_tuning_layers.adapter1.weight", + "base_model.model.layernorm1.ln_tuning_layers.adapter1.bias", + ) + + def test_requires_grad_lntuning_same_targets(self): + config0 = LNTuningConfig( + target_modules=["layernorm0"], + ) + peft_model = get_peft_model(MLP_LayerNorm(), config0) + + config1 = LNTuningConfig(target_modules=["layernorm0"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.layernorm0.ln_tuning_layers.default.weight", + "base_model.model.layernorm0.ln_tuning_layers.default.bias", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.layernorm0.ln_tuning_layers.default.weight", + "base_model.model.layernorm0.ln_tuning_layers.default.bias", + ) + + # change the active adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.layernorm0.ln_tuning_layers.adapter1.weight", + "base_model.model.layernorm0.ln_tuning_layers.adapter1.bias", + ) + + # disable all
adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.layernorm0.ln_tuning_layers.adapter1.weight", + "base_model.model.layernorm0.ln_tuning_layers.adapter1.bias", + ) + + def test_requires_grad_vera_different_targets(self): + # Test two different VeRA adapters that target different modules. Most notably, ensure that vera_A and vera_B + # don't require grads. + + # requires a model with at least 2 layers with the same shapes + class MLP2(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.relu = nn.ReLU() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.lin1 = nn.Linear(20, 20, bias=bias) # lin1 and lin2 have same shape + self.lin2 = nn.Linear(20, 20, bias=bias) + self.lin3 = nn.Linear(20, 2, bias=bias) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = X.float() + X = self.lin0(X) + X = self.relu(X) + X = self.lin1(X) + X = self.relu(X) + X = self.lin2(X) + X = self.relu(X) + X = self.lin3(X) + X = self.sm(X) + return X + + config0 = VeraConfig(target_modules=["lin1"]) + peft_model = get_peft_model(MLP2(), config0) + + config1 = VeraConfig(target_modules=["lin2"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin1.vera_lambda_b.default", + "base_model.model.lin1.vera_lambda_d.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.vera_lambda_b.default", + "base_model.model.lin1.vera_lambda_d.default", + ) + + # change the active adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin2.vera_lambda_b.adapter1", + "base_model.model.lin2.vera_lambda_d.adapter1", + )
+ + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin2.vera_lambda_b.adapter1", + "base_model.model.lin2.vera_lambda_d.adapter1", + ) + + def test_requires_grad_vera_same_targets(self): + # Test two different VeRA adapters that target the same module. Most notably, ensure that vera_A and vera_B + # don't require grads. + + # requires a model with at least 2 layers with the same shapes + class MLP2(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.relu = nn.ReLU() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.lin1 = nn.Linear(20, 20, bias=bias) # lin1 and lin2 have same shape + self.lin2 = nn.Linear(20, 20, bias=bias) + self.lin3 = nn.Linear(20, 2, bias=bias) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = X.float() + X = self.lin0(X) + X = self.relu(X) + X = self.lin1(X) + X = self.relu(X) + X = self.lin2(X) + X = self.relu(X) + X = self.lin3(X) + X = self.sm(X) + return X + + config0 = VeraConfig(target_modules=["lin1", "lin2"]) + peft_model = get_peft_model(MLP2(), config0) + + config1 = VeraConfig(target_modules=["lin1", "lin2"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin1.vera_lambda_b.default", + "base_model.model.lin1.vera_lambda_d.default", + "base_model.model.lin2.vera_lambda_b.default", + "base_model.model.lin2.vera_lambda_d.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.vera_lambda_b.default", + "base_model.model.lin1.vera_lambda_d.default", + "base_model.model.lin2.vera_lambda_b.default", + "base_model.model.lin2.vera_lambda_d.default", + ) + + # change the active adapter to adapter1 +
peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.vera_lambda_b.adapter1", + "base_model.model.lin1.vera_lambda_d.adapter1", + "base_model.model.lin2.vera_lambda_b.adapter1", + "base_model.model.lin2.vera_lambda_d.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin1.vera_lambda_b.adapter1", + "base_model.model.lin1.vera_lambda_d.adapter1", + "base_model.model.lin2.vera_lambda_b.adapter1", + "base_model.model.lin2.vera_lambda_d.adapter1", + ) + + def test_requires_grad_randlora_different_targets(self): + # Test two different RandLora adapters that target different modules. Most notably, ensure that randbasis_A and randbasis_B + # don't require grads. + + # requires a model with at least 2 layers with the same shapes + class MLP2(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.relu = nn.ReLU() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.lin1 = nn.Linear(20, 20, bias=bias) # lin1 and lin2 have same shape + self.lin2 = nn.Linear(20, 20, bias=bias) + self.lin3 = nn.Linear(20, 2, bias=bias) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = X.float() + X = self.lin0(X) + X = self.relu(X) + X = self.lin1(X) + X = self.relu(X) + X = self.lin2(X) + X = self.relu(X) + X = self.lin3(X) + X = self.sm(X) + return X + + config0 = RandLoraConfig(target_modules=["lin1"]) + peft_model = get_peft_model(MLP2(), config0) + + config1 = RandLoraConfig(target_modules=["lin2"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin1.randlora_lambda.default", + "base_model.model.lin1.randlora_gamma.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + 
self.check_requires_grad( + peft_model, + "base_model.model.lin1.randlora_lambda.default", + "base_model.model.lin1.randlora_gamma.default", + ) + + # change the active adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin2.randlora_lambda.adapter1", + "base_model.model.lin2.randlora_gamma.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin2.randlora_lambda.adapter1", + "base_model.model.lin2.randlora_gamma.adapter1", + ) + + def test_requires_grad_randlora_same_targets(self): + # Test two different RandLora adapters that target the same module. Most notably, ensure that randbasis_A and randbasis_B + # don't require grads. + + # requires a model with at least 2 layers with the same shapes + class MLP2(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.relu = nn.ReLU() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.lin1 = nn.Linear(20, 20, bias=bias) # lin1 and lin2 have same shape + self.lin2 = nn.Linear(20, 20, bias=bias) + self.lin3 = nn.Linear(20, 2, bias=bias) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = X.float() + X = self.lin0(X) + X = self.relu(X) + X = self.lin1(X) + X = self.relu(X) + X = self.lin2(X) + X = self.relu(X) + X = self.lin3(X) + X = self.sm(X) + return X + + config0 = RandLoraConfig(target_modules=["lin1", "lin2"]) + peft_model = get_peft_model(MLP2(), config0) + + config1 = RandLoraConfig(target_modules=["lin1", "lin2"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin1.randlora_lambda.default", + "base_model.model.lin1.randlora_gamma.default", + "base_model.model.lin2.randlora_lambda.default", +
"base_model.model.lin2.randlora_gamma.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.randlora_lambda.default", + "base_model.model.lin1.randlora_gamma.default", + "base_model.model.lin2.randlora_lambda.default", + "base_model.model.lin2.randlora_gamma.default", + ) + + # change the active adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.randlora_lambda.adapter1", + "base_model.model.lin1.randlora_gamma.adapter1", + "base_model.model.lin2.randlora_lambda.adapter1", + "base_model.model.lin2.randlora_gamma.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin1.randlora_lambda.adapter1", + "base_model.model.lin1.randlora_gamma.adapter1", + "base_model.model.lin2.randlora_lambda.adapter1", + "base_model.model.lin2.randlora_gamma.adapter1", + ) + + def test_requires_grad_vblora_different_targets(self): + # test two different VBLoRA adapters that target different modules + config0 = VBLoRAConfig(target_modules=["lin0"], vector_length=1, num_vectors=2) + peft_model = get_peft_model(MLP(), config0) + + config1 = VBLoRAConfig(target_modules=["lin1"], vector_length=1, num_vectors=2) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.vblora_logits_A.default", + "base_model.model.lin0.vblora_logits_B.default", + "base_model.model.lin0.vblora_vector_bank.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.vblora_logits_A.default", +
"base_model.model.lin0.vblora_logits_B.default", + "base_model.model.lin0.vblora_vector_bank.default", + ) + + # change the active adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.vblora_logits_A.adapter1", + "base_model.model.lin1.vblora_logits_B.adapter1", + "base_model.model.lin0.vblora_vector_bank.adapter1", # vblora_vector_bank is shared + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin1.vblora_logits_A.adapter1", + "base_model.model.lin1.vblora_logits_B.adapter1", + "base_model.model.lin0.vblora_vector_bank.adapter1", # vblora_vector_bank is shared + ) + + def test_requires_grad_vblora_same_targets(self): + # same as previous test, except that VBLoRA adapters target the same layer + config0 = VBLoRAConfig(target_modules=["lin0"], vector_length=1, num_vectors=2) + peft_model = get_peft_model(MLP(), config0) + + config1 = VBLoRAConfig(target_modules=["lin0"], vector_length=1, num_vectors=2) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.vblora_logits_A.default", + "base_model.model.lin0.vblora_logits_B.default", + "base_model.model.lin0.vblora_vector_bank.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.vblora_logits_A.default", + "base_model.model.lin0.vblora_logits_B.default", + "base_model.model.lin0.vblora_vector_bank.default", + ) + + # change the active adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.vblora_logits_A.adapter1", + "base_model.model.lin0.vblora_logits_B.adapter1", +
"base_model.model.lin0.vblora_vector_bank.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin0.vblora_logits_A.adapter1", + "base_model.model.lin0.vblora_logits_B.adapter1", + "base_model.model.lin0.vblora_vector_bank.adapter1", + ) + + def test_requires_grad_fourierft_different_targets(self): + # test two different fourierft adapters that target different modules + config0 = FourierFTConfig(n_frequency=10, target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = FourierFTConfig(n_frequency=10, target_modules=["lin1"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" + self.check_requires_grad( + peft_model, + "base_model.model.lin0.fourierft_spectrum.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.fourierft_spectrum.default", + ) + + # change the active adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin1.fourierft_spectrum.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + self.check_requires_grad( + peft_model, + "base_model.model.lin1.fourierft_spectrum.adapter1", + ) + + def test_requires_grad_fourierft_same_targets(self): + # same as previous test, except that FourierFT adapters target the same layer + config0 = FourierFTConfig(n_frequency=10, target_modules=["lin0"]) + peft_model = get_peft_model(MLP(), config0) + + config1 = FourierFTConfig(n_frequency=10, target_modules=["lin0"]) + peft_model.add_adapter("adapter1", config1) + + # active adapter is still "default" +
self.check_requires_grad( + peft_model, + "base_model.model.lin0.fourierft_spectrum.default", + ) + + # set config0 as active, should not change anything + peft_model.set_adapter("default") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.fourierft_spectrum.default", + ) + + # change the active adapter to adapter1 + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.fourierft_spectrum.adapter1", + ) + + # disable all adapters + with peft_model.disable_adapter(): + self.check_requires_grad(peft_model) + + # after context is exited, return to the previous state + peft_model.set_adapter("adapter1") + self.check_requires_grad( + peft_model, + "base_model.model.lin0.fourierft_spectrum.adapter1", + ) + + @pytest.mark.parametrize("config_cls", ALL_PEFT_CONFIG_CLASSES) + @pytest.mark.parametrize("is_trainable", [False, True]) # note: default is False + def test_loading_model_requires_grad_set_correctly(self, config_cls, is_trainable, tmp_path): + # Test that when loading PeftModel and then loading another adapter, the requires_grad is set correctly and + # is_trainable is respected.
+ # See #2759 + model = DeepMLP(size=256) # a size that works with all adapters + extra_kwargs = {} + if config_cls == IA3Config: + extra_kwargs["feedforward_modules"] = [] + config = config_cls(target_modules=["layers.0.lin0"], **extra_kwargs) + + if config_cls == TrainableTokensConfig: # TrainableTokens requires a different base model and config + model = ModelEmbConv1D() + config = config_cls(target_modules=["emb"], token_indices=[0, 2, 4]) + + model = get_peft_model(model, config) + model.save_pretrained(tmp_path) + del model + + model = DeepMLP(size=256) + if config_cls == TrainableTokensConfig: # TrainableTokens requires a different base + model = ModelEmbConv1D() + model = PeftModel.from_pretrained(model, tmp_path, is_trainable=is_trainable) + + if is_trainable: + for name, param in model.named_parameters(): + if ".default" in name: + assert param.requires_grad + else: + assert not param.requires_grad + else: + assert all(not p.requires_grad for p in model.parameters()) + + # load one more adapter; this adapter is not automatically activated + model.load_adapter(tmp_path, adapter_name="other", is_trainable=is_trainable) + if is_trainable: + for name, param in model.named_parameters(): + if ".default" in name: + assert param.requires_grad + else: + assert not param.requires_grad + else: + assert all(not p.requires_grad for p in model.parameters()) + + @pytest.mark.parametrize("config_cls", ALL_PEFT_CONFIG_CLASSES) + @pytest.mark.parametrize("is_trainable", [False, True]) # note: default is False + def test_loading_model_with_modules_to_save_requires_grad_set_correctly(self, config_cls, is_trainable, tmp_path): + # Same test as above, but with modules_to_save + if config_cls == TrainableTokensConfig: + pytest.skip(reason="Trainable tokens does not support modules_to_save") + + model = DeepMLP(size=256) # a size that works with all adapters + extra_kwargs = {} + if config_cls == IA3Config: + extra_kwargs["feedforward_modules"] = [] + # targeting the different
modules with modules_to_save: + config = config_cls(target_modules=["layers.0.lin0"], modules_to_save=["layers.0.lin1"], **extra_kwargs) + model = get_peft_model(model, config) + model.save_pretrained(tmp_path) + del model + + model = DeepMLP(size=256) + model = PeftModel.from_pretrained(model, tmp_path, is_trainable=is_trainable) + + if is_trainable: + for name, param in model.named_parameters(): + if ".default" in name: + assert param.requires_grad + else: + assert not param.requires_grad + else: + assert all(not p.requires_grad for p in model.parameters()) + + # load one more adapter + model.load_adapter(tmp_path, adapter_name="other", is_trainable=is_trainable) + if is_trainable: + for name, param in model.named_parameters(): + if ".default" in name: + assert param.requires_grad + else: + assert not param.requires_grad + else: + assert all(not p.requires_grad for p in model.parameters()) + + @pytest.mark.parametrize("is_trainable", [False, True]) # note: default is False + def test_loading_model_with_trainble_tokens_requires_grad_set_correctly(self, is_trainable, tmp_path): + model = ModelEmbConv1D() + # LoRA targets lin0 while trainable_token_indices targets "emb": + config = LoraConfig(target_modules=["lin0"], trainable_token_indices={"emb": [0]}) + model = get_peft_model(model, config) + model.save_pretrained(tmp_path) + del model + + model = ModelEmbConv1D() + model = PeftModel.from_pretrained(model, tmp_path, is_trainable=is_trainable) + + if is_trainable: + for name, param in model.named_parameters(): + if ".default" in name: + assert param.requires_grad + else: + assert not param.requires_grad + else: + assert all(not p.requires_grad for p in model.parameters()) + + # load one more adapter + model.load_adapter(tmp_path, adapter_name="other", is_trainable=is_trainable) + if is_trainable: + for name, param in model.named_parameters(): + if ".default" in name: + assert param.requires_grad + else: + assert not param.requires_grad + else: + assert all(not p.requires_grad
for p in model.parameters()) + + @pytest.mark.xfail(strict=True) + @pytest.mark.parametrize("config_cls", [LoraConfig]) # no need to check each method, they all fail + def test_loading_model_requires_grad_set_correctly_switch_inference_mode(self, config_cls, tmp_path): + # Same as test_loading_model_requires_grad_set_correctly but this time we first load with is_trainable=False and + # then with is_trainable=True. Loading the second adapter should not affect the requires_grad of the first + # adapter, but it does. The reason is that is_training/inference_mode is taken from the current PEFT config, but + # that config does not necessarily belong to the active adapter, creating a mismatch. + # When/If this is fixed, the check can be integrated into test_loading_model_requires_grad_set_correctly and + # this test can be deleted. + model = DeepMLP(size=256) # a size that works with all adapters + extra_kwargs = {} + config = config_cls(target_modules=["layers.0.lin0"]) + model = get_peft_model(model, config) + model.save_pretrained(tmp_path) + del model + + model = DeepMLP(size=256) + model = PeftModel.from_pretrained(model, tmp_path, is_trainable=False) + assert all(not p.requires_grad for p in model.parameters()) + + # load one more adapter; this adapter is not automatically activated + model.load_adapter(tmp_path, adapter_name="other", is_trainable=True) + params_with_grad = [n for n, p in model.named_parameters() if p.requires_grad] + expected = [ + "base_model.model.layers.0.lin0.lora_A.other.weight", + "base_model.model.layers.0.lin0.lora_B.other.weight", + ] + # this fails, instead we get ...lora_A.default.weight and ...lora_B.default.weight + assert params_with_grad == expected + + @pytest.mark.xfail(strict=True) + @pytest.mark.parametrize("config_cls", [LoraConfig]) # no need to check each method, they all fail + def test_loading_model_requires_grad_load_adapter_then_add_adapter(self, config_cls, tmp_path): + # When adding a new adapter with
model.add_adapter, through the set_adapter call in update_layer, we activate + # the gradients of the first adapter, even if it's not desired. Since there is no is_trainable argument on + # add_adapter, there is no way to disable that at the moment. + # When/If this is fixed, the check can be integrated into test_loading_model_requires_grad_set_correctly and + # this test can be deleted. + model = DeepMLP(size=256) # a size that works with all adapters + extra_kwargs = {} + config = config_cls(target_modules=["layers.0.lin0"]) + model = get_peft_model(model, config) + model.save_pretrained(tmp_path) + del model + + model = DeepMLP(size=256) + model = PeftModel.from_pretrained(model, tmp_path, is_trainable=False) + assert all(not p.requires_grad for p in model.parameters()) + + # add a new adapter + model.add_adapter(adapter_name="other", peft_config=config) + params_with_grad = [n for n, p in model.named_parameters() if p.requires_grad] + assert all(not p.requires_grad for p in model.parameters()) + + +# this is for PEFT methods that support mixed adapter batches. +MIXED_ADAPTER_TEST_CASES = [ + ( + "LoRA mixed adapter", + LoraConfig(target_modules=["lin0"], init_lora_weights=False), + LoraConfig(target_modules=["lin0"], r=16, init_lora_weights=False), + ), + ( + "RoAd mixed adapter", + RoadConfig(target_modules=["lin0"], group_size=2, init_weights=False), + RoadConfig(target_modules=["lin0"], group_size=2, variant="road_2", init_weights=False), + ), +] + + +class TestMixedAdapterBatches: + torch_device = infer_device() + + def get_mlp_peft(self, config0, config1): + """A simple MLP with 2 adapters ("adapter0" from config0, "adapter1" from config1)""" + torch.manual_seed(0) + + base_model = MLP().to(self.torch_device).eval() + peft_model = get_peft_model(base_model, config0, "adapter0").eval() + peft_model.add_adapter("adapter1", config1) + return peft_model + + def run_checks(self, model, inputs): + # This checks that we can have mixed adapters in a single batch.
The test works by creating the outputs for the + # base model, adapter 0, and adapter 1 separately. Then, we create an output with mixed adapters, where the + # sample [0, 3, 6] are for the base model, [1, 4, 7] for adapter 0, and [2, 5, 8] for adapter 1. Finally, we + # check that the outputs of the mixed batch are correct for the corresponding indices. + adapter_name0, adapter_name1 = model.peft_config.keys() + + with model.disable_adapter(): + output_base = model(**inputs) + + model.set_adapter(adapter_name0) + output0 = model(**inputs) + + # sanity check, outputs are not the same + assert not torch.allclose(output_base, output0) + + model.set_adapter(adapter_name1) + output1 = model(**inputs) + + # sanity check, outputs have the right shape and are not the same + assert len(output_base) >= 3 + assert len(output_base) == len(output0) == len(output1) + assert not torch.allclose(output_base, output0) + assert not torch.allclose(output_base, output1) + + # set adapter_indices so that it alternates between base, adapter 0, and adapter 1 + adapters = ["__base__", adapter_name0, adapter_name1] + inputs["adapter_names"] = [adapters[i % 3] for i in (range(len(inputs["X"])))] + output_mixed = model.forward(**inputs) + + assert torch.allclose(output_base[::3], output_mixed[::3]) + assert torch.allclose(output0[1::3], output_mixed[1::3]) + assert torch.allclose(output1[2::3], output_mixed[2::3]) + + @pytest.mark.parametrize("test_name, config0, config1", MIXED_ADAPTER_TEST_CASES) + def test_mixed_adapter_batches_mlp(self, test_name, config0, config1): + mlp_peft = self.get_mlp_peft(config0, config1) + inputs = {"X": torch.arange(90).view(-1, 10).to(self.torch_device)} + self.run_checks(mlp_peft, inputs) + + @pytest.mark.parametrize( + "test_name, config0, config1", + [ + ( + "LoRA mixed adapter with different target layers", + LoraConfig(target_modules=["lin0"], init_lora_weights=False), + LoraConfig(target_modules=["lin1"], init_lora_weights=False), + ), + ( + "RoAd mixed 
adapter with different target layers", + RoadConfig(target_modules=["lin0"], group_size=2, init_weights=False), + RoadConfig(target_modules=["lin1"], group_size=2, init_weights=False), + ), + ], + ) + def test_mixed_adapter_batches_different_target_layers(self, test_name, config0, config1): + base_model = MLP().to(self.torch_device).eval() + peft_model = get_peft_model(base_model, config0, "adapter0").eval() + peft_model.add_adapter("adapter1", config1) + inputs = {"X": torch.arange(90).view(-1, 10).to(self.torch_device)} + self.run_checks(peft_model, inputs) + + @pytest.mark.parametrize( + "test_name, config0, config1", + [ + ( + "LoRA mixed adapter with modules to save", + LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"], init_lora_weights=False), + LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"], init_lora_weights=False), + ), + ( + "RoAd mixed adapter with modules to save", + RoadConfig(target_modules=["lin0"], modules_to_save=["lin1"], group_size=2, init_weights=False), + RoadConfig(target_modules=["lin0"], modules_to_save=["lin1"], group_size=2, init_weights=False), + ), + ], + ) + def test_mixed_adapter_batches_multiple_modules_to_save(self, test_name, config0, config1): + base_model = MLP().to(self.torch_device).eval() + peft_model = get_peft_model(base_model, config0, "adapter0").eval() + peft_model.add_adapter("adapter1", config1) + inputs = {"X": torch.arange(90).view(-1, 10).to(self.torch_device)} + self.run_checks(peft_model, inputs) + + @pytest.mark.parametrize( + "test_name, config0, config1", + [ + ( + "LoRA mixed adapter with unsupported layer", + LoraConfig(target_modules=["lin0"], modules_to_save=["gru"], init_lora_weights=False), + LoraConfig(target_modules=["lin0"], modules_to_save=["gru"], init_lora_weights=False), + ), + ], + ) + def test_mixed_adapter_batches_unsupported_layer_raises(self, test_name, config0, config1): + base_model = MLPWithGRU().to(self.torch_device).eval() + peft_model = get_peft_model(base_model, 
config0, "adapter0").eval() + peft_model.add_adapter("adapter1", config1) + inputs = {"X": torch.arange(90).view(-1, 10).to(self.torch_device)} + SUPPORTED_MODULES = (torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d) + module_names = ", ".join([module.__name__ for module in SUPPORTED_MODULES]) + with pytest.raises( + TypeError, match=f"Mixed batching is only supported for the following modules: {module_names}." + ): + self.run_checks(peft_model, inputs) + + @pytest.mark.parametrize( + "test_name, config0, config1", + [ + ( + "LoRA mixed adapter with overlapping layers", + LoraConfig(target_modules=["lin0"], init_lora_weights=False), + LoraConfig(target_modules=["lin0", "lin1"], init_lora_weights=False), + ), + ( + "RoAd mixed adapter with overlapping layers", + RoadConfig(target_modules=["lin0"], group_size=2, init_weights=False), + RoadConfig(target_modules=["lin0", "lin1"], group_size=2, init_weights=False), + ), + ], + ) + def test_mixed_adapter_batches_partly_overlapping_target_layers(self, test_name, config0, config1): + base_model = MLP().to(self.torch_device).eval() + # target different lora layers + peft_model = get_peft_model(base_model, config0, "adapter0").eval() + peft_model.add_adapter("adapter1", config1) + + inputs = {"X": torch.arange(90).view(-1, 10).to(self.torch_device)} + self.run_checks(peft_model, inputs) + + @pytest.mark.parametrize( + "test_name, config0, config1", + [ + ( + "LoRA mixed adapter with conv1d", + LoraConfig(target_modules=["emb", "conv1d"], init_lora_weights=False), + LoraConfig(target_modules=["emb", "conv1d"], r=16, init_lora_weights=False), + ), + ], + ) + def test_mixed_adapter_batches_lora_conv1d_emb(self, test_name, config0, config1): + base_model = ModelEmbConv1D().to(self.torch_device).eval() + peft_model = get_peft_model(base_model, config0, "adapter0").eval() + peft_model.add_adapter("adapter1", config1) + + inputs = {"X": torch.arange(90).view(-1, 10).to(self.torch_device)} + 
self.run_checks(peft_model, inputs) + + @pytest.mark.parametrize( + "test_name, config0, config1", + [ + ( + "LoRA mixed adapter with conv1d and emb and modules to save", + LoraConfig(target_modules=["emb", "conv1d"], modules_to_save=["lin0"], init_lora_weights=False), + LoraConfig(target_modules=["emb", "conv1d"], modules_to_save=["lin0"], init_lora_weights=False), + ), + ], + ) + def test_mixed_adapter_batches_lora_conv1d_emb_multiple_modules_to_save(self, test_name, config0, config1): + base_model = ModelEmbConv1D().to(self.torch_device).eval() + peft_model = get_peft_model(base_model, config0, "adapter0").eval() + peft_model.add_adapter("adapter1", config1) + inputs = {"X": torch.arange(90).view(-1, 10).to(self.torch_device)} + self.run_checks(peft_model, inputs) + + @pytest.mark.parametrize( + "test_name, config0, config1", + [ + ( + "LoRA mixed adapter with conv2d", + LoraConfig(target_modules=["conv2d"], init_lora_weights=False), + LoraConfig(target_modules=["conv2d"], r=16, init_lora_weights=False), + ), + ], + ) + def test_mixed_adapter_batches_lora_conv2d(self, test_name, config0, config1): + base_model = ModelConv2D().to(self.torch_device).eval() + peft_model = get_peft_model(base_model, config0, "adapter0").eval() + peft_model.add_adapter("adapter1", config1) + + inputs = {"X": torch.arange(270).view(6, 5, 3, 3).to(self.torch_device)} + self.run_checks(peft_model, inputs) + + @pytest.mark.parametrize( + "test_name, config0, config1", + [ + ( + "LoRA mixed adapter with mha", + LoraConfig(target_modules=["mha"], init_lora_weights=False), + LoraConfig(target_modules=["mha"], r=16, init_lora_weights=False), + ), + ], + ) + def test_mixed_adapter_batches_mha_raises(self, test_name, config0, config1): + base_model = ModelMha().to(self.torch_device).eval() + peft_model = get_peft_model(base_model, config0, "adapter0").eval() + peft_model.add_adapter("adapter1", config1) + + inputs = {"X": torch.arange(90).view(-1, 10).to(self.torch_device)} + msg = 
"lora.MultiheadAttention does not support mixed adapter batches" + with pytest.raises(TypeError, match=msg): + self.run_checks(peft_model, inputs) + + @pytest.mark.parametrize("test_name, config0, config1", MIXED_ADAPTER_TEST_CASES) + def test_mixed_adapter_batches_length_mismatch_raises(self, test_name, config0, config1): + mlp_peft = self.get_mlp_peft(config0, config1) + inputs = { + "X": torch.arange(90).view(-1, 10).to(self.torch_device), + "adapter_names": ["__base__"] * 5, # wrong length! + } + msg = r"Length of `adapter_names` should be the same as the number of inputs, but got " + with pytest.raises(ValueError, match=msg): + mlp_peft.forward(**inputs) + + @pytest.mark.parametrize("test_name, config0, config1", MIXED_ADAPTER_TEST_CASES) + def test_mixed_adapter_batches_training_mode_raises(self, test_name, config0, config1): + mlp_peft = self.get_mlp_peft(config0, config1) + inputs = { + "X": torch.arange(90).view(-1, 10).to(self.torch_device), + "adapter_names": ["__base__"] * 9, + } + mlp_peft = mlp_peft.train() + msg = r"Cannot pass `adapter_names` when the model is in training mode." 
+ with pytest.raises(ValueError, match=msg): + mlp_peft.forward(**inputs) + + @pytest.mark.parametrize("test_name, config0, config1", MIXED_ADAPTER_TEST_CASES) + def test_mixed_adapter_batches_disabled(self, test_name, config0, config1): + # Disabling adapters should have precedence over passing adapter names + mlp_peft = self.get_mlp_peft(config0, config1) + inputs = {"X": torch.arange(90).view(-1, 10).to(self.torch_device)} + with mlp_peft.disable_adapter(): + output_disabled = mlp_peft(**inputs) + + adapters = ["__base__", "adapter0", "adapter1"] + inputs["adapter_names"] = [adapters[i % 3] for i in (range(len(inputs["X"])))] + with mlp_peft.disable_adapter(): + output_mixed = mlp_peft.forward(**inputs) + + assert torch.allclose(output_disabled, output_mixed) + + @pytest.mark.parametrize("test_name, config0, config1", MIXED_ADAPTER_TEST_CASES) + def test_mixed_adapter_batches_merged_raises(self, test_name, config0, config1): + # When there are merged adapters, passing adapter names should raise an error + mlp_peft = self.get_mlp_peft(config0, config1) + inputs = { + "X": torch.arange(90).view(-1, 10).to(self.torch_device), + "adapter_names": ["adapter0"] * 9, + } + mlp_peft.merge_adapter(["adapter0"]) + msg = r"Cannot pass `adapter_names` when there are merged adapters, please call `unmerge_adapter` first." 
+ with pytest.raises(ValueError, match=msg): + mlp_peft.forward(**inputs) + + @pytest.mark.parametrize( + "test_name, config", + [ + ( + "LoRA mixed batch wrong adapter name", + LoraConfig(target_modules=["lin0"], init_lora_weights=False), + ), + ( + "RoAD mixed batch wrong adapter name", + RoadConfig(target_modules=["lin0"], group_size=2, init_weights=False), + ), + ], + ) + def test_mixed_adapter_batches_lora_wrong_adapter_name_raises(self, test_name, config): + # Ensure that all of the adapter names that are being passed actually exist + torch.manual_seed(0) + x = torch.arange(90).view(-1, 10).to(self.torch_device) + + base_model = MLP().to(self.torch_device).eval() + peft_model = get_peft_model(base_model, config).eval() + peft_model.add_adapter(adapter_name="other", peft_config=config) + + # sanity check: this works + peft_model.forward(x, adapter_names=["default"] * 5 + ["other"] * 4) + + # check one correct and one incorrect adapter + msg = re.escape("Trying to infer with non-existing adapter(s): does-not-exist") + with pytest.raises(ValueError, match=msg): + peft_model.forward(x, adapter_names=["default"] * 5 + ["does-not-exist"] * 4) + + # check two correct adapters and one incorrect adapter + with pytest.raises(ValueError, match=msg): + peft_model.forward(x, adapter_names=["default"] * 3 + ["does-not-exist"] * 4 + ["other"] * 2) + + # check only incorrect adapters + msg = re.escape("Trying to infer with non-existing adapter(s): does-not-exist, other-does-not-exist") + with pytest.raises(ValueError, match=msg): + peft_model.forward(x, adapter_names=["does-not-exist"] * 5 + ["other-does-not-exist"] * 4) + + def test_mixed_adapter_batches_lora_with_dora_raises(self): + # When there are DoRA adapters, passing adapter names should raise an error + torch.manual_seed(0) + inputs = { + "X": torch.arange(90).view(-1, 10).to(self.torch_device), + "adapter_names": ["default"] * 9, + } + + base_model = MLP().to(self.torch_device).eval() + config = 
LoraConfig(target_modules=["lin0"], init_lora_weights=False, use_dora=True) + peft_model = get_peft_model(base_model, config).eval() + msg = r"Cannot pass `adapter_names` when DoRA is enabled." + with pytest.raises(ValueError, match=msg): + peft_model.forward(**inputs) + + def test_mixed_adapter_batches_lora_with_dora_but_dora_not_included_works(self): + # When there are DoRA adapters, passing adapter names should raise an error, see previous test. However, when + # the adapter that uses DoRA is not included in adapter_names, it's actually fine. + torch.manual_seed(0) + base_model = MLP().to(self.torch_device).eval() + config_dora = LoraConfig(target_modules=["lin0"], init_lora_weights=False, use_dora=True) + peft_model = get_peft_model(base_model, config_dora) + config_no_dora = LoraConfig(target_modules=["lin0"], init_lora_weights=False, use_dora=False) + peft_model.add_adapter(adapter_name="other", peft_config=config_no_dora) + peft_model.eval() + + # The "default" adapter uses DoRA but "other" is not using it, so using "other" is fine. Also, "__base__" is + # fine since it uses the base model and thus DoRA is not involved either. 
+ inputs = { + "X": torch.arange(90).view(-1, 10).to(self.torch_device), + "adapter_names": ["other"] * 4 + ["__base__"] * 5, + } + peft_model.forward(**inputs) + + @pytest.mark.parametrize( + "test_name, config0, config1, factor", + [ + ( + "LoRA mixed adapter timing", + LoraConfig(task_type="CAUSAL_LM", init_lora_weights=False), + LoraConfig(task_type="CAUSAL_LM", r=16, init_lora_weights=False), + 2.0, + ), + ( + "RoAd mixed adapter timing", + RoadConfig(task_type="CAUSAL_LM", init_weights=False), + RoadConfig(task_type="CAUSAL_LM", variant="road_2", init_weights=False), + 3.0, + ), + ], + ) + @require_non_cpu + def test_mixed_adapter_batches_lora_opt_timing(self, test_name, config0, config1, factor): + # Use a more realistic model (opt-125m) and do a simple runtime check to ensure that mixed adapter batches + # don't add too much overhead. These types of tests are inherently flaky, so we try to add in some robustness. + logs = [] # store the time it takes to run each forward pass here + + @contextmanager + def timed(): + tic = time.perf_counter() + yield + toc = time.perf_counter() + logs.append(toc - tic) + + base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").to(self.torch_device).eval() + inputs = {"input_ids": torch.randint(0, 1000, (16, 64)).to(self.torch_device)} + with timed(): + output_base = base_model(**inputs).logits + + peft_model = get_peft_model(base_model, config0, "adapter1").eval() + with timed(): + output0 = peft_model(**inputs).logits + + # sanity check, outputs are not the same + assert not torch.allclose(output_base, output0) + + peft_model.add_adapter("adapter2", config1) + peft_model.set_adapter("adapter2") + with timed(): + output1 = peft_model(**inputs).logits + + # sanity check, outputs are not the same + assert not torch.allclose(output_base, output1) + + # set adapter_indices so that it alternates between 0 (base), lora 1, and lora 2 + adapters = ["__base__", "adapter1", "adapter2"] + inputs["adapter_names"] = 
[adapters[i % 3] for i in (range(len(inputs["input_ids"])))] + with timed(): + output_mixed = peft_model.forward(**inputs).logits + + atol, rtol = 1e-4, 1e-4 + assert torch.allclose(output_base[::3], output_mixed[::3], atol=atol, rtol=rtol) + assert torch.allclose(output0[1::3], output_mixed[1::3], atol=atol, rtol=rtol) + assert torch.allclose(output1[2::3], output_mixed[2::3], atol=atol, rtol=rtol) + + # Check that the overhead in time added by mixed batches is not too high. + # To prevent flakiness, we measure mixed inference 3 times and take the lowest value, then compare it to the mean + # of the non-mixed inference times. We also grant a generous margin of 2x the mean time. + with timed(): + output_mixed = peft_model.forward(**inputs).logits + with timed(): + output_mixed = peft_model.forward(**inputs).logits + + time_base, time0, time1, *time_mixed = logs + time_non_mixed = (time_base + time0 + time1) / 3 + time_mixed = min(time_mixed) + + assert time_mixed < factor * time_non_mixed + + # Measure timing of running base and adapter separately vs using a mixed batch. Note that on CPU, the + # differences are quite small, so this test requires GPU to avoid flakiness. + for _ in range(3): + with timed(): + with peft_model.disable_adapter(): + peft_model(**{k: v[::3] for k, v in inputs.items()}) + peft_model.set_adapter("adapter1") + peft_model(**{k: v[1::3] for k, v in inputs.items()}) + peft_model.set_adapter("adapter2") + peft_model(**{k: v[2::3] for k, v in inputs.items()}) + + times_separate = logs[-3:] + time_separate = sum(times_separate) / 3 + assert time_separate > time_mixed + + +class TestDynamicDispatch: + # These are tests for the dynamic dispatch feature for LoRA. We create a custom module and a custom LoRA layer + # that targets it. + + @pytest.fixture(scope="class") + def custom_module_cls(self): + class MyModule(nn.Module): + # A custom layer that just behaves like an nn.Linear layer but is not an instance of nn.Linear. 
Therefore, + # it would normally fail to be targeted. + def __init__(self): + super().__init__() + self.in_features = 10 + self.out_features = 20 + self.weight = nn.Parameter(torch.randn(20, 10)) + + def forward(self, x): + return nn.functional.linear(x, self.weight) + + return MyModule + + @pytest.fixture(scope="class") + def custom_lora_cls(self): + from peft.tuners import lora + + class MyLora(lora.Linear): + # just re-use the lora.Linear code here + pass + + return MyLora + + @pytest.fixture(scope="class") + def model_cls(self, custom_module_cls): + class MyModel(nn.Module): + def __init__(self): + super().__init__() + self.lin0 = nn.Linear(10, 10) + self.relu = nn.ReLU() + self.my_module = custom_module_cls() + self.lin1 = nn.Linear(20, 2) + + def forward(self, x): + x = self.relu(self.lin0(x)) + x = self.relu(self.my_module(x)) + x = self.lin1(x) + return x + + return MyModel + + def test_custom_lora_layer_used(self, custom_module_cls, custom_lora_cls, model_cls): + # check that when we register custom lora layers, they are indeed being used for the intended module + model = model_cls() + config = LoraConfig(target_modules=["lin0", "my_module", "lin1"]) + config._register_custom_module({custom_module_cls: custom_lora_cls}) + + peft_model = get_peft_model(model, config) + assert isinstance(peft_model.base_model.model.my_module, custom_lora_cls) + assert isinstance(peft_model.base_model.model.my_module.base_layer, custom_module_cls) + # sanity check that the other lora layer types are still the default ones + assert not isinstance(peft_model.base_model.model.lin0.base_layer, custom_module_cls) + assert not isinstance(peft_model.base_model.model.lin1.base_layer, custom_module_cls) + + def test_training_works(self, model_cls, custom_module_cls, custom_lora_cls): + # check that when we train with custom lora layers, they are indeed updated + model = model_cls() + config = LoraConfig(target_modules=["lin0", "my_module", "lin1"]) + 
config._register_custom_module({custom_module_cls: custom_lora_cls}) + + peft_model = get_peft_model(model, config) + sd_before = copy.deepcopy(peft_model.state_dict()) + inputs = torch.randn(16, 10) + optimizer = torch.optim.SGD(peft_model.parameters(), lr=1e-4) + + for _ in range(5): + optimizer.zero_grad() + output = peft_model(inputs) + loss = output.sum() ** 2 + loss.backward() + optimizer.step() + + sd_after = peft_model.state_dict() + + # sanity check that for finite results, since nan != nan, which would make the test pass trivially + for val in sd_before.values(): + assert torch.isfinite(val).all() + for val in sd_after.values(): + assert torch.isfinite(val).all() + + assert not torch.allclose( + sd_before["base_model.model.my_module.lora_A.default.weight"], + sd_after["base_model.model.my_module.lora_A.default.weight"], + ) + assert not torch.allclose( + sd_before["base_model.model.my_module.lora_B.default.weight"], + sd_after["base_model.model.my_module.lora_B.default.weight"], + ) + + def test_saving_and_loading(self, custom_module_cls, custom_lora_cls, model_cls, tmp_path): + # check that we can successfully save and load the custom lora cls + torch.manual_seed(0) + model = model_cls() + config = LoraConfig(target_modules=["lin0", "my_module", "lin1"]) + config._register_custom_module({custom_module_cls: custom_lora_cls}) + + torch.manual_seed(1) + peft_model = get_peft_model(model, config) + + inputs = torch.randn(5, 10) + outputs_before = peft_model(inputs) # does not raise + + sd_before = peft_model.state_dict() + peft_model.save_pretrained(tmp_path / "lora-custom-module") + del model, peft_model + + torch.manual_seed(0) # same seed for base model + model = model_cls() + + # custom lora mapping is not persisted at the moment, so as a workaround this is needed + config = LoraConfig.from_pretrained(tmp_path / "lora-custom-module") + config._register_custom_module({custom_module_cls: custom_lora_cls}) + + # different seed for adapter to ensure it is 
not identical just because of seed + torch.manual_seed(123) + peft_model = PeftModel.from_pretrained(model, tmp_path / "lora-custom-module", config=config) + assert isinstance(peft_model.base_model.model.my_module, custom_lora_cls) + assert isinstance(peft_model.base_model.model.my_module.base_layer, custom_module_cls) + + outputs_after = peft_model(inputs) # does not raise + assert torch.allclose(outputs_before, outputs_after) + + sd_after = peft_model.state_dict() + assert sd_before.keys() == sd_after.keys() + for key in sd_before.keys(): + assert torch.allclose(sd_before[key], sd_after[key]) + + def test_override_lora_linear(self, custom_lora_cls): + # in this test, we check if users can override default PEFT behavior by supplying a custom lora class that is + # being used instead of lora.Linear + model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") + config = LoraConfig(task_type=TaskType.CAUSAL_LM) + config._register_custom_module({nn.Linear: custom_lora_cls}) + peft_model = get_peft_model(model, config) + layers = peft_model.base_model.model.model.decoder.layers + for layer in layers: + assert isinstance(layer.self_attn.v_proj, custom_lora_cls) + assert isinstance(layer.self_attn.q_proj, custom_lora_cls) + + def test_custom_lora_layer_issues_warning(self, custom_module_cls, custom_lora_cls, model_cls, recwarn): + # users will get a warning if they target a layer type that is not officially supported + model = model_cls() + config = LoraConfig(target_modules=["lin0", "my_module", "lin1"]) + config._register_custom_module({custom_module_cls: custom_lora_cls}) + + get_peft_model(model, config) + # check warning message + msg = ( + "Unsupported layer type '.MyModule'>' encountered, proceed at your own risk." + ) + assert str(recwarn.list[-1].message) == msg + + def test_target_layer_without_in_features_out_features(self, recwarn): + # It should be possible for users to target layers even if we cannot determine in_features and out_features. 
+ # Those are only needed to initialize the LoRA layer via update_layer, so as long as users take care of that, + # they should be good and not require those attributes to exist + from peft.tuners import lora + + class MyModel(nn.Module): + def __init__(self): + super().__init__() + self.lstm = nn.LSTM(10, 20) + + class MyLora(nn.Module, lora.LoraLayer): + def __init__(self, base_layer, adapter_name, **kwargs): + super().__init__() + lora.LoraLayer.__init__(self, base_layer, **kwargs) + self._active_adapter = adapter_name + + model = MyModel() + # check that in_features and out_features attributes don't exist on LSTM + assert not hasattr(model.lstm, "in_features") + assert not hasattr(model.lstm, "out_features") + + config = LoraConfig(target_modules=["lstm"]) + config._register_custom_module({nn.LSTM: MyLora}) + peft_model = get_peft_model(model, config) + + # check that custom LoRA layer is correctly applied + assert isinstance(peft_model.base_model.lstm, MyLora) + assert isinstance(peft_model.base_model.lstm.base_layer, nn.LSTM) + + # we should still get a warning message + msg = "Unsupported layer type '' encountered, proceed at your own risk." + assert str(recwarn.list[-1].message) == msg diff --git a/peft/tests/test_decoder_models.py b/peft/tests/test_decoder_models.py new file mode 100644 index 0000000000000000000000000000000000000000..c0e3710194e0868000698462c3390fd8c37cd0d4 --- /dev/null +++ b/peft/tests/test_decoder_models.py @@ -0,0 +1,817 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import platform +import tempfile +from unittest.mock import Mock, call, patch + +import pytest +import torch +from safetensors.torch import load_file as safe_load_file +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + DataCollatorForLanguageModeling, + Trainer, + TrainingArguments, +) + +from peft import ( + AdaLoraConfig, + BOFTConfig, + BoneConfig, + C3AConfig, + CPTConfig, + FourierFTConfig, + HRAConfig, + IA3Config, + LoraConfig, + MissConfig, + OFTConfig, + PrefixTuningConfig, + PromptEncoderConfig, + PromptTuningConfig, + PromptTuningInit, + RoadConfig, + ShiraConfig, + VBLoRAConfig, + VeraConfig, + WaveFTConfig, + get_peft_model, +) + +from .testing_common import PeftCommonTester +from .testing_utils import device_count, hub_online_once, load_dataset_english_quotes, set_init_weights_false + + +PEFT_DECODER_MODELS_TO_TEST = [ + "hf-internal-testing/tiny-random-OPTForCausalLM", + "hf-internal-testing/tiny-random-GPT2LMHeadModel", + "hf-internal-testing/tiny-random-BloomForCausalLM", + "hf-internal-testing/tiny-random-gpt_neo", + "hf-internal-testing/tiny-random-GPTJForCausalLM", + "hf-internal-testing/tiny-random-GPTBigCodeForCausalLM", + "trl-internal-testing/tiny-random-LlamaForCausalLM", + "peft-internal-testing/tiny-dummy-qwen2", + "hf-internal-testing/tiny-random-Gemma3ForCausalLM", +] + +SMALL_GRID_MODELS = [ + "hf-internal-testing/tiny-random-gpt2", + "hf-internal-testing/tiny-random-OPTForCausalLM", + "hf-internal-testing/tiny-random-MistralForCausalLM", + "peft-internal-testing/tiny-dummy-qwen2", + "trl-internal-testing/tiny-random-LlamaForCausalLM", +] + + +# TODO Missing from this list are LoKr, LoHa, LN Tuning, add them +# Note: If the PEFT method offers an initialization option to make it an identity transform (typically via the +# init_weights argument), then this option should be set here, if it's not already the 
default. +ALL_CONFIGS = [ + ( + AdaLoraConfig, + { + "task_type": "CAUSAL_LM", + "target_modules": None, + "total_step": 1, + }, + ), + ( + BOFTConfig, + { + "task_type": "CAUSAL_LM", + "target_modules": None, + }, + ), + ( + BoneConfig, + { + "task_type": "CAUSAL_LM", + "target_modules": None, + "r": 2, + }, + ), + ( + MissConfig, + { + "task_type": "CAUSAL_LM", + "target_modules": None, + "r": 2, + }, + ), + ( + CPTConfig, + { + "task_type": "CAUSAL_LM", + "cpt_token_ids": [0, 1, 2, 3, 4, 5, 6, 7], # Example token IDs for testing + "cpt_mask": [1, 1, 1, 1, 1, 1, 1, 1], + "cpt_tokens_type_mask": [1, 2, 2, 2, 3, 3, 4, 4], + }, + ), + ( + FourierFTConfig, + { + "task_type": "CAUSAL_LM", + "n_frequency": 10, + "target_modules": None, + }, + ), + ( + HRAConfig, + { + "task_type": "CAUSAL_LM", + "target_modules": None, + }, + ), + ( + IA3Config, + { + "task_type": "CAUSAL_LM", + "target_modules": None, + "feedforward_modules": None, + }, + ), + ( + LoraConfig, + { + "task_type": "CAUSAL_LM", + "r": 8, + "lora_alpha": 32, + "target_modules": None, + "lora_dropout": 0.05, + "bias": "none", + }, + ), + # Activated LoRA (aLoRA) + ( + LoraConfig, + { + "task_type": "CAUSAL_LM", + "r": 8, + "lora_alpha": 32, + "target_modules": None, + "lora_dropout": 0.05, + "bias": "none", + "alora_invocation_tokens": [1], + }, + ), + ( + LoraConfig, + { + "task_type": "CAUSAL_LM", + "r": 8, + "lora_alpha": 32, + "target_modules": None, + "lora_dropout": 0.05, + "bias": "none", + # not one test input sequence will ever have this token, this should do nothing at all + "alora_invocation_tokens": [1000], + }, + ), + # LoRA + trainable tokens + ( + LoraConfig, + { + "task_type": "CAUSAL_LM", + "r": 8, + "lora_alpha": 32, + "target_modules": None, + "lora_dropout": 0.05, + "bias": "none", + "trainable_token_indices": [0, 1, 3], + }, + ), + ( + OFTConfig, + { + "task_type": "CAUSAL_LM", + "target_modules": None, + }, + ), + ( + PrefixTuningConfig, + { + "task_type": "CAUSAL_LM", + 
"num_virtual_tokens": 10, + }, + ), + ( + PromptEncoderConfig, + { + "task_type": "CAUSAL_LM", + "num_virtual_tokens": 10, + "encoder_hidden_size": 32, + }, + ), + ( + PromptTuningConfig, + { + "task_type": "CAUSAL_LM", + "num_virtual_tokens": 10, + }, + ), + ( + RoadConfig, + { + "task_type": "CAUSAL_LM", + "variant": "road_1", + "group_size": 2, + }, + ), + ( + ShiraConfig, + { + "r": 1, + "task_type": "CAUSAL_LM", + "target_modules": None, + "init_weights": False, + }, + ), + ( + VBLoRAConfig, + { + "task_type": "CAUSAL_LM", + "target_modules": None, + "vblora_dropout": 0.05, + "vector_length": 1, + "num_vectors": 2, + }, + ), + ( + VeraConfig, + { + "task_type": "CAUSAL_LM", + "r": 8, + "target_modules": None, + "vera_dropout": 0.05, + "projection_prng_key": 0xFF, + "d_initial": 0.1, + "save_projection": True, + "bias": "none", + }, + ), + ( + C3AConfig, + { + "task_type": "CAUSAL_LM", + "block_size": 1, # Some test cases contain shapes of prime numbers where `block_size` must be 1 + "target_modules": None, + }, + ), + ( + WaveFTConfig, + { + "task_type": "CAUSAL_LM", + "n_frequency": 8, + "target_modules": None, + }, + ), +] + + +def _skip_if_not_conv1d_supported(model_id, config_cls): + if "GPT2LMHeadModel" in model_id and config_cls in [ + BOFTConfig, + BoneConfig, + HRAConfig, + OFTConfig, + RoadConfig, + ShiraConfig, + C3AConfig, + MissConfig, + ]: + pytest.skip("Skipping BOFT/HRA/OFT/Bone/Road/SHiRA/C3A/MiSS for GPT2LMHeadModel") + + +def _skip_adalora_oft_hra_bone_for_gpt2(model_id, config_cls): + if "GPT2LMHeadModel" in model_id and config_cls in [ + AdaLoraConfig, + BOFTConfig, + HRAConfig, + OFTConfig, + BoneConfig, + C3AConfig, + RoadConfig, + MissConfig, + ]: + pytest.skip("Skipping AdaLora/BOFT/HRA/OFT/Bone/MiSS for GPT2LMHeadModel") + + +def _skip_alora_no_activation(config_cls, config_kwargs): + if config_cls is LoraConfig and config_kwargs.get("alora_invocation_tokens") == [1000]: + pytest.skip("Skipping aLoRA no-activation-case because the test 
expects changed output which there won't be.") + + +class TestDecoderModels(PeftCommonTester): + transformers_class = AutoModelForCausalLM + + def skipTest(self, reason=""): + # for backwards compatibility with unittest style test classes + pytest.skip(reason) + + def prepare_inputs_for_testing(self): + input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) + attention_mask = torch.tensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) + return {"input_ids": input_ids, "attention_mask": attention_mask} + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_attributes_parametrized(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + self._test_model_attr(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_adapter_name(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + self._test_adapter_name(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_prepare_for_training_parametrized(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + self._test_prepare_for_training(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_prompt_tuning_text_prepare_for_training(self, model_id, config_cls, config_kwargs): + if config_cls != PromptTuningConfig: + pytest.skip(f"This test does not apply to {config_cls}") + config_kwargs = config_kwargs.copy() + config_kwargs["prompt_tuning_init"] = PromptTuningInit.TEXT + 
config_kwargs["prompt_tuning_init_text"] = "This is a test prompt." + config_kwargs["tokenizer_name_or_path"] = model_id + self._test_prepare_for_training(model_id, config_cls, config_kwargs.copy()) + + def test_prompt_tuning_text_tokenizer_kwargs(self): + # Allow users to pass additional arguments to Tokenizer.from_pretrained + # Fix for #1032 + mock = Mock() + orig_from_pretrained = AutoTokenizer.from_pretrained + + def mock_autotokenizer_from_pretrained(*args, **kwargs): + mock(*args, **kwargs) + return orig_from_pretrained(config.tokenizer_name_or_path) + + model_id = "hf-internal-testing/tiny-random-OPTForCausalLM" + config = PromptTuningConfig( + base_model_name_or_path=model_id, + tokenizer_name_or_path=model_id, + num_virtual_tokens=10, + prompt_tuning_init=PromptTuningInit.TEXT, + task_type="CAUSAL_LM", + prompt_tuning_init_text="This is a test prompt.", + tokenizer_kwargs={"trust_remote_code": True, "foo": "bar"}, + ) + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + with patch("transformers.AutoTokenizer.from_pretrained", mock_autotokenizer_from_pretrained): + _ = get_peft_model(model, config) + expected_call = call(model_id, trust_remote_code=True, foo="bar") + assert mock.call_args == expected_call + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_prompt_tuning_sample_vocab_prepare_for_training(self, model_id, config_cls, config_kwargs): + if config_cls != PromptTuningConfig: + pytest.skip(f"This test does not apply to {config_cls}") + + config_kwargs = config_kwargs.copy() + config_kwargs["prompt_tuning_init"] = PromptTuningInit.SAMPLE_VOCAB + config_kwargs["tokenizer_name_or_path"] = model_id + + self._test_prepare_for_training(model_id, config_cls, config_kwargs.copy()) + + def test_prompt_tuning_config_invalid_args(self): + # Raise an error when tokenizer_kwargs is used with prompt_tuning_init!='TEXT', because this 
argument has no + # function in that case + model_id = "hf-internal-testing/tiny-random-OPTForCausalLM" + with pytest.raises(ValueError, match="tokenizer_kwargs only valid when using prompt_tuning_init='TEXT'."): + PromptTuningConfig( + base_model_name_or_path=model_id, + tokenizer_name_or_path=model_id, + num_virtual_tokens=10, + task_type="CAUSAL_LM", + prompt_tuning_init_text="This is a test prompt.", + prompt_tuning_init=PromptTuningInit.RANDOM, # <= should not be used together with tokenizer_kwargs + tokenizer_kwargs={"trust_remote_code": True, "foo": "bar"}, + ) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + self._test_save_pretrained(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained_pickle(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + self._test_save_pretrained(model_id, config_cls, config_kwargs.copy(), safe_serialization=False) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained_selected_adapters(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + self._test_save_pretrained_selected_adapters(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained_selected_adapters_pickle(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + self._test_save_pretrained_selected_adapters( + model_id, 
config_cls, config_kwargs.copy(), safe_serialization=False + ) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_from_pretrained_config_construction(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + self._test_from_pretrained_config_construction(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_merge_layers(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_merge_layers(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_merge_layers_multi(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_merge_layers_multi(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_merge_layers_nan(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_merge_layers_nan(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_mixed_adapter_batches(self, model_id, config_cls, config_kwargs): + if config_cls != LoraConfig: + pytest.skip("Mixed adapter batches not supported for this config.") + _skip_alora_no_activation(config_cls, config_kwargs) + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + 
self._test_mixed_adapter_batches(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_generate_with_mixed_adapter_batches(self, model_id, config_cls, config_kwargs): + if config_cls != LoraConfig: + pytest.skip("Mixed adapter batches not supported for this config.") + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_generate_with_mixed_adapter_batches_and_beam_search(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_generate(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + self._test_generate(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_generate_pos_args(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + self._test_generate_pos_args(model_id, config_cls, config_kwargs.copy(), raises_err=False) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_merge_layers_fp16(self, model_id, config_cls, config_kwargs): + self._test_merge_layers_fp16(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_generate_half_prec(self, model_id, config_cls, config_kwargs): + self._test_generate_half_prec(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def 
test_prefix_tuning_half_prec_conversion(self, model_id, config_cls, config_kwargs): + self._test_prefix_tuning_half_prec_conversion(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_training_decoders(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + self._test_training(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_training_decoders_layer_indexing(self, model_id, config_cls, config_kwargs): + self._test_training_layer_indexing(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_training_decoders_gradient_checkpointing(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + self._test_training_gradient_checkpointing(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_inference_safetensors(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + self._test_inference_safetensors(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_peft_model_device_map(self, model_id, config_cls, config_kwargs): + self._test_peft_model_device_map(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def 
test_delete_adapter(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + self._test_delete_adapter(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_delete_inactive_adapter(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + self._test_delete_inactive_adapter(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_adding_multiple_adapters_with_bias_raises(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + self._test_adding_multiple_adapters_with_bias_raises(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_unload_adapter(self, model_id, config_cls, config_kwargs): + _skip_adalora_oft_hra_bone_for_gpt2(model_id, config_cls) + _skip_if_not_conv1d_supported(model_id, config_cls) + _skip_alora_no_activation(config_cls, config_kwargs) + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_unload_adapter(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_weighted_combination_of_adapters(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_weighted_combination_of_adapters(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def 
test_training_prompt_learning_tasks(self, model_id, config_cls, config_kwargs): + self._test_training_prompt_learning_tasks(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_disable_adapter(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + _skip_alora_no_activation(config_cls, config_kwargs) + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_disable_adapter(model_id, config_cls, config_kwargs.copy()) + + def test_generate_adalora_no_dropout(self): + # test for issue #730 + model_id = "hf-internal-testing/tiny-random-OPTForCausalLM" + config_kwargs = { + "target_modules": None, + "task_type": "CAUSAL_LM", + "lora_dropout": 0.0, + "total_step": 1, + } + self._test_generate(model_id, AdaLoraConfig, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_passing_input_embeds_works(self, model_id, config_cls, config_kwargs): + _skip_if_not_conv1d_supported(model_id, config_cls) + if (platform.system() == "Darwin") and (config_cls == PrefixTuningConfig): + # the error is: + # > RuntimeError: unsupported operation: more than one element of the written-to tensor refers to a single + # > memory location. Please clone() the tensor before performing the operation. + # in transformers sdpa_mask_older_torch. As we (currently) cannot upgrade PyTorch on MacOS GH runners, we're + # stuck with this error. 
+ # TODO: remove if torch can be upgraded on MacOS or if MacOS CI is removed + pytest.skip("Prefix tuning fails on MacOS in this case, not worth fixing") + self._test_passing_input_embeds_works("", model_id, config_cls, config_kwargs.copy()) + + def test_lora_layer_replication(self): + model_id = "trl-internal-testing/tiny-random-LlamaForCausalLM" + config_kwargs = { + "target_modules": ["down_proj", "up_proj"], + "task_type": "CAUSAL_LM", + "lora_dropout": 0.0, + "layer_replication": [[0, 1], [0, 2], [1, 2]], + } + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + config = LoraConfig(base_model_name_or_path=model_id, **config_kwargs) + + assert len(model.model.layers) == 2, "Expected 2 layers in original model." + model = get_peft_model(model, config) + layers = model.base_model.model.model.layers + assert len(layers) == 4, "Expected 4 layers in adapted model." + assert ( + layers[0].mlp.up_proj.base_layer.weight.data.storage().data_ptr() + == layers[1].mlp.up_proj.base_layer.weight.data.storage().data_ptr() + and layers[2].mlp.up_proj.base_layer.weight.data.storage().data_ptr() + == layers[3].mlp.up_proj.base_layer.weight.data.storage().data_ptr() + ), "Expected layers 0-1 and 2-3 to share weights" + assert ( + layers[0].mlp.up_proj.base_layer.weight.data.storage().data_ptr() + != layers[2].mlp.up_proj.base_layer.weight.data.storage().data_ptr() + ), "Expected layers 0 and 2 to have different weights" + assert ( + layers[0].mlp.up_proj.lora_A.default.weight.data.storage().data_ptr() + != layers[1].mlp.up_proj.lora_A.default.weight.data.storage().data_ptr() + and layers[2].mlp.up_proj.lora_A.default.weight.data.storage().data_ptr() + != layers[3].mlp.up_proj.lora_A.default.weight.data.storage().data_ptr() + ), "Expected all LoRA adapters to have distinct weights" + assert len([n for n, _ in model.named_parameters() if ".lora_A." in n]) == 8, ( + "Expected 8 LoRA adapters since we are adding one each for up and down." 
+ ) + self._test_prepare_for_training(model_id, LoraConfig, config_kwargs.copy()) + self._test_generate(model_id, LoraConfig, config_kwargs.copy()) + + def test_prompt_learning_with_grouped_query_attention(self): + # See 1901, fixes a bug with handling GQA + model_id = "peft-internal-testing/tiny-dummy-qwen2" + base_model = AutoModelForCausalLM.from_pretrained(model_id) + peft_config = PrefixTuningConfig(num_virtual_tokens=10, task_type="CAUSAL_LM") + model = get_peft_model(base_model, peft_config) + x = torch.tensor([[1, 2, 3]]) + # does not raise + model(x) + + def test_prefix_tuning_mistral(self): + # See issue 869, 1962 + model_id = "hf-internal-testing/tiny-random-MistralForCausalLM" + base_model = AutoModelForCausalLM.from_pretrained(model_id) + peft_config = PrefixTuningConfig(num_virtual_tokens=10, task_type="CAUSAL_LM") + model = get_peft_model(base_model, peft_config) + + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.pad_token = tokenizer.eos_token + + def process(samples): + tokenized = tokenizer(samples["quote"], truncation=True, max_length=128) + return tokenized + + data = load_dataset_english_quotes() + data = data.map(process, batched=True) + + with tempfile.TemporaryDirectory() as tmp_dirname: + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + num_train_epochs=1, + max_steps=5, + per_device_train_batch_size=4, + output_dir=tmp_dirname, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + trainer.train() + + @pytest.mark.parametrize("model_id", SMALL_GRID_MODELS) + @pytest.mark.parametrize( + "config_cls,config_kwargs", + [ + ( + PromptTuningConfig, + { + "num_virtual_tokens": 10, + "task_type": "CAUSAL_LM", + }, + ), + ( + PrefixTuningConfig, + { + "num_virtual_tokens": 10, + "task_type": "CAUSAL_LM", + }, + ), + ( + PromptEncoderConfig, + { + "num_virtual_tokens": 10, + "encoder_hidden_size": 32, + "task_type": "CAUSAL_LM", + }, + ), + ( + CPTConfig, + { + 
"cpt_token_ids": [0, 1, 2, 3, 4, 5, 6, 7], # Example token IDs for testing + "cpt_mask": [1, 1, 1, 1, 1, 1, 1, 1], + "cpt_tokens_type_mask": [1, 2, 2, 2, 3, 3, 4, 4], + }, + ), + ], + ) + def test_prompt_learning_with_gradient_checkpointing(self, model_id, config_cls, config_kwargs): + # See issue 869 + # Test prompt learning methods with gradient checkpointing in a semi realistic setting. + # Prefix tuning does not work if the model uses the new caching implementation. In that case, a helpful error + # should be raised. + + # skip if multi GPU, since this results in DataParallel usage by Trainer, which fails with "CUDA device + # assertion", breaking subsequent tests + if device_count > 1: + pytest.skip("Skip on multi-GPU setups") + peft_config = config_cls(base_model_name_or_path=model_id, **config_kwargs) + base_model = self.transformers_class.from_pretrained(model_id) + base_model.gradient_checkpointing_enable() + + try: + model = get_peft_model(base_model, peft_config) + except ValueError as exc: + # Some methods will raise a helpful error. After this, exit the test, as training would fail. 
+ assert config_cls == PrefixTuningConfig + assert "Prefix tuning does not work with gradient checkpointing" in str(exc) + return + + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.pad_token = tokenizer.eos_token + + def process(samples): + tokenized = tokenizer(samples["quote"], truncation=True, max_length=128) + return tokenized + + data = load_dataset_english_quotes() + data = data.map(process, batched=True) + + with tempfile.TemporaryDirectory() as tmp_dirname: + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + num_train_epochs=1, + max_steps=3, + per_device_train_batch_size=4, + output_dir=tmp_dirname, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + trainer.train() + + @pytest.mark.parametrize("save_embedding_layers", ["auto", True, False]) + @pytest.mark.parametrize( + "peft_config", + [ + (LoraConfig(target_modules=["lin0", "embed_tokens"], init_lora_weights=False)), + (LoraConfig(target_modules=r".*\.embed_tokens", init_lora_weights=False)), + ], + ) + def test_save_pretrained_targeting_lora_to_embedding_layer(self, save_embedding_layers, tmp_path, peft_config): + model_id = "trl-internal-testing/tiny-random-LlamaForCausalLM" + + with hub_online_once(model_id): + model = AutoModelForCausalLM.from_pretrained(model_id) + model = get_peft_model(model, peft_config) + + if save_embedding_layers == "auto": + # assert warning + msg_start = "Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`." 
+ with pytest.warns(UserWarning, match=msg_start): + model.save_pretrained(tmp_path, save_embedding_layers=save_embedding_layers) + else: + model.save_pretrained(tmp_path, save_embedding_layers=save_embedding_layers) + + state_dict = safe_load_file(tmp_path / "adapter_model.safetensors") + contains_embedding = "base_model.model.model.embed_tokens.base_layer.weight" in state_dict + + if save_embedding_layers in ["auto", True]: + assert contains_embedding + assert torch.allclose( + model.base_model.model.model.embed_tokens.base_layer.weight, + state_dict["base_model.model.model.embed_tokens.base_layer.weight"], + ) + else: + assert not contains_embedding + + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_set_requires_grad_prompt_learning_raises(self, config_cls, config_kwargs): + # Test that for prompt learning, calling set_requires_grad raises an error with an appropriate error message. + # Note that for non-prompt learning methods, set_requires_grad is being tested for custom models, so there is no + # specific test here. 
+ model_id = PEFT_DECODER_MODELS_TO_TEST[0] # it's enough to test this with one model + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + if not config.is_prompt_learning: + pytest.skip("This test is only for prompt learning methods.") + + with hub_online_once(model_id + config_kwargs.get("tokenizer_name_or_path", "")): + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + model = get_peft_model(model, config) + msg = "Setting `requires_grad` is not supported for prompt learning methods like" + with pytest.raises(TypeError, match=msg): + model.set_requires_grad(adapter_names="adpater0") diff --git a/peft/tests/test_encoder_decoder_models.py b/peft/tests/test_encoder_decoder_models.py new file mode 100644 index 0000000000000000000000000000000000000000..d940d0f9a1d4e1b711f8f181030b1ab3559dfeec --- /dev/null +++ b/peft/tests/test_encoder_decoder_models.py @@ -0,0 +1,407 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import tempfile + +import pytest +import torch +from transformers import AutoModelForSeq2SeqLM, AutoModelForTokenClassification + +from peft import ( + AdaLoraConfig, + BOFTConfig, + BoneConfig, + C3AConfig, + FourierFTConfig, + HRAConfig, + IA3Config, + LoraConfig, + MissConfig, + OFTConfig, + PrefixTuningConfig, + PromptEncoderConfig, + PromptTuningConfig, + RoadConfig, + ShiraConfig, + TaskType, + VBLoRAConfig, + VeraConfig, + WaveFTConfig, + get_peft_model, +) + +from .testing_common import PeftCommonTester +from .testing_utils import set_init_weights_false + + +PEFT_ENCODER_DECODER_MODELS_TO_TEST = [ + "ybelkada/tiny-random-T5ForConditionalGeneration-calibrated", + "hf-internal-testing/tiny-random-BartForConditionalGeneration", +] + +# TODO Missing from this list are LoKr, LoHa, LN Tuning, add them +ALL_CONFIGS = [ + ( + AdaLoraConfig, + { + "target_modules": None, + "total_step": 1, + "task_type": "SEQ_2_SEQ_LM", + }, + ), + ( + BOFTConfig, + { + "target_modules": None, + "task_type": "SEQ_2_SEQ_LM", + }, + ), + ( + BoneConfig, + { + "target_modules": None, + "r": 2, + "task_type": "SEQ_2_SEQ_LM", + }, + ), + ( + MissConfig, + { + "target_modules": None, + "r": 2, + "task_type": "SEQ_2_SEQ_LM", + }, + ), + ( + FourierFTConfig, + { + "n_frequency": 10, + "target_modules": None, + "task_type": "SEQ_2_SEQ_LM", + }, + ), + ( + HRAConfig, + { + "target_modules": None, + "task_type": "SEQ_2_SEQ_LM", + }, + ), + ( + IA3Config, + { + "target_modules": None, + "feedforward_modules": None, + "task_type": "SEQ_2_SEQ_LM", + }, + ), + ( + LoraConfig, + { + "r": 8, + "lora_alpha": 32, + "target_modules": None, + "lora_dropout": 0.05, + "bias": "none", + "task_type": "SEQ_2_SEQ_LM", + }, + ), + ( + LoraConfig, + { + "r": 8, + "lora_alpha": 32, + "target_modules": None, + "lora_dropout": 0.05, + "bias": "none", + "trainable_token_indices": [0, 1, 3], + "task_type": "SEQ_2_SEQ_LM", + }, + ), + ( + OFTConfig, + { + "target_modules": None, + "task_type": "SEQ_2_SEQ_LM", + }, + 
), + ( + PrefixTuningConfig, + { + "num_virtual_tokens": 10, + "task_type": "SEQ_2_SEQ_LM", + }, + ), + ( + PromptEncoderConfig, + { + "num_virtual_tokens": 10, + "encoder_hidden_size": 32, + "task_type": "SEQ_2_SEQ_LM", + }, + ), + ( + PromptTuningConfig, + { + "num_virtual_tokens": 10, + "task_type": "SEQ_2_SEQ_LM", + }, + ), + ( + RoadConfig, + { + "task_type": "SEQ_2_SEQ_LM", + "variant": "road_1", + "group_size": 2, + }, + ), + ( + ShiraConfig, + { + "r": 1, + "task_type": "SEQ_2_SEQ_LM", + "target_modules": None, + "init_weights": False, + }, + ), + ( + VBLoRAConfig, + { + "target_modules": None, + "vblora_dropout": 0.05, + "vector_length": 1, + "num_vectors": 2, + "task_type": "SEQ_2_SEQ_LM", + }, + ), + ( + VeraConfig, + { + "r": 8, + "target_modules": None, + "vera_dropout": 0.05, + "projection_prng_key": 0xFF, + "d_initial": 0.1, + "save_projection": True, + "bias": "none", + "task_type": "SEQ_2_SEQ_LM", + }, + ), + ( + C3AConfig, + { + "task_type": "SEQ_2_SEQ_LM", + "block_size": 1, + "target_modules": None, + }, + ), + ( + WaveFTConfig, + { + "task_type": "SEQ_2_SEQ_LM", + "n_frequency": 8, + "target_modules": None, + }, + ), +] + + +class TestEncoderDecoderModels(PeftCommonTester): + transformers_class = AutoModelForSeq2SeqLM + + def skipTest(self, reason=""): + # for backwards compatibility with unittest style test classes + pytest.skip(reason) + + def prepare_inputs_for_testing(self): + input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) + decoder_input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) + attention_mask = torch.tensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) + + input_dict = { + "input_ids": input_ids, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + } + + return input_dict + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_attributes_parametrized(self, 
model_id, config_cls, config_kwargs): + self._test_model_attr(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_adapter_name(self, model_id, config_cls, config_kwargs): + self._test_adapter_name(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_prepare_for_training_parametrized(self, model_id, config_cls, config_kwargs): + self._test_prepare_for_training(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained(self, model_id, config_cls, config_kwargs): + self._test_save_pretrained(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained_pickle(self, model_id, config_cls, config_kwargs): + self._test_save_pretrained(model_id, config_cls, config_kwargs, safe_serialization=False) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained_selected_adapters(self, model_id, config_cls, config_kwargs): + self._test_save_pretrained_selected_adapters(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained_selected_adapters_pickle(self, model_id, config_cls, config_kwargs): + self._test_save_pretrained_selected_adapters(model_id, config_cls, config_kwargs, safe_serialization=False) + + def 
test_load_model_low_cpu_mem_usage(self): + # Using the first model with LoraConfig and an empty config_kwargs. + self._test_load_model_low_cpu_mem_usage(PEFT_ENCODER_DECODER_MODELS_TO_TEST[0], LoraConfig, {}) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_from_pretrained_config_construction(self, model_id, config_cls, config_kwargs): + self._test_from_pretrained_config_construction(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_merge_layers(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_merge_layers(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_mixed_adapter_batches(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_mixed_adapter_batches(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_generate_with_mixed_adapter_batches(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_generate_with_mixed_adapter_batches_and_beam_search(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_generate(self, model_id, config_cls, config_kwargs): + self._test_generate(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + 
@pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_generate_pos_args(self, model_id, config_cls, config_kwargs): + self._test_generate_pos_args(model_id, config_cls, config_kwargs, raises_err=True) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_generate_half_prec(self, model_id, config_cls, config_kwargs): + self._test_generate_half_prec(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_prefix_tuning_half_prec_conversion(self, model_id, config_cls, config_kwargs): + self._test_prefix_tuning_half_prec_conversion(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_training_encoder_decoders(self, model_id, config_cls, config_kwargs): + self._test_training(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_training_encoder_decoders_layer_indexing(self, model_id, config_cls, config_kwargs): + self._test_training_layer_indexing(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_training_encoder_decoders_gradient_checkpointing(self, model_id, config_cls, config_kwargs): + self._test_training_gradient_checkpointing(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_inference_safetensors(self, model_id, config_cls, config_kwargs): + 
self._test_inference_safetensors(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_peft_model_device_map(self, model_id, config_cls, config_kwargs): + self._test_peft_model_device_map(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_delete_adapter(self, model_id, config_cls, config_kwargs): + self._test_delete_adapter(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_delete_inactive_adapter(self, model_id, config_cls, config_kwargs): + self._test_delete_inactive_adapter(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_adding_multiple_adapters_with_bias_raises(self, model_id, config_cls, config_kwargs): + self._test_adding_multiple_adapters_with_bias_raises(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_unload_adapter(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_unload_adapter(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_weighted_combination_of_adapters(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_weighted_combination_of_adapters(model_id, config_cls, 
config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_training_prompt_learning_tasks(self, model_id, config_cls, config_kwargs): + self._test_training_prompt_learning_tasks(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_ENCODER_DECODER_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_disable_adapter(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_disable_adapter(model_id, config_cls, config_kwargs) + + def test_active_adapters_prompt_learning(self): + model = AutoModelForSeq2SeqLM.from_pretrained( + "hf-internal-testing/tiny-random-BartForConditionalGeneration" + ).to(self.torch_device) + # any prompt learning method would work here + config = PromptEncoderConfig(task_type=TaskType.SEQ_2_SEQ_LM, num_virtual_tokens=10) + model = get_peft_model(model, config) + assert model.active_adapters == ["default"] + + def test_save_shared_tensors(self): + model_id = "hf-internal-testing/tiny-random-RobertaModel" + peft_config = LoraConfig( + task_type=TaskType.TOKEN_CLS, + inference_mode=False, + r=16, + lora_alpha=16, + lora_dropout=0.1, + bias="all", + ) + model = AutoModelForTokenClassification.from_pretrained(model_id, num_labels=11) + model = get_peft_model(model, peft_config) + with tempfile.TemporaryDirectory() as tmp_dir: + # This should work fine + model.save_pretrained(tmp_dir, safe_serialization=True) diff --git a/peft/tests/test_feature_extraction_models.py b/peft/tests/test_feature_extraction_models.py new file mode 100644 index 0000000000000000000000000000000000000000..d7dd604c979ba861655498169483327cc4e10100 --- /dev/null +++ b/peft/tests/test_feature_extraction_models.py @@ -0,0 +1,364 @@ +# Copyright 2023-present the HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +import torch +from transformers import AutoModel + +from peft import ( + AdaLoraConfig, + BOFTConfig, + BoneConfig, + C3AConfig, + FourierFTConfig, + HRAConfig, + IA3Config, + LoraConfig, + MissConfig, + OFTConfig, + PrefixTuningConfig, + PromptEncoderConfig, + PromptLearningConfig, + PromptTuningConfig, + RoadConfig, + ShiraConfig, + VBLoRAConfig, + VeraConfig, + WaveFTConfig, +) + +from .testing_common import PeftCommonTester +from .testing_utils import set_init_weights_false + + +PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST = [ + "hf-internal-testing/tiny-random-BertModel", + "hf-internal-testing/tiny-random-RobertaModel", + "hf-internal-testing/tiny-random-DebertaModel", + "hf-internal-testing/tiny-random-DebertaV2Model", +] + +# TODO Missing from this list are LoKr, LoHa, LN Tuning, add them +ALL_CONFIGS = [ + ( + AdaLoraConfig, + { + "task_type": "FEATURE_EXTRACTION", + "target_modules": None, + "total_step": 1, + }, + ), + ( + BOFTConfig, + { + "task_type": "FEATURE_EXTRACTION", + "target_modules": None, + }, + ), + ( + BoneConfig, + { + "task_type": "FEATURE_EXTRACTION", + "target_modules": None, + "r": 2, + }, + ), + ( + MissConfig, + { + "task_type": "FEATURE_EXTRACTION", + "target_modules": None, + "r": 2, + }, + ), + ( + FourierFTConfig, + { + "task_type": "FEATURE_EXTRACTION", + "n_frequency": 10, + "target_modules": None, + }, + ), + ( + HRAConfig, + { + "task_type": "FEATURE_EXTRACTION", + 
"target_modules": None, + }, + ), + ( + IA3Config, + { + "task_type": "FEATURE_EXTRACTION", + "target_modules": None, + "feedforward_modules": None, + }, + ), + ( + LoraConfig, + { + "task_type": "FEATURE_EXTRACTION", + "r": 8, + "lora_alpha": 32, + "target_modules": None, + "lora_dropout": 0.05, + "bias": "none", + }, + ), + # LoRA + trainable tokens + ( + LoraConfig, + { + "task_type": "FEATURE_EXTRACTION", + "r": 8, + "lora_alpha": 32, + "target_modules": None, + "lora_dropout": 0.05, + "bias": "none", + "trainable_token_indices": [0, 1, 3], + }, + ), + ( + OFTConfig, + { + "task_type": "FEATURE_EXTRACTION", + "target_modules": None, + }, + ), + ( + PrefixTuningConfig, + { + "task_type": "FEATURE_EXTRACTION", + "num_virtual_tokens": 10, + }, + ), + ( + PromptEncoderConfig, + { + "task_type": "FEATURE_EXTRACTION", + "num_virtual_tokens": 10, + "encoder_hidden_size": 32, + }, + ), + ( + PromptTuningConfig, + { + "task_type": "FEATURE_EXTRACTION", + "num_virtual_tokens": 10, + }, + ), + ( + RoadConfig, + { + "task_type": "FEATURE_EXTRACTION", + "variant": "road_1", + "group_size": 2, + }, + ), + ( + ShiraConfig, + { + "r": 1, + "task_type": "FEATURE_EXTRACTION", + "target_modules": None, + "init_weights": False, + }, + ), + ( + VBLoRAConfig, + { + "task_type": "FEATURE_EXTRACTION", + "target_modules": None, + "vblora_dropout": 0.05, + "vector_length": 1, + "num_vectors": 2, + }, + ), + ( + VeraConfig, + { + "task_type": "FEATURE_EXTRACTION", + "r": 8, + "target_modules": None, + "vera_dropout": 0.05, + "projection_prng_key": 0xFF, + "d_initial": 0.1, + "save_projection": True, + "bias": "none", + }, + ), + ( + C3AConfig, + { + "task_type": "FEATURE_EXTRACTION", + "block_size": 1, + "target_modules": None, + }, + ), + ( + WaveFTConfig, + { + "task_type": "FEATURE_EXTRACTION", + "n_frequency": 8, + "target_modules": None, + }, + ), +] + + +def skip_non_prompt_learning(config_cls): + if not issubclass(config_cls, PromptLearningConfig) or (config_cls == 
PrefixTuningConfig): + pytest.skip("Skip tests that are not prompt learning or that are prefix tuning") + + +def skip_deberta_lora_tests(config_cls, model_id): + if "deberta" not in model_id.lower(): + return + + to_skip = ["lora", "ia3", "boft", "vera", "fourierft", "hra", "bone", "randlora"] + config_name = config_cls.__name__.lower() + if any(k in config_name for k in to_skip): + pytest.skip(f"Skip tests that use {config_name} for Deberta models") + + +def skip_deberta_pt_tests(config_cls, model_id): + if "deberta" not in model_id.lower(): + return + + to_skip = ["prefix"] + config_name = config_cls.__name__.lower() + if any(k in config_name for k in to_skip): + pytest.skip(f"Skip tests that use {config_name} for Deberta models") + + +class TestPeftFeatureExtractionModel(PeftCommonTester): + """ + Test if the PeftModel behaves as expected. This includes: + - test if the model has the expected methods + """ + + transformers_class = AutoModel + + def skipTest(self, reason=""): + # for backwards compatibility with unittest style test classes + pytest.skip(reason) + + def prepare_inputs_for_testing(self): + input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) + attention_mask = torch.tensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) + + input_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + + return input_dict + + @pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_attributes_parametrized(self, model_id, config_cls, config_kwargs): + self._test_model_attr(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_adapter_name(self, model_id, config_cls, config_kwargs): + self._test_adapter_name(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", 
PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_prepare_for_training_parametrized(self, model_id, config_cls, config_kwargs): + self._test_prepare_for_training(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained(self, model_id, config_cls, config_kwargs): + self._test_save_pretrained(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained_selected_adapters(self, model_id, config_cls, config_kwargs): + self._test_save_pretrained_selected_adapters(model_id, config_cls, config_kwargs) + + def test_load_model_low_cpu_mem_usage(self): + self._test_load_model_low_cpu_mem_usage(PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST[0], LoraConfig, {}) + + @pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_from_pretrained_config_construction(self, model_id, config_cls, config_kwargs): + self._test_from_pretrained_config_construction(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_merge_layers(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_merge_layers(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_training(self, model_id, config_cls, config_kwargs): + self._test_training(model_id, config_cls, config_kwargs) + + 
@pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_training_prompt_learning_tasks(self, model_id, config_cls, config_kwargs): + skip_deberta_pt_tests(config_cls, model_id) + self._test_training_prompt_learning_tasks(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_training_layer_indexing(self, model_id, config_cls, config_kwargs): + self._test_training_layer_indexing(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_training_gradient_checkpointing(self, model_id, config_cls, config_kwargs): + skip_deberta_lora_tests(config_cls, model_id) + self._test_training_gradient_checkpointing(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_inference_safetensors(self, model_id, config_cls, config_kwargs): + self._test_inference_safetensors(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_peft_model_device_map(self, model_id, config_cls, config_kwargs): + self._test_peft_model_device_map(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_delete_adapter(self, model_id, config_cls, config_kwargs): + self._test_delete_adapter(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + 
@pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_delete_inactive_adapter(self, model_id, config_cls, config_kwargs): + self._test_delete_inactive_adapter(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_unload_adapter(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_unload_adapter(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_weighted_combination_of_adapters(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_weighted_combination_of_adapters(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_FEATURE_EXTRACTION_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_passing_input_embeds_works(self, model_id, config_cls, config_kwargs): + skip_non_prompt_learning(config_cls) + self._test_passing_input_embeds_works("test input embeds work", model_id, config_cls, config_kwargs) diff --git a/peft/tests/test_gptqmodel.py b/peft/tests/test_gptqmodel.py new file mode 100644 index 0000000000000000000000000000000000000000..3d16b60d2dd89f851c717bc09ca11589e9e56ada --- /dev/null +++ b/peft/tests/test_gptqmodel.py @@ -0,0 +1,563 @@ +# Note: These tests were copied from test_common_gpu.py and test_gpu_examples.py as they can run on CPU too. +# +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import gc +import os +import tempfile +import unittest + +import pytest +import torch +from accelerate.utils.memory import clear_device_cache +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + DataCollatorForLanguageModeling, + Trainer, + TrainingArguments, +) + +from peft import ( + AdaLoraConfig, + LoraConfig, + OFTConfig, + PeftModel, + get_peft_model, + prepare_model_for_kbit_training, +) +from peft.tuners.lora import GPTQLoraLinear +from peft.utils import SAFETENSORS_WEIGHTS_NAME, infer_device + +from .testing_utils import ( + device_count, + load_dataset_english_quotes, + require_gptqmodel, + require_optimum, + require_torch_multi_accelerator, +) + + +@require_gptqmodel +class PeftGPTQModelCommonTests(unittest.TestCase): + r""" + A common tester to run common operations that are performed on GPU/CPU such as generation, loading in 8bit, etc. + """ + + def setUp(self): + self.causal_lm_model_id = "facebook/opt-350m" + self.device = infer_device() + + def tearDown(self): + r""" + Efficient mechanism to free GPU memory after each test. Based on + https://github.com/huggingface/transformers/issues/21094 + """ + clear_device_cache(garbage_collection=True) + gc.collect() + + def test_lora_gptq_quantization_from_pretrained_safetensors(self): + r""" + Tests that the gptqmodel quantization using LoRA works as expected with safetensors weights. 
+ """ + from transformers import GPTQConfig + + model_id = "marcsun13/opt-350m-gptq-4bit" + quantization_config = GPTQConfig(bits=4, use_exllama=False) + kwargs = { + "pretrained_model_name_or_path": model_id, + "torch_dtype": torch.float16, + "device_map": "auto", + "quantization_config": quantization_config, + } + model = AutoModelForCausalLM.from_pretrained(**kwargs) + model = prepare_model_for_kbit_training(model) + + config = LoraConfig(task_type="CAUSAL_LM") + peft_model = get_peft_model(model, config) + peft_model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(peft_model.device)) + + with tempfile.TemporaryDirectory() as tmp_dir: + peft_model.save_pretrained(tmp_dir) + model = AutoModelForCausalLM.from_pretrained(**kwargs) + model = PeftModel.from_pretrained(model, tmp_dir) + model = prepare_model_for_kbit_training(model) + model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(peft_model.device)) + + # loading a 2nd adapter works, #1239 + model.load_adapter(tmp_dir, "adapter2") + model.set_adapter("adapter2") + model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(peft_model.device)) + + # check that both adapters are in the same layer + assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A + assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A + + def test_oft_gptq_quantization_from_pretrained_safetensors(self): + r""" + Tests that the gptqmodel quantization using OFT works as expected with safetensors weights. 
+ """ + from transformers import GPTQConfig + + model_id = "marcsun13/opt-350m-gptq-4bit" + quantization_config = GPTQConfig(bits=4, use_exllama=False) + kwargs = { + "pretrained_model_name_or_path": model_id, + "torch_dtype": torch.float16, + "device_map": "auto", + "quantization_config": quantization_config, + } + model = AutoModelForCausalLM.from_pretrained(**kwargs) + model = prepare_model_for_kbit_training(model) + + config = OFTConfig(task_type="CAUSAL_LM") + peft_model = get_peft_model(model, config) + peft_model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(peft_model.device)) + + with tempfile.TemporaryDirectory() as tmp_dir: + peft_model.save_pretrained(tmp_dir) + model = AutoModelForCausalLM.from_pretrained(**kwargs) + model = PeftModel.from_pretrained(model, tmp_dir) + model = prepare_model_for_kbit_training(model) + model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(peft_model.device)) + + # loading a 2nd adapter works, #1239 + model.load_adapter(tmp_dir, "adapter2") + model.set_adapter("adapter2") + model.generate(input_ids=torch.LongTensor([[0, 2, 3, 1]]).to(peft_model.device)) + + # check that both adapters are in the same layer + assert "default" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.oft_R + assert "adapter2" in model.base_model.model.model.decoder.layers[0].self_attn.q_proj.oft_R + + +@require_gptqmodel +@require_optimum +class PeftGPTQModelTests(unittest.TestCase): + r""" + GPTQ + peft tests + """ + + def setUp(self): + from transformers import GPTQConfig + + self.causal_lm_model_id = "marcsun13/opt-350m-gptq-4bit" + self.quantization_config = GPTQConfig(bits=4, backend="auto_trainable") + self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + + def tearDown(self): + r""" + Efficient mechanism to free GPU memory after each test. 
Based on + https://github.com/huggingface/transformers/issues/21094 + """ + clear_device_cache(garbage_collection=True) + + def _check_inference_finite(self, model, batch): + # try inference without Trainer class + training = model.training + model.eval() + output = model(**batch.to(model.device)) + assert torch.isfinite(output.logits).all() + model.train(training) + + def test_causal_lm_training(self): + r""" + Test the CausalLM training on a single GPU device. The test would simply fail if the adapters are not set + correctly. + """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + device_map="auto", + quantization_config=self.quantization_config, + ) + + model = prepare_model_for_kbit_training(model) + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + def test_oft_causal_lm_training(self): + r""" + Test the CausalLM training on a single GPU device. The test would simply fail if the adapters are not set + correctly. 
+ """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + device_map="auto", + quantization_config=self.quantization_config, + ) + + model = prepare_model_for_kbit_training(model) + config = OFTConfig( + r=0, + oft_block_size=8, + target_modules=["q_proj", "v_proj"], + bias="none", + task_type="CAUSAL_LM", + ) + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + def test_adalora_causalLM(self): + r""" + Tests the gptq training with adalora + """ + + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + device_map="auto", + quantization_config=self.quantization_config, + ) + + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + model = prepare_model_for_kbit_training(model) + + peft_config = AdaLoraConfig( + total_step=40, + init_r=6, + target_r=4, + tinit=10, + tfinal=20, + deltaT=5, + beta1=0.3, + beta2=0.3, + orth_reg_weight=0.2, + lora_alpha=32, + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, peft_config) + + data = load_dataset_english_quotes() + data = 
data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + batch = tokenizer(data["train"][:3]["quote"], return_tensors="pt", padding=True) + self._check_inference_finite(model, batch) + + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.multi_gpu_tests + @require_torch_multi_accelerator + def test_causal_lm_training_multi_accelerator(self): + r""" + Test the CausalLM training on a multi-accelerator device. The test would simply fail if the adapters are not + set correctly. 
+ """ + + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + device_map="auto", + quantization_config=self.quantization_config, + ) + + assert set(model.hf_device_map.values()) == set(range(device_count)) + + model = prepare_model_for_kbit_training(model) + + setattr(model, "model_parallel", True) + setattr(model, "is_parallelizable", True) + + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.multi_gpu_tests + @require_torch_multi_accelerator + def test_oft_causal_lm_training_multi_accelerator(self): + r""" + Test the CausalLM training on a multi-accelerator device. The test would simply fail if the adapters are not + set correctly. 
+ """ + + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + device_map="auto", + quantization_config=self.quantization_config, + ) + + assert set(model.hf_device_map.values()) == set(range(device_count)) + + model = prepare_model_for_kbit_training(model) + + setattr(model, "model_parallel", True) + setattr(model, "is_parallelizable", True) + + config = OFTConfig( + r=0, + oft_block_size=8, + target_modules=["q_proj", "v_proj"], + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + def test_non_default_adapter_name(self): + # See issue 1346 + config = LoraConfig( + r=16, + target_modules=["q_proj", "v_proj"], + task_type="CAUSAL_LM", + ) + + # default adapter name + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + device_map="auto", + quantization_config=self.quantization_config, + ) + model = prepare_model_for_kbit_training(model) + model = get_peft_model(model, config) + n_trainable_default, n_total_default = model.get_nb_trainable_parameters() + + # other adapter name + model = 
AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + device_map="auto", + quantization_config=self.quantization_config, + ) + model = prepare_model_for_kbit_training(model) + model = get_peft_model(model, config, adapter_name="other") + n_trainable_other, n_total_other = model.get_nb_trainable_parameters() + + assert n_trainable_other > 0 + # sanity check + assert n_trainable_default == n_trainable_other + assert n_total_default == n_total_other + + def test_oft_non_default_adapter_name(self): + # See issue 1346 + config = OFTConfig( + r=0, + oft_block_size=8, + target_modules=["q_proj", "v_proj"], + task_type="CAUSAL_LM", + ) + + # default adapter name + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + device_map="auto", + quantization_config=self.quantization_config, + ) + model = prepare_model_for_kbit_training(model) + model = get_peft_model(model, config) + n_trainable_default, n_total_default = model.get_nb_trainable_parameters() + + # other adapter name + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + device_map="auto", + quantization_config=self.quantization_config, + ) + model = prepare_model_for_kbit_training(model) + model = get_peft_model(model, config, adapter_name="other") + n_trainable_other, n_total_other = model.get_nb_trainable_parameters() + + assert n_trainable_other > 0 + # sanity check + assert n_trainable_default == n_trainable_other + assert n_total_default == n_total_other + + def test_load_lora(self): + model_id = "ModelCloud/Llama-3.2-1B-gptqmodel-ci-4bit" + adapter_id = "ModelCloud/Llama-3.2-1B-gptqmodel-ci-4bit-lora" + + model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto") + model.load_adapter(adapter_id) + + # assert dynamic rank + v_proj_module = model.model.layers[5].self_attn.v_proj + assert isinstance(v_proj_module, GPTQLoraLinear) + assert 
v_proj_module.lora_A["default"].weight.data.shape[0] == 128 + assert v_proj_module.lora_B["default"].weight.data.shape[1] == 128 + gate_proj_module = model.model.layers[5].mlp.gate_proj + assert isinstance(gate_proj_module, GPTQLoraLinear) + assert gate_proj_module.lora_A["default"].weight.data.shape[0] == 256 + assert gate_proj_module.lora_B["default"].weight.data.shape[1] == 256 + + tokenizer = AutoTokenizer.from_pretrained(model_id) + inp = tokenizer("Capital of France is", return_tensors="pt").to(model.device) + tokens = model.generate(**inp)[0] + result = tokenizer.decode(tokens) + + assert "paris" in result.lower() diff --git a/peft/tests/test_gpu_examples.py b/peft/tests/test_gpu_examples.py new file mode 100644 index 0000000000000000000000000000000000000000..e37d78d49e7c4445273bc76b822f1a917ed59f69 --- /dev/null +++ b/peft/tests/test_gpu_examples.py @@ -0,0 +1,5369 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import gc +import importlib +import itertools +import os +import re +import tempfile +import unittest +from collections import Counter, defaultdict +from copy import deepcopy +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Union + +import numpy as np +import pytest +import torch +from accelerate import infer_auto_device_map +from accelerate.test_utils.testing import run_command +from accelerate.utils import patch_environment +from accelerate.utils.imports import is_bf16_available +from accelerate.utils.memory import clear_device_cache +from accelerate.utils.versions import is_torch_version +from datasets import Audio, Dataset, DatasetDict, load_dataset +from packaging import version +from parameterized import parameterized +from torch.distributed import init_process_group +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.utils.data import DataLoader +from transformers import ( + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + BitsAndBytesConfig, + DataCollatorForLanguageModeling, + Seq2SeqTrainer, + Seq2SeqTrainingArguments, + Trainer, + TrainerCallback, + TrainingArguments, + WhisperFeatureExtractor, + WhisperForConditionalGeneration, + WhisperProcessor, + WhisperTokenizer, +) +from transformers.pytorch_utils import Conv1D + +from peft import ( + AdaLoraConfig, + ArrowConfig, + EvaConfig, + LoftQConfig, + LoraConfig, + PeftModel, + PrefixTuningConfig, + PromptEncoderConfig, + RandLoraConfig, + RoadConfig, + TaskType, + VeraConfig, + create_arrow_model, + get_peft_model, + get_peft_model_state_dict, + initialize_lora_eva_weights, + inject_adapter_in_model, + prepare_model_for_kbit_training, + replace_lora_weights_loftq, + set_peft_model_state_dict, +) +from peft.import_utils import is_diffusers_available, is_xpu_available +from peft.tuners import boft +from peft.tuners.tuners_utils import BaseTunerLayer +from peft.utils import SAFETENSORS_WEIGHTS_NAME, infer_device +from 
peft.utils.hotswap import hotswap_adapter, prepare_model_for_compiled_hotswap +from peft.utils.loftq_utils import NFQuantizer +from peft.utils.other import fsdp_auto_wrap_policy +from tests.testing_utils import hub_online_once + +from .testing_utils import ( + device_count, + load_dataset_english_quotes, + require_aqlm, + require_auto_awq, + require_auto_gptq, + require_bitsandbytes, + require_deterministic_for_xpu, + require_eetq, + require_hqq, + require_non_cpu, + require_non_xpu, + require_optimum, + require_torch_gpu, + require_torch_multi_accelerator, + require_torch_multi_gpu, + require_torchao, + torch_device, +) + + +# Some tests with multi GPU require specific device maps to ensure that the models are loaded in two devices +DEVICE_MAP_MAP: dict[str, dict[str, int]] = { + "facebook/opt-6.7b": { + "model.decoder.embed_tokens": 0, + "model.decoder.embed_positions": 0, + "model.decoder.final_layer_norm": 0, + "model.decoder.layers.0": 0, + "model.decoder.layers.1": 0, + "model.decoder.layers.2": 0, + "model.decoder.layers.3": 0, + "model.decoder.layers.4": 0, + "model.decoder.layers.5": 0, + "model.decoder.layers.6": 0, + "model.decoder.layers.7": 0, + "model.decoder.layers.8": 0, + "model.decoder.layers.9": 0, + "model.decoder.layers.10": 0, + "model.decoder.layers.11": 0, + "model.decoder.layers.12": 0, + "model.decoder.layers.13": 0, + "model.decoder.layers.14": 0, + "model.decoder.layers.15": 0, + "model.decoder.layers.16": 1, + "model.decoder.layers.17": 1, + "model.decoder.layers.18": 1, + "model.decoder.layers.19": 1, + "model.decoder.layers.20": 1, + "model.decoder.layers.21": 1, + "model.decoder.layers.22": 1, + "model.decoder.layers.23": 1, + "model.decoder.layers.24": 1, + "model.decoder.layers.25": 1, + "model.decoder.layers.26": 1, + "model.decoder.layers.27": 1, + "model.decoder.layers.28": 1, + "model.decoder.layers.29": 1, + "model.decoder.layers.30": 1, + "model.decoder.layers.31": 1, + "lm_head": 0, # tied with embed_tokens + }, + 
"facebook/opt-125m": { + "model.decoder.embed_tokens": 0, + "model.decoder.embed_positions": 0, + "model.decoder.final_layer_norm": 1, + "model.decoder.layers.0": 0, + "model.decoder.layers.1": 0, + "model.decoder.layers.2": 0, + "model.decoder.layers.3": 0, + "model.decoder.layers.4": 0, + "model.decoder.layers.5": 0, + "model.decoder.layers.6": 1, + "model.decoder.layers.7": 1, + "model.decoder.layers.8": 1, + "model.decoder.layers.9": 1, + "model.decoder.layers.10": 1, + "model.decoder.layers.11": 1, + "lm_head": 0, + }, + "marcsun13/opt-350m-gptq-4bit": { + "model.decoder.embed_tokens": 0, + "model.decoder.embed_positions": 0, + "model.decoder.layers.0": 0, + "model.decoder.layers.1": 0, + "model.decoder.layers.2": 0, + "model.decoder.layers.3": 0, + "model.decoder.layers.4": 0, + "model.decoder.layers.5": 0, + "model.decoder.layers.6": 1, + "model.decoder.layers.7": 1, + "model.decoder.layers.8": 1, + "model.decoder.layers.9": 1, + "model.decoder.layers.10": 1, + "model.decoder.layers.11": 1, + "model.decoder.final_layer_norm": 1, + "lm_head": 0, # tied with embed_tokens + }, + "google/flan-t5-base": { + "shared": 0, + "encoder": 0, + "decoder": 1, + "final_layer_norm": 1, + "decoder.embed_tokens": 0, # tied with encoder.embed_tokens + "lm_head": 0, # tied with encoder.embed_tokens + }, +} + + +# A full testing suite that tests all the necessary features on GPU. The tests should +# rely on the example scripts to test the features. 
+ + +@dataclass +class DataCollatorSpeechSeq2SeqWithPadding: + r""" + Directly copied from: + https://github.com/huggingface/peft/blob/main/examples/int8_training/peft_bnb_whisper_large_v2_training.ipynb + """ + + processor: Any + + def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]: + # split inputs and labels since they have to be of different lengths and need different padding methods + # first treat the audio inputs by simply returning torch tensors + input_features = [{"input_features": feature["input_features"]} for feature in features] + batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt") + + # get the tokenized label sequences + label_features = [{"input_ids": feature["labels"]} for feature in features] + # pad the labels to max length + labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt") + + # replace padding with -100 to ignore loss correctly + labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100) + + # if bos token is appended in previous tokenization step, + # cut bos token here as it's append later anyways + if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item(): + labels = labels[:, 1:] + + batch["labels"] = labels + + return batch + + +@require_non_cpu +@require_bitsandbytes +class PeftBnbGPUExampleTests(unittest.TestCase): + r""" + A single GPU int8 + fp4 test suite, this will test if training fits correctly on a single GPU device (1x NVIDIA T4 + 16GB) using bitsandbytes. 
+ + The tests are the following: + + - Seq2Seq model training based on: + https://github.com/huggingface/peft/blob/main/examples/int8_training/Finetune_flan_t5_large_bnb_peft.ipynb + - Causal LM model training based on: + https://github.com/huggingface/peft/blob/main/examples/int8_training/Finetune_opt_bnb_peft.ipynb + - Audio model training based on: + https://github.com/huggingface/peft/blob/main/examples/int8_training/peft_bnb_whisper_large_v2_training.ipynb + + """ + + def setUp(self): + self.seq2seq_model_id = "google/flan-t5-base" + self.causal_lm_model_id = "facebook/opt-6.7b" + self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + self.audio_model_id = "openai/whisper-large" + + def tearDown(self): + r""" + Efficient mechanism to free GPU memory after each test. Based on + https://github.com/huggingface/transformers/issues/21094 + """ + clear_device_cache(garbage_collection=True) + + def _check_inference_finite(self, model, batch): + # try inference without Trainer class + training = model.training + model.eval() + output = model(**batch.to(model.device)) + assert torch.isfinite(output.logits).all() + model.train(training) + + @pytest.mark.single_gpu_tests + def test_causal_lm_training(self): + r""" + Test the CausalLM training on a single GPU device. This test is a converted version of + https://github.com/huggingface/peft/blob/main/examples/int8_training/Finetune_opt_bnb_peft.ipynb where we train + `opt-6.7b` on `english_quotes` dataset in few steps. The test would simply fail if the adapters are not set + correctly. 
+ """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + device_map="auto", + ) + + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + model = prepare_model_for_kbit_training(model) + + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + def test_causal_lm_training_4bit(self): + r""" + Test the CausalLM training on a single GPU device. This test is a converted version of + https://github.com/huggingface/peft/blob/main/examples/int8_training/Finetune_opt_bnb_peft.ipynb where we train + `opt-6.7b` on `english_quotes` dataset in few steps using 4bit base model. The test would simply fail if the + adapters are not set correctly. 
+ """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + device_map="auto", + ) + + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + model = prepare_model_for_kbit_training(model) + + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.multi_gpu_tests + def test_causal_lm_training_multi_gpu_4bit(self): + r""" + Test the CausalLM training on a multi-GPU device with 4bit base model. The test would simply fail if the + adapters are not set correctly. 
+ """ + + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map=DEVICE_MAP_MAP[self.causal_lm_model_id], + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + + assert set(model.hf_device_map.values()) == set(range(device_count)) + assert {p.device.index for p in model.parameters()} == set(range(device_count)) + + model = prepare_model_for_kbit_training(model) + + setattr(model, "model_parallel", True) + setattr(model, "is_parallelizable", True) + + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + @require_non_cpu + def test_4bit_adalora_causalLM(self): + r""" + Tests the 4bit training with adalora + """ + model_id = "facebook/opt-350m" + + # for >3 GPUs, might need: device_map={"": "cuda:0"} + model = AutoModelForCausalLM.from_pretrained( + model_id, quantization_config=BitsAndBytesConfig(load_in_4bit=True) + ) + tokenizer = AutoTokenizer.from_pretrained(model_id) + + model.gradient_checkpointing_enable() + model = 
prepare_model_for_kbit_training(model) + + peft_config = AdaLoraConfig( + init_r=6, + target_r=4, + tinit=2, + tfinal=2, + total_step=6, + deltaT=5, + beta1=0.3, + beta2=0.3, + orth_reg_weight=0.2, + lora_alpha=32, + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, peft_config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + batch = tokenizer(data["train"][:3]["quote"], return_tensors="pt", padding=True) + self._check_inference_finite(model, batch) + + class OptimizerStepCallback(TrainerCallback): + def on_optimizer_step(self, args, state, control, **kwargs): + model.update_and_allocate(state.global_step) + + step_callback = OptimizerStepCallback() + + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=6, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.add_callback(step_callback) + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + @require_non_cpu + def test_8bit_adalora_causalLM(self): + r""" + Tests the 8bit training with adalora + """ + model_id = "facebook/opt-350m" + + model = AutoModelForCausalLM.from_pretrained( + model_id, quantization_config=BitsAndBytesConfig(load_in_8bit=True) + ) + tokenizer = AutoTokenizer.from_pretrained(model_id) + + model.gradient_checkpointing_enable() + model = prepare_model_for_kbit_training(model) + + peft_config = AdaLoraConfig( + 
init_r=6, + target_r=4, + tinit=2, + tfinal=2, + total_step=6, + deltaT=5, + beta1=0.3, + beta2=0.3, + orth_reg_weight=0.2, + lora_alpha=32, + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, peft_config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + batch = tokenizer(data["train"][:3]["quote"], return_tensors="pt", padding=True) + self._check_inference_finite(model, batch) + + class OptimizerStepCallback(TrainerCallback): + def on_optimizer_step(self, args, state, control, **kwargs): + model.update_and_allocate(state.global_step) + + step_callback = OptimizerStepCallback() + + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=6, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.add_callback(step_callback) + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.multi_gpu_tests + @require_torch_multi_accelerator + def test_causal_lm_training_multi_gpu(self): + r""" + Test the CausalLM training on a multi-GPU device. This test is a converted version of + https://github.com/huggingface/peft/blob/main/examples/int8_training/Finetune_opt_bnb_peft.ipynb where we train + `opt-6.7b` on `english_quotes` dataset in few steps. The test would simply fail if the adapters are not set + correctly. 
+ """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + device_map="auto", + ) + print(f"device map: {model.hf_device_map}") + assert set(model.hf_device_map.values()) == set(range(device_count)) + + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + model = prepare_model_for_kbit_training(model) + + setattr(model, "model_parallel", True) + setattr(model, "is_parallelizable", True) + + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + def test_seq2seq_lm_training_single_gpu(self): + r""" + Test the Seq2SeqLM training on a single GPU device. This test is a converted version of + https://github.com/huggingface/peft/blob/main/examples/int8_training/Finetune_opt_bnb_peft.ipynb where we train + `flan-large` on `english_quotes` dataset in few steps. The test would simply fail if the adapters are not set + correctly. 
+ """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForSeq2SeqLM.from_pretrained( + self.seq2seq_model_id, + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + device_map={"": 0}, + ) + + assert set(model.hf_device_map.values()) == {0} + + tokenizer = AutoTokenizer.from_pretrained(self.seq2seq_model_id) + model = prepare_model_for_kbit_training(model) + + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q", "v"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.multi_gpu_tests + @require_torch_multi_accelerator + def test_seq2seq_lm_training_multi_gpu(self): + r""" + Test the Seq2SeqLM training on a multi-GPU device. This test is a converted version of + https://github.com/huggingface/peft/blob/main/examples/int8_training/Finetune_opt_bnb_peft.ipynb where we train + `flan-large` on `english_quotes` dataset in few steps. The test would simply fail if the adapters are not set + correctly. 
+ """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForSeq2SeqLM.from_pretrained( + self.seq2seq_model_id, + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + device_map=DEVICE_MAP_MAP[self.seq2seq_model_id], + ) + + assert set(model.hf_device_map.values()) == set(range(device_count)) + assert {p.device.index for p in model.parameters()} == set(range(device_count)) + + tokenizer = AutoTokenizer.from_pretrained(self.seq2seq_model_id) + model = prepare_model_for_kbit_training(model) + + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q", "v"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir="outputs", + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + # TODO skipping to see if this leads to single GPU tests passing + @pytest.mark.skip + @pytest.mark.single_gpu_tests + def test_audio_model_training(self): + r""" + Test the audio model training on a single GPU device. 
This test is a converted version of + https://github.com/huggingface/peft/blob/main/examples/int8_training/peft_bnb_whisper_large_v2_training.ipynb + """ + with tempfile.TemporaryDirectory() as tmp_dir: + dataset_name = "ybelkada/common_voice_mr_11_0_copy" + task = "transcribe" + language = "Marathi" + common_voice = DatasetDict() + + common_voice["train"] = load_dataset(dataset_name, split="train+validation") + + common_voice = common_voice.remove_columns( + ["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"] + ) + + feature_extractor = WhisperFeatureExtractor.from_pretrained(self.audio_model_id) + tokenizer = WhisperTokenizer.from_pretrained(self.audio_model_id, language=language, task=task) + processor = WhisperProcessor.from_pretrained(self.audio_model_id, language=language, task=task) + + common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000)) + + def prepare_dataset(batch): + # load and resample audio data from 48 to 16kHz + audio = batch["audio"] + + # compute log-Mel input features from input audio array + batch["input_features"] = feature_extractor( + audio["array"], sampling_rate=audio["sampling_rate"] + ).input_features[0] + + # encode target text to label ids + batch["labels"] = tokenizer(batch["sentence"]).input_ids + return batch + + common_voice = common_voice.map( + prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=2 + ) + data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor) + + model = WhisperForConditionalGeneration.from_pretrained( + self.audio_model_id, quantization_config=BitsAndBytesConfig(load_in_8bit=True), device_map="auto" + ) + + model.config.forced_decoder_ids = None + model.config.suppress_tokens = [] + + model = prepare_model_for_kbit_training(model) + + # as Whisper model uses Conv layer in encoder, checkpointing disables grad computation + # to avoid this, make the inputs trainable + def make_inputs_require_grad(module, 
input, output): + output.requires_grad_(True) + + model.model.encoder.conv1.register_forward_hook(make_inputs_require_grad) + + config = LoraConfig( + r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none" + ) + + model = get_peft_model(model, config) + model.print_trainable_parameters() + + training_args = Seq2SeqTrainingArguments( + output_dir=tmp_dir, # change to a repo name of your choice + per_device_train_batch_size=8, + gradient_accumulation_steps=1, # increase by 2x for every 2x decrease in batch size + learning_rate=1e-3, + warmup_steps=2, + max_steps=3, + fp16=True, + per_device_eval_batch_size=8, + generation_max_length=128, + logging_steps=25, + remove_unused_columns=False, # required as the PeftModel forward doesn't have the signature of the wrapped model's forward + label_names=["labels"], # same reason as above + ) + + trainer = Seq2SeqTrainer( + args=training_args, + model=model, + train_dataset=common_voice["train"], + data_collator=data_collator, + tokenizer=processor.feature_extractor, + ) + + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + def test_4bit_non_default_adapter_name(self): + # See PR 1294 + config = LoraConfig( + r=16, + target_modules=["q_proj", "v_proj"], + bias="none", + task_type="CAUSAL_LM", + ) + + # default adapter name + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + model = prepare_model_for_kbit_training(model) + model = get_peft_model(model, config) + n_trainable_default, n_total_default = model.get_nb_trainable_parameters() + + # other adapter name + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + 
device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + model = prepare_model_for_kbit_training(model) + model = get_peft_model(model, config, adapter_name="other") + n_trainable_other, n_total_other = model.get_nb_trainable_parameters() + + assert n_trainable_other > 0 + # sanity check + assert n_trainable_default == n_trainable_other + assert n_total_default == n_total_other + + @pytest.mark.single_gpu_tests + def test_8bit_non_default_adapter_name(self): + # See PR 1294 + config = LoraConfig( + r=16, + target_modules=["q_proj", "v_proj"], + bias="none", + task_type="CAUSAL_LM", + ) + + # default adapter name + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + ) + model = prepare_model_for_kbit_training(model) + model = get_peft_model(model, config) + n_trainable_default, n_total_default = model.get_nb_trainable_parameters() + + # other adapter name + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + ) + model = prepare_model_for_kbit_training(model) + model = get_peft_model(model, config, adapter_name="other") + n_trainable_other, n_total_other = model.get_nb_trainable_parameters() + + assert n_trainable_other > 0 + # sanity check + assert n_trainable_default == n_trainable_other + assert n_total_default == n_total_other + + @pytest.mark.single_gpu_tests + def test_causal_lm_training_4bit_dora(self): + r""" + Same as test_causal_lm_training_4bit but with DoRA + """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + device_map="auto", + ) + + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + model = prepare_model_for_kbit_training(model) + + config = LoraConfig( + r=16, + 
lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + use_dora=True, + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.multi_gpu_tests + def test_causal_lm_training_multi_gpu_4bit_dora(self): + r""" + Same as test_causal_lm_training_multi_gpu_4bit but with DoRA + """ + + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map=DEVICE_MAP_MAP[self.causal_lm_model_id], + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + + assert set(model.hf_device_map.values()) == set(range(device_count)) + assert {p.device.index for p in model.parameters()} == set(range(device_count)) + + model = prepare_model_for_kbit_training(model) + + setattr(model, "model_parallel", True) + setattr(model, "is_parallelizable", True) + + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + use_dora=True, + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = 
Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + def test_causal_lm_training_8bit_dora(self): + r""" + Same as test_causal_lm_training_4bit_dora but with 8bit + """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + device_map="auto", + ) + + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + model = prepare_model_for_kbit_training(model) + + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + use_dora=True, + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in 
os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.multi_gpu_tests + def test_causal_lm_training_multi_gpu_8bit_dora(self): + r""" + Same as test_causal_lm_training_multi_gpu_4bit_dora but with 8bit + """ + + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map=DEVICE_MAP_MAP[self.causal_lm_model_id], + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + ) + + assert set(model.hf_device_map.values()) == set(range(device_count)) + assert {p.device.index for p in model.parameters()} == set(range(device_count)) + + model = prepare_model_for_kbit_training(model) + + setattr(model, "model_parallel", True) + setattr(model, "is_parallelizable", True) + + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + use_dora=True, + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + def test_causal_lm_training_gpt2_dora(self): + r""" + Same as test_causal_lm_training_4bit but with DoRA + """ + with tempfile.TemporaryDirectory() 
as tmp_dir: + model = AutoModelForCausalLM.from_pretrained("gpt2", device_map="auto") + + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + model = prepare_model_for_kbit_training(model) + + config = LoraConfig( + r=16, + lora_alpha=32, + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + use_dora=True, + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @parameterized.expand(["4bit", "8bit"]) + def test_initialize_dora_with_bnb_on_cpu(self, kbit): + # 1674 + # The issue is that to initialize DoRA, we need to dequantize the weights. That only works on GPU for bnb. + # Therefore, initializing DoRA with bnb on CPU used to fail. 
+ model_id = "facebook/opt-125m" + if kbit == "4bit": + bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4") + elif kbit == "8bit": + bnb_config = BitsAndBytesConfig(load_in_8bit=True) + else: + raise ValueError("Only 4bit and 8bit bnb allowed") + + model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) + model = model.cpu() # ensure that we're on CPU + # sanity check that all weights are on CPU + weights_not_cpu = [name for name, p in model.named_parameters() if p.device != torch.device("cpu")] + assert not weights_not_cpu + + lora_config = LoraConfig(use_dora=True) + + # should not raise + peft_model = get_peft_model(model, lora_config) + # check that the weights are still on CPU + weights_not_cpu = [name for name, p in peft_model.named_parameters() if p.device != torch.device("cpu")] + assert not weights_not_cpu + + @pytest.mark.single_gpu_tests + def test_causal_lm_training_vera(self): + r""" + Same as test_causal_lm_training but with VeRA + """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + device_map="auto", + ) + + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + model = prepare_model_for_kbit_training(model) + + config = VeraConfig( + r=16, + target_modules=["q_proj", "v_proj"], + vera_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + 
model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + def test_causal_lm_training_4bit_vera(self): + r""" + Same as test_causal_lm_training_4bit but with VeRA + """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + device_map="auto", + ) + + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + model = prepare_model_for_kbit_training(model) + + config = VeraConfig( + r=16, + target_modules=["q_proj", "v_proj"], + vera_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.multi_gpu_tests + def test_causal_lm_training_multi_gpu_vera(self): + r""" + Same as test_causal_lm_training_multi_gpu but with VeRA + """ + + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + 
self.causal_lm_model_id, + device_map=DEVICE_MAP_MAP[self.causal_lm_model_id], + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + ) + + assert set(model.hf_device_map.values()) == set(range(device_count)) + assert {p.device.index for p in model.parameters()} == set(range(device_count)) + + model = prepare_model_for_kbit_training(model) + + setattr(model, "model_parallel", True) + setattr(model, "is_parallelizable", True) + + config = VeraConfig( + r=16, + target_modules=["q_proj", "v_proj"], + vera_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.multi_gpu_tests + def test_causal_lm_training_multi_gpu_4bit_vera(self): + r""" + Same as test_causal_lm_training_multi_gpu_4bit but with VeRA + """ + + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map=DEVICE_MAP_MAP[self.causal_lm_model_id], + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + + assert set(model.hf_device_map.values()) == set(range(device_count)) + assert {p.device.index for p in model.parameters()} == set(range(device_count)) + + model = 
prepare_model_for_kbit_training(model) + + setattr(model, "model_parallel", True) + setattr(model, "is_parallelizable", True) + + config = VeraConfig( + r=16, + target_modules=["q_proj", "v_proj"], + vera_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + def test_causal_lm_training_8bit_randlora(self): + r""" + Same as test_causal_lm_training but with RandLora + """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + device_map="auto", + ) + + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + model = prepare_model_for_kbit_training(model) + + config = RandLoraConfig( + r=16, + target_modules=["q_proj", "v_proj"], + randlora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset("ybelkada/english_quotes_copy") + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + 
per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + def test_causal_lm_training_4bit_randlora(self): + r""" + Same as test_causal_lm_training_4bit but with RandLora + """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + device_map="auto", + ) + + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + model = prepare_model_for_kbit_training(model) + + config = RandLoraConfig( + r=16, + target_modules=["q_proj", "v_proj"], + randlora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset("ybelkada/english_quotes_copy") + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is 
not None + + @pytest.mark.multi_gpu_tests + def test_causal_lm_training_multi_gpu_8bit_randlora(self): + r""" + Same as test_causal_lm_training_multi_gpu but with RandLoRA + """ + + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map=DEVICE_MAP_MAP[self.causal_lm_model_id], + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + ) + + assert set(model.hf_device_map.values()) == set(range(device_count)) + assert {p.device.index for p in model.parameters()} == set(range(device_count)) + + model = prepare_model_for_kbit_training(model) + + setattr(model, "model_parallel", True) + setattr(model, "is_parallelizable", True) + + config = RandLoraConfig( + r=16, + target_modules=["q_proj", "v_proj"], + randlora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset("Abirate/english_quotes") + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.multi_gpu_tests + def test_causal_lm_training_multi_gpu_4bit_randlora(self): + r""" + Same as test_causal_lm_training_multi_gpu_4bit but with RandLora + """ + + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + 
device_map=DEVICE_MAP_MAP[self.causal_lm_model_id], + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + + assert set(model.hf_device_map.values()) == set(range(device_count)) + assert {p.device.index for p in model.parameters()} == set(range(device_count)) + + model = prepare_model_for_kbit_training(model) + + setattr(model, "model_parallel", True) + setattr(model, "is_parallelizable", True) + + config = RandLoraConfig( + r=16, + target_modules=["q_proj", "v_proj"], + randlora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset("Abirate/english_quotes") + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + def test_causal_lm_training_8bit_road(self): + r""" + Same as test_causal_lm_training but with RoAd + """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + device_map="auto", + ) + + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + model = prepare_model_for_kbit_training(model) + + config = RoadConfig( + variant="road_1", + target_modules=["q_proj", "v_proj"], + task_type="CAUSAL_LM", + ) + + model = 
get_peft_model(model, config) + + data = load_dataset("ybelkada/english_quotes_copy") + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=1e-3, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + def test_causal_lm_training_4bit_road(self): + r""" + Same as test_causal_lm_training_4bit but with RoAd + """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + device_map="auto", + ) + + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + model = prepare_model_for_kbit_training(model) + + config = RoadConfig( + variant="road_1", + target_modules=["q_proj", "v_proj"], + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset("ybelkada/english_quotes_copy") + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=1e-3, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + 
model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.multi_gpu_tests + def test_causal_lm_training_multi_gpu_8bit_road(self): + r""" + Same as test_causal_lm_training_multi_gpu but with RoAd + """ + + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map=DEVICE_MAP_MAP[self.causal_lm_model_id], + quantization_config=BitsAndBytesConfig(load_in_8bit=True), + ) + + assert set(model.hf_device_map.values()) == set(range(device_count)) + assert {p.device.index for p in model.parameters()} == set(range(device_count)) + + model = prepare_model_for_kbit_training(model) + + setattr(model, "model_parallel", True) + setattr(model, "is_parallelizable", True) + + config = RoadConfig( + variant="road_1", + target_modules=["q_proj", "v_proj"], + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset("Abirate/english_quotes") + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=1e-3, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.multi_gpu_tests + def test_causal_lm_training_multi_gpu_4bit_road(self): + r""" + Same as 
test_causal_lm_training_multi_gpu_4bit but with RoAd + """ + + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map=DEVICE_MAP_MAP[self.causal_lm_model_id], + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + + assert set(model.hf_device_map.values()) == set(range(device_count)) + assert {p.device.index for p in model.parameters()} == set(range(device_count)) + + model = prepare_model_for_kbit_training(model) + + setattr(model, "model_parallel", True) + setattr(model, "is_parallelizable", True) + + config = RoadConfig( + variant="road_1", + target_modules=["q_proj", "v_proj"], + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset("Abirate/english_quotes") + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=1e-3, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + def test_causal_lm_training_lora_resize_embeddings_trainable_tokens(self): + r""" + Test LoRA with trainable tokens on a resized embedding matrix + """ + with tempfile.TemporaryDirectory() as tmp_dir: + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_quant_storage=torch.float16, + bnb_4bit_use_double_quant=True, + ) + + model = 
AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + quantization_config=bnb_config, + device_map="auto", + ) + + # add 2 new tokens + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + new_tokens = ["", ""] + tokenizer.add_special_tokens({"additional_special_tokens": new_tokens}) + trainable_token_indices = [tokenizer.vocab[token] for token in new_tokens] + + cur_emb_size = model.model.decoder.embed_tokens.weight.shape[0] + model.resize_token_embeddings(max(tokenizer.vocab_size, cur_emb_size)) + model = prepare_model_for_kbit_training(model) + + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + trainable_token_indices={"embed_tokens": trainable_token_indices}, + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + + def tokenize(samples): + # add new tokens to samples + samples = [f"{row}" for row in samples["quote"]] + return tokenizer(samples) + + data = data.map(tokenize, batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + # higher learning rate, as embeddings are a bit slow to update + learning_rate=1e-3, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + # ensure that the new trainable tokens have been updated + embedding = model.base_model.model.model.decoder.embed_tokens + tol = 1e-4 + assert not torch.allclose( + embedding.token_adapter.trainable_tokens_delta["default"], + embedding.original_module.weight[trainable_token_indices], + atol=tol, + rtol=tol, + ) + + # check size of the checkpoint, should be small since the embedding matrix does not need to be stored + 
stat = os.stat(os.path.join(tmp_dir, SAFETENSORS_WEIGHTS_NAME)) + embed_params = model.base_model.model.model.decoder.embed_tokens.original_module.weight.numel() + # fp32 -> 4x + emb_file_size = 4 * embed_params + assert stat.st_size < emb_file_size + + # sanity check: assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + +@require_torch_gpu +@require_auto_gptq +@require_optimum +class PeftGPTQGPUTests(unittest.TestCase): + r""" + GPTQ + peft tests + """ + + def setUp(self): + from transformers import GPTQConfig + + self.causal_lm_model_id = "marcsun13/opt-350m-gptq-4bit" + # TODO : check if it works for Exllamav2 kernels + self.quantization_config = GPTQConfig(bits=4, use_exllama=False) + self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + + def tearDown(self): + r""" + Efficient mechanism to free GPU memory after each test. Based on + https://github.com/huggingface/transformers/issues/21094 + """ + clear_device_cache(garbage_collection=True) + + def _check_inference_finite(self, model, batch): + # try inference without Trainer class + training = model.training + model.eval() + output = model(**batch.to(model.device)) + assert torch.isfinite(output.logits).all() + model.train(training) + + @pytest.mark.single_gpu_tests + def test_causal_lm_training(self): + r""" + Test the CausalLM training on a single GPU device. The test would simply fail if the adapters are not set + correctly. 
+ """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + device_map="auto", + quantization_config=self.quantization_config, + ) + + model = prepare_model_for_kbit_training(model) + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + def test_adalora_causalLM(self): + r""" + Tests the gptq training with adalora + """ + + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + device_map="auto", + quantization_config=self.quantization_config, + ) + + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + model = prepare_model_for_kbit_training(model) + + peft_config = AdaLoraConfig( + init_r=6, + target_r=4, + tinit=2, + tfinal=2, + total_step=6, + deltaT=5, + beta1=0.3, + beta2=0.3, + orth_reg_weight=0.2, + lora_alpha=32, + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, peft_config) + + data = 
load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + batch = tokenizer(data["train"][:3]["quote"], return_tensors="pt", padding=True) + self._check_inference_finite(model, batch) + + class OptimizerStepCallback(TrainerCallback): + def on_optimizer_step(self, args, state, control, **kwargs): + model.update_and_allocate(state.global_step) + + step_callback = OptimizerStepCallback() + + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=6, + learning_rate=2e-4, + fp16=True, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + trainer.add_callback(step_callback) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + def test_causal_lm_training_gptq_qalora(self): + """ + Test QALoRA with GPTQ quantization. The test would simply fail if the adapters are not set correctly. 
@pytest.mark.multi_gpu_tests
@require_torch_multi_gpu
def test_causal_lm_training_multi_gpu(self):
    r"""
    Train a LoRA adapter on the causal LM with its modules spread over several devices through an explicit
    `device_map`. The test fails if the adapters are not set up correctly.
    """
    # Device 0: embeddings, projections, LM head and decoder layers 0-5.
    # Device 1: decoder layers 6-11 and the final layer norm.
    device_map = dict.fromkeys(
        [
            "model.decoder.embed_tokens",
            "lm_head",
            "model.decoder.embed_positions",
            "model.decoder.project_out",
            "model.decoder.project_in",
        ],
        0,
    )
    for layer_idx in range(12):
        device_map[f"model.decoder.layers.{layer_idx}"] = 0 if layer_idx < 6 else 1
    device_map["model.decoder.final_layer_norm"] = 1

    with tempfile.TemporaryDirectory() as tmp_dir:
        model = AutoModelForCausalLM.from_pretrained(
            self.causal_lm_model_id,
            torch_dtype=torch.float16,
            device_map=device_map,
            quantization_config=self.quantization_config,
        )

        # both the reported device map and the actual parameters must cover every device
        expected_devices = set(range(device_count))
        assert set(model.hf_device_map.values()) == expected_devices
        assert {p.device.index for p in model.parameters()} == expected_devices

        model = prepare_model_for_kbit_training(model)
        model.model_parallel = True
        model.is_parallelizable = True

        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        model = get_peft_model(model, lora_config)

        def tokenize_quotes(samples):
            return self.tokenizer(samples["quote"])

        data = load_dataset_english_quotes().map(tokenize_quotes, batched=True)

        training_args = TrainingArguments(
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            warmup_steps=2,
            max_steps=3,
            learning_rate=2e-4,
            fp16=True,
            logging_steps=1,
            output_dir=tmp_dir,
        )
        trainer = Trainer(
            model=model,
            train_dataset=data["train"],
            args=training_args,
            data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
        )
        model.config.use_cache = False
        trainer.train()

        model.cpu().save_pretrained(tmp_dir)

        saved_files = os.listdir(tmp_dir)
        assert "adapter_config.json" in saved_files
        assert SAFETENSORS_WEIGHTS_NAME in saved_files

        # a train loss must have been logged
        assert trainer.state.log_history[-1]["train_loss"] is not None
def test_offload_load(self):
    r"""
    Load a LoRA adapter onto a model with CPU- and disk-offloaded modules and check that its output matches the
    output of the same adapter on a model held entirely on CPU.
    """
    torch.manual_seed(0)
    net = AutoModelForCausalLM.from_pretrained(self.causal_lm_model_id)
    tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id)
    # no "disk" key, for compatibility with PeftModel.from_pretrained()
    max_memory = {"cpu": "0.4GIB"}

    # the CPU budget is small enough that some modules are assigned to disk
    offload_map = infer_auto_device_map(net, max_memory=max_memory)
    placements = offload_map.values()
    assert "cpu" in placements
    assert "disk" in placements

    lora_config = LoraConfig(task_type="CAUSAL_LM", init_lora_weights=False, target_modules=["c_attn"])
    net = get_peft_model(net, lora_config)

    with tempfile.TemporaryDirectory() as tmp_dir:
        net.save_pretrained(tmp_dir)

        # reference: adapter on a model that lives entirely on CPU
        net = AutoModelForCausalLM.from_pretrained(self.causal_lm_model_id, device_map="cpu")
        reference_lora = PeftModel.from_pretrained(net, tmp_dir).eval()
        input_tokens = tokenizer.encode("Four score and seven years ago", return_tensors="pt")
        expected = reference_lora(input_tokens)[0]

        # same adapter on top of the offloaded base model
        offloaded_base = AutoModelForCausalLM.from_pretrained(self.causal_lm_model_id, device_map=offload_map)
        param_devices = {p.device for p in offloaded_base.parameters()}
        assert len(param_devices) == 2  # 'cpu' and 'meta'
        offloaded_lora = PeftModel.from_pretrained(offloaded_base, tmp_dir, max_memory=max_memory).eval()
        actual = offloaded_lora(input_tokens)[0]
    assert torch.allclose(expected, actual, atol=1e-5)
+ """ + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained(self.causal_lm_model_id) + tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + memory_limits = {0: "0.2GIB", "cpu": "0.2GIB"} # no "disk" for PeftModel.from_pretrained() compatibility + # offloads around half of all transformer modules + device_map = infer_auto_device_map(model, max_memory=memory_limits) + assert 0 in device_map.values() + assert "cpu" in device_map.values() + assert "disk" in device_map.values() + + config = LoraConfig(task_type="CAUSAL_LM", init_lora_weights=False, target_modules=["c_attn"]) + + model = get_peft_model(model, config) + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + # load the model with device_map + model = AutoModelForCausalLM.from_pretrained(self.causal_lm_model_id, device_map=device_map).eval() + assert len({p.device for p in model.parameters()}) == 2 + + model = PeftModel.from_pretrained(model, tmp_dir, max_memory=memory_limits) + + input_tokens = tokenizer.encode("Four score and seven years ago", return_tensors="pt") + model.eval() + + # test peft model adapter merge + pre_merge_olayer = model(input_tokens)[0] + model.merge_adapter() + post_merge_olayer = model(input_tokens)[0] + assert torch.allclose(post_merge_olayer, pre_merge_olayer) + + # test peft model adapter unmerge + model.unmerge_adapter() + post_unmerge_olayer = model(input_tokens)[0] + assert torch.allclose(post_unmerge_olayer, pre_merge_olayer) + + # test LoRA merge and unload + model = model.merge_and_unload() + post_unload_merge_olayer = model(input_tokens)[0] + assert torch.allclose(post_unload_merge_olayer, pre_merge_olayer) + + +@pytest.mark.skipif(not (torch.cuda.is_available() or is_xpu_available()), reason="test requires a GPU or XPU") +@pytest.mark.single_gpu_tests +class TestPiSSA: + r""" + Tests for PiSSA to ensure that it reduces the quantization error compared to normal LoRA quantization. 
def get_errors(
    self,
    tmp_path,
    bits=4,
    device="cuda",
    model_id="hf-internal-testing/tiny-random-BloomForCausalLM",
):
    """
    Assert that PiSSA initialization reduces the weight quantization error compared to plain LoRA.

    Two quantized variants of `model_id` are compared against the unquantized base model using `nuclear_norm`:
    one quantized directly with a default (no-op) LoRA adapter merged in, and one where the PiSSA residual model is
    quantized and the saved PiSSA adapter is merged back on top. Both errors must be positive and the PiSSA error
    must be smaller than the LoRA error divided by `self.error_factor`.
    """
    if "t5" in str(model_id):
        model_cls = AutoModelForSeq2SeqLM
    else:
        model_cls = AutoModelForCausalLM
    base_model = model_cls.from_pretrained(model_id).eval().to(device)

    if base_model.config.is_encoder_decoder:
        task_type, target_modules = TaskType.SEQ_2_SEQ_LM, ["o", "k", "wi", "q", "v"]
    else:
        task_type, target_modules = TaskType.CAUSAL_LM, "all-linear"

    adapter_dir = tmp_path / "pissa_model"
    residual_dir = tmp_path / "residual_model"

    # baseline: quantize the model directly, then merge a default-initialized LoRA adapter
    plain_config = LoraConfig(task_type=task_type, target_modules=target_modules)
    qlora_model = model_cls.from_pretrained(model_id).eval().to(device)
    qlora_model = self.quantize_model(qlora_model, bits, device)
    qlora_model = get_peft_model(qlora_model, plain_config).merge_and_unload()
    qlora_error = self.nuclear_norm(base_model, qlora_model)
    del qlora_model
    clear_device_cache(garbage_collection=True)

    # PiSSA: initialize the adapter from the base weights
    pissa_config = LoraConfig(
        task_type=task_type,
        init_lora_weights="pissa",
        target_modules=target_modules,
    )
    pissa_model = model_cls.from_pretrained(model_id).eval().to(device)
    pissa_model = get_peft_model(pissa_model, pissa_config)

    # store the adapter with init_lora_weights=True in its config, then store the residual base model
    pissa_model.base_model.peft_config["default"].init_lora_weights = True
    pissa_model.save_pretrained(adapter_dir)
    pissa_model = pissa_model.unload()
    pissa_model.save_pretrained(residual_dir)
    del pissa_model
    clear_device_cache(garbage_collection=True)

    # quantize the residual model and put the PiSSA-initialized adapter back on top
    qpissa_model = model_cls.from_pretrained(residual_dir).eval().to(device)
    qpissa_model = self.quantize_model(qpissa_model, bits, device)
    qpissa_model = PeftModel.from_pretrained(qpissa_model, adapter_dir).merge_and_unload()
    qpissa_error = self.nuclear_norm(base_model, qpissa_model)
    del qpissa_model
    clear_device_cache(garbage_collection=True)

    assert qlora_error > 0.0
    assert qpissa_error > 0.0

    # PiSSA must beat plain LoRA by at least the configured margin
    assert qpissa_error < (qlora_error / self.error_factor)
@require_bitsandbytes
def test_lora_pissa_conversion_same_output_after_loading_with_quantization(self, tmp_path):
    """
    Check PiSSA save/load and PiSSA-to-LoRA conversion when the targeted linear layer is a bnb 4bit layer.

    Steps: initialize PiSSA on a float model, save the initial adapter and the residual base weights, replace the
    residual linear layer by a `bnb.nn.Linear4bit`, load the adapter on top, modify `lora_B`, then save the adapter
    once normally and once with `path_initial_model_for_weight_conversion`. A normal save/load must reproduce the
    output; the converted adapter loaded on the original float model is asserted NOT to reproduce it.
    """
    # A copy of the test `test_lora_pissa_conversion_same_output_after_loading` in peft/tests/test_initialization.py,
    # that would fail if bitsandbytes quantization is used because Quant(W_res) + AB !=Quant(W) + \Delta(AB).
    import bitsandbytes as bnb

    torch.manual_seed(0)
    data = torch.rand(10, 1000).to(torch_device)

    class MyModule(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # choose a large weight so that averages are close to expected values
            self.linear = torch.nn.Linear(1000, 1000)
            self.embed = torch.nn.Embedding(1000, 1000)
            self.conv2d = torch.nn.Conv2d(100, 100, 3)

        def forward(self, x):
            # derive integer indices and a 4d view from the same input so every layer can be fed
            x_int = (100 * x).int()
            x_4d = x.flatten().reshape(1, 100, 10, 10)
            return self.linear(x), self.embed(x_int), self.conv2d(x_4d)

    model = MyModule().to(torch_device)
    # only the output of the linear layer (index 0) is compared throughout this test
    output_base = model(data)[0]

    config = LoraConfig(init_lora_weights="pissa", target_modules=["linear"], r=8)
    peft_model = get_peft_model(deepcopy(model), config)
    # save the initial model
    peft_model.peft_config["default"].init_lora_weights = True
    peft_model.save_pretrained(tmp_path / "init-model")
    # after unloading, the state dict holds the residual base weights left behind by PiSSA
    peft_model = peft_model.unload()
    torch.save(peft_model.state_dict(), tmp_path / "residual-model")
    del peft_model

    # create 4bit base model
    base_model = deepcopy(model)
    base_model.load_state_dict(torch.load(tmp_path / "residual-model"))
    # sanity check: the base model weights were indeed changed
    tol = 1e-06
    assert not torch.allclose(model.linear.weight, base_model.linear.weight, atol=tol, rtol=tol)
    # quantize the linear layer
    linear4bit = bnb.nn.Linear4bit(base_model.linear.in_features, base_model.linear.out_features)
    linear4bit.load_state_dict(base_model.linear.state_dict())
    # NOTE(review): device index 0 is hard-coded here while the rest of the test uses `torch_device` -- confirm
    # this is intended on non-CUDA accelerators
    linear4bit.to(0)
    base_model.linear = linear4bit
    peft_model = PeftModel.from_pretrained(deepcopy(base_model), tmp_path / "init-model")
    output_quantized_pissa = peft_model(data)[0]
    # sanity check
    tol = 1e-06  # same value as above, re-assigned
    assert not torch.allclose(output_base, output_quantized_pissa, atol=tol, rtol=tol)

    # modify the weights, or else the adapter performs an identity transformation
    peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0
    output_finetuned_pissa = peft_model(data)[0]
    # sanity check
    tol = 1e-06  # same value as above, re-assigned
    assert not torch.allclose(output_quantized_pissa, output_finetuned_pissa, atol=tol, rtol=tol)

    # save the model normally
    peft_model.save_pretrained(tmp_path / "pissa-model")
    model_loaded = PeftModel.from_pretrained(deepcopy(base_model), tmp_path / "pissa-model")
    output_loaded = model_loaded(data)[0]

    assert torch.allclose(output_finetuned_pissa, output_loaded, atol=tol, rtol=tol)
    # sanity check: ranks should still be 8 as initially
    assert model_loaded.peft_config["default"].r == 8
    assert model_loaded.base_model.model.linear.lora_A["default"].weight.shape[0] == 8

    # save the model with conversion
    peft_model.save_pretrained(
        tmp_path / "pissa-model-converted", path_initial_model_for_weight_conversion=tmp_path / "init-model"
    )
    # the converted adapter is loaded on the original float model, not on the quantized residual model
    model_converted = PeftModel.from_pretrained(deepcopy(model), tmp_path / "pissa-model-converted")
    output_converted = model_converted(data)[0]

    # rank should be double of what it was initially
    assert model_converted.peft_config["default"].r == 16
    assert model_converted.base_model.model.linear.lora_A["default"].weight.shape[0] == 16
    # base model weights should be the same as the initial model
    assert torch.allclose(
        model.linear.weight, model_converted.base_model.model.linear.base_layer.weight, atol=tol, rtol=tol
    )
    # This check is expected to fail when using bnb
    assert not torch.allclose(output_finetuned_pissa, output_converted, atol=tol, rtol=tol)
def nuclear_norm(self, base_model, quantized_model):
    """
    Return the summed nuclear norm of the weight differences between `base_model` and `quantized_model`.

    For every `torch.nn.Linear` module of `base_model` whose name does not contain "lm_head", the module with the
    same name is looked up in `quantized_model` and the sum of the singular values of the weight difference is
    taken. The per-layer values are summed into a single tensor.
    """
    per_layer_norms = []
    for layer_name, layer in base_model.named_modules():
        # only linear layers outside the LM head contribute to the error
        if "lm_head" in layer_name or not isinstance(layer, torch.nn.Linear):
            continue
        counterpart = quantized_model.get_submodule(layer_name)
        weight_delta = layer.weight.data - counterpart.weight.data
        per_layer_norms.append(torch.linalg.svdvals(weight_delta).sum())
    return torch.Tensor(per_layer_norms).sum()
+ + cls = AutoModelForSeq2SeqLM if "t5" in str(model_id) else AutoModelForCausalLM + base_model = cls.from_pretrained(model_id).eval().to(device) + task_type = TaskType.SEQ_2_SEQ_LM if base_model.config.is_encoder_decoder else TaskType.CAUSAL_LM + + # logits from the normal quantized LoRA model + target_modules = "all-linear" if task_type != TaskType.SEQ_2_SEQ_LM else ["o", "k", "wi", "q", "v"] + lora_config = LoraConfig(task_type=task_type, target_modules=target_modules) + + qlora_model = self.quantize_model(cls.from_pretrained(model_id).eval().to(device), bits, device) + qlora_model = get_peft_model( + qlora_model, + lora_config, + ) + qlora_model = qlora_model.merge_and_unload() + qlora_error = self.nuclear_norm(base_model, qlora_model) + del qlora_model + clear_device_cache(garbage_collection=True) + + # logits from quantized LoRA model using OLoRA + lora_config = LoraConfig( + task_type=task_type, + init_lora_weights="olora", + target_modules=target_modules, + ) + olora_model = cls.from_pretrained(model_id).eval().to(device) + olora_model = get_peft_model(olora_model, lora_config) + + # save LoRA weights, they should be initialized such that they minimize the quantization error + olora_model.base_model.peft_config["default"].init_lora_weights = True + olora_model.save_pretrained(tmp_path / "olora_model") + + olora_model = olora_model.unload() + olora_model.save_pretrained(tmp_path / "residual_model") + + del olora_model + clear_device_cache(garbage_collection=True) + + # now load quantized model and apply OLoRA-initialized weights on top + qolora_model = self.quantize_model( + cls.from_pretrained(tmp_path / "residual_model").eval().to(device), bits, device + ) + qolora_model = PeftModel.from_pretrained(qolora_model, tmp_path / "olora_model") + qolora_model = qolora_model.merge_and_unload() + qolora_error = self.nuclear_norm(base_model, qolora_model) + del qolora_model + clear_device_cache(garbage_collection=True) + + assert qlora_error > 0.0 + assert 
@pytest.mark.parametrize("device", [torch_device, "cpu"])
def test_bloomz_olora_4bit(self, device, tmp_path):
    # In this test, we compare the weights of the base model with those of the quantized LoRA model and of the
    # quantized model using OLoRA (`get_errors` measures the nuclear norm of the weight differences, not logits).
    # When quantizing, we expect a certain level of error. However, we expect the OLoRA quantized model to have
    # less error than the normal LoRA quantized model. Note that when using normal LoRA, the quantization error is
    # simply the error from quantization without LoRA, as LoRA is a no-op before training.
    # We still apply LoRA for the test for consistency.

    self.get_errors(bits=4, device=device, tmp_path=tmp_path)
@require_bitsandbytes
@pytest.mark.parametrize("bits", [4, 8])
def test_olora_with_quantized_model(self, bits):
    """
    Check that OLoRA initialization works on a bitsandbytes-quantized model (issue 1999).

    The base layer weight must keep its bnb parameter type after `get_peft_model`, and a forward pass must run
    without raising and produce finite logits.

    The test imports bitsandbytes, so it carries `require_bitsandbytes` like the other bnb-dependent tests in this
    file instead of failing with an ImportError when the package is missing.
    """
    import bitsandbytes as bnb

    # issue 1999
    model_id = "hf-internal-testing/tiny-random-OPTForCausalLM"
    if bits == 4:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_storage=torch.float16,
            bnb_4bit_use_double_quant=True,
        )
    elif bits == 8:
        bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    else:
        raise ValueError("bits must be 4 or 8")

    model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
    model = prepare_model_for_kbit_training(model)
    config = LoraConfig(init_lora_weights="olora")
    model = get_peft_model(model, config)

    # check that the correct type is used for the weights
    base_layer = model.base_model.model.model.decoder.layers[0].self_attn.v_proj.base_layer.weight
    if bits == 4:
        assert isinstance(base_layer, bnb.nn.modules.Params4bit)
    else:
        assert isinstance(base_layer, bnb.nn.modules.Int8Params)

    inputs = torch.arange(10).unsqueeze(0).to(model.device)
    logits = model(inputs).logits  # does not raise
    assert torch.isfinite(logits).all()
def get_logits(self, model, inputs):
    """
    Run `model` on `inputs` and return the `logits` attribute of its output.

    When `model.config.is_encoder_decoder` is set, only `inputs["input_ids"]` is used and it is passed both as
    `input_ids` and as `decoder_input_ids`. Otherwise all entries of `inputs` are passed as keyword arguments.
    """
    if model.config.is_encoder_decoder:
        token_ids = inputs["input_ids"]
        call_kwargs = {"input_ids": token_ids, "decoder_input_ids": token_ids}
    else:
        call_kwargs = inputs
    return model(**call_kwargs).logits
+ torch.manual_seed(0) + model = self.get_base_model(model_id, device) + task_type = TaskType.SEQ_2_SEQ_LM if model.config.is_encoder_decoder else TaskType.CAUSAL_LM + inputs = self.get_input(model_id, device) + # the base logits are the reference, we try to match those as closely as possible + logits_base = self.get_logits(model, inputs) + # clean up + del model + clear_device_cache(garbage_collection=True) + + # logits from the normal quantized LoRA model + target_modules = "all-linear" if task_type != TaskType.SEQ_2_SEQ_LM else ["o", "k", "wi", "q", "v"] + lora_config = LoraConfig(task_type=task_type, use_dora=use_dora, target_modules=target_modules) + kwargs = {} + if bits == 4: + kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4") + elif bits == 8: + kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True) + else: + raise ValueError("bits must be 4 or 8") + + quantized_model = get_peft_model( + self.get_base_model(model_id, device, **kwargs), + lora_config, + ) + torch.manual_seed(0) + logits_quantized = self.get_logits(quantized_model, inputs) + del quantized_model + clear_device_cache(garbage_collection=True) + + # logits from quantized LoRA model using LoftQ + loftq_config = LoftQConfig(loftq_bits=bits, loftq_iter=loftq_iter) + lora_config = LoraConfig( + task_type=task_type, + init_lora_weights="loftq", + loftq_config=loftq_config, + use_dora=use_dora, + target_modules=target_modules, + ) + model = self.get_base_model(model_id, device) + if device != "cpu": + model = model.to(device) + loftq_model = get_peft_model(model, lora_config) + if device != "cpu": + loftq_model = loftq_model.to(device) + + # save LoRA weights, they should be initialized such that they minimize the quantization error + loftq_model.base_model.peft_config["default"].init_lora_weights = True + loftq_model.save_pretrained(tmp_path / "loftq_model") + + loftq_model = loftq_model.unload() + loftq_model.save_pretrained(tmp_path / 
@pytest.mark.parametrize("device", [torch_device, "cpu"])
def test_bloomz_loftq_4bit(self, device, tmp_path):
    """
    Check that LoftQ reduces the 4bit quantization error of the default Bloom test model.

    `get_errors` returns the MAE and MSE between the base model logits and (a) the logits of the quantized model
    with a default LoRA adapter and (b) the logits of the quantized model with a LoftQ-initialized adapter. With
    normal LoRA the error is simply the quantization error, as LoRA is a no-op before training; LoRA is still
    applied for consistency.
    """
    mae_quantized, mse_quantized, mae_loftq, mse_loftq = self.get_errors(bits=4, device=device, tmp_path=tmp_path)
    # first, sanity check that all errors are > 0.0
    assert mae_quantized > 0.0
    assert mse_quantized > 0.0
    assert mae_loftq > 0.0
    assert mse_loftq > 0.0

    # next, check that LoftQ quantization errors are smaller than LoRA errors by a certain margin; the margin
    # depends on the device and comes from `get_error_factor`, as in the other LoftQ tests of this class
    error_factor = self.get_error_factor(device)
    assert mse_loftq < (mse_quantized / error_factor)
    assert mae_loftq < (mae_quantized / error_factor)
@pytest.mark.parametrize("device", [torch_device, "cpu"])
def test_t5_loftq_4bit(self, device, tmp_path):
    """Check that LoftQ reduces the 4bit quantization error for the "t5-small" encoder-decoder model."""
    errors = self.get_errors(bits=4, device=device, model_id="t5-small", tmp_path=tmp_path)
    mae_quantized, mse_quantized, mae_loftq, mse_loftq = errors

    # sanity check: every error must be strictly positive
    for error_value in (mae_quantized, mse_quantized, mae_loftq, mse_loftq):
        assert error_value > 0.0

    # LoftQ must beat plain quantized LoRA by the device-specific margin
    margin = self.get_error_factor(device)
    assert mse_loftq < (mse_quantized / margin)
    assert mae_loftq < (mae_quantized / margin)
test_t5_loftq_8bit(self, device, tmp_path): + mae_quantized, mse_quantized, mae_loftq, mse_loftq = self.get_errors( + bits=8, device=device, model_id="t5-small", tmp_path=tmp_path + ) + # first, sanity check that all errors are > 0.0 + assert mae_quantized > 0.0 + assert mse_quantized > 0.0 + assert mae_loftq > 0.0 + assert mse_loftq > 0.0 + + # next, check that LoftQ quantization errors are smaller than LoRA errors by a certain margin + error_factor = self.get_error_factor(device) + assert mse_loftq < (mse_quantized / error_factor) + assert mae_loftq < (mae_quantized / error_factor) + + @pytest.mark.xfail # failing for now, but having DoRA pass is only a nice-to-have, not a must, so we're good + @pytest.mark.parametrize("device", [torch_device, "cpu"]) + def test_bloomz_loftq_4bit_dora(self, device, tmp_path): + # same as test_bloomz_loftq_4bit but with DoRA + mae_quantized, mse_quantized, mae_loftq, mse_loftq = self.get_errors( + bits=4, device=device, use_dora=True, tmp_path=tmp_path + ) + # first, sanity check that all errors are > 0.0 + assert mae_quantized > 0.0 + assert mse_quantized > 0.0 + assert mae_loftq > 0.0 + assert mse_loftq > 0.0 + + # next, check that LoftQ quantization errors are smaller than LoRA errors by a certain margin + factor = 3 + assert mae_loftq < (mae_quantized / factor) + assert mse_loftq < (mse_quantized / factor) + + @pytest.mark.parametrize("device", [torch_device, "cpu"]) + def test_bloomz_loftq_8bit_dora(self, device, tmp_path): + # same as test_bloomz_loftq_8bit but with DoRA + mae_quantized, mse_quantized, mae_loftq, mse_loftq = self.get_errors( + bits=8, device=device, use_dora=True, tmp_path=tmp_path + ) + + # first, sanity check that all errors are > 0.0 + assert mae_quantized > 0.0 + assert mse_quantized > 0.0 + assert mae_loftq > 0.0 + assert mse_loftq > 0.0 + + # next, check that LoftQ quantization errors are smaller than LoRA errors by a certain margin + error_factor = self.get_error_factor(device) + assert mae_loftq < 
(mae_quantized / error_factor) + assert mse_loftq < (mse_quantized / error_factor) + + def test_replace_lora_weights_with_loftq_using_callable(self): + """ + Test replacing LoRa weights with LoFTQ using a callable. + + Using the replace_lora_weights_loftq function, we replace the LoRa weights of a bnb-quantized model with LoRA + weights initialized by LoftQ on the fly. We use a callable to decide whether to replace the weights or not. + This callable checks, for each weight, if replacing it would actually result in logits that are closer to the + original logits of the non-quantized model. + + """ + torch.manual_seed(0) + model_id = "bigscience/bloomz-560m" + device = torch_device + tokenizer = AutoTokenizer.from_pretrained(model_id) + inputs = tokenizer("The dog was", padding=True, return_tensors="pt").to(device) + + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained(model_id).to(device) + logits_base = model(**inputs).logits + model.save_pretrained(tmp_dir) + + # load in 4bit + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + ) + model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config) + model = get_peft_model(model, LoraConfig(task_type="CAUSAL_LM", target_modules="all-linear")) + logits_lora = model(**inputs).logits + + current_mse = float("inf") + logs = [] + + def my_callback(model, module_name): + """Callable to replace weights with LoFTQ if the mse is lower than the current best one.""" + nonlocal current_mse + + logits = model(**inputs).logits + mse = ((logits_base - logits) ** 2).mean() + if mse < current_mse: + current_mse = mse + logs.append(True) + return True + logs.append(False) + return False + + replace_lora_weights_loftq(model, model_path=tmp_dir, callback=my_callback) + logits_loftq = model(**inputs).logits + + mae_lora = (logits_base - logits_lora).abs().mean() + mae_loftq = (logits_base - logits_loftq).abs().mean() + mse_lora = 
((logits_base - logits_lora) ** 2).mean() + mse_loftq = ((logits_base - logits_loftq) ** 2).mean() + + # check that the error was reduced by a certain margin + assert mae_loftq * 1.5 < mae_lora + assert mse_loftq * 2.5 < mse_lora + + # check that the callback has returned some True and some False values + assert any(logs) + assert not all(logs) + + del model + clear_device_cache(garbage_collection=True) + + def test_replace_lora_weights_with_local_model(self): + # see issue 2020 + torch.manual_seed(0) + model_id = "hf-internal-testing/tiny-random-OPTForCausalLM" + device = torch_device + + with tempfile.TemporaryDirectory() as tmp_dir: + # save base model locally + model = AutoModelForCausalLM.from_pretrained(model_id).to(device) + model.save_pretrained(tmp_dir) + del model + + # load in 4bit + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + ) + + # load the base model from local directory + model = AutoModelForCausalLM.from_pretrained(tmp_dir, quantization_config=bnb_config) + model = get_peft_model(model, LoraConfig()) + + # passing the local path directly works + replace_lora_weights_loftq(model, model_path=tmp_dir) + del model + + # load the base model from local directory + model = AutoModelForCausalLM.from_pretrained(tmp_dir, quantization_config=bnb_config) + model = get_peft_model(model, LoraConfig()) + + # when not passing, ensure that users are made aware of the `model_path` argument + with pytest.raises(ValueError, match="model_path"): + replace_lora_weights_loftq(model) + + del model + clear_device_cache(garbage_collection=True) + + def test_config_no_loftq_init(self): + with pytest.warns( + UserWarning, + match="`loftq_config` specified but will be ignored when `init_lora_weights` is not 'loftq'.", + ): + LoraConfig(loftq_config=LoftQConfig()) + + def test_config_no_loftq_config(self): + with pytest.raises(ValueError, match="`loftq_config` must be specified when `init_lora_weights` is 'loftq'."): + 
LoraConfig(init_lora_weights="loftq") + + +@require_bitsandbytes +@require_non_cpu +class MultiprocessTester(unittest.TestCase): + def test_notebook_launcher(self): + script_path = os.path.join("scripts", "launch_notebook_mp.py") + cmd = ["python", script_path] + with patch_environment(omp_num_threads=1): + run_command(cmd, env=os.environ.copy()) + + +@require_non_cpu +class MixedPrecisionTests(unittest.TestCase): + def setUp(self): + self.causal_lm_model_id = "facebook/opt-125m" + self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + self.config = LoraConfig( + r=16, + lora_alpha=32, + task_type="CAUSAL_LM", + ) + + data = load_dataset_english_quotes() + self.data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + def tearDown(self): + r""" + Efficient mechanism to free GPU memory after each test. Based on + https://github.com/huggingface/transformers/issues/21094 + """ + clear_device_cache(garbage_collection=True) + gc.collect() + + @pytest.mark.single_gpu_tests + def test_model_using_float16_with_amp_raises(self): + # This test shows the issue with using a model in fp16 and then trying to use it with mixed precision training, + # which should not use fp16. 
+ model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + ) + model = get_peft_model(model, self.config, autocast_adapter_dtype=False) + + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = Trainer( + model=model, + train_dataset=self.data["train"], + args=TrainingArguments( + fp16=True, # <= this is required for the error to be raised + output_dir=tmp_dir, + max_steps=3, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + with pytest.raises(ValueError, match="Attempting to unscale FP16 gradients."): + trainer.train() + + @pytest.mark.single_gpu_tests + def test_model_using_float16_autocast_dtype(self): + # Here we use autocast_adapter_dtype=True (the default) to automatically promote the adapter weights to float32. + # No exception should be raised. + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + ) + model = get_peft_model(model, self.config, autocast_adapter_dtype=True) + + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = Trainer( + model=model, + train_dataset=self.data["train"], + args=TrainingArguments( + fp16=True, # <= this is required for the error to be raised + output_dir=tmp_dir, + max_steps=3, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + trainer.train() # does not raise + + @pytest.mark.single_gpu_tests + def test_model_using_float16_explicit_cast(self): + # Same test as above but containing the fix to make it work + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + ) + model = get_peft_model(model, self.config, autocast_adapter_dtype=False) + + # here we manually promote the adapter weights to float32 + for param in model.parameters(): + if param.requires_grad: + param.data = param.data.float() + + dtype_counts_before = Counter(p.dtype for p in model.parameters()) + model = 
AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + ) + + model = get_peft_model(model, self.config, autocast_adapter_dtype=True) + dtype_counts_after = Counter(p.dtype for p in model.parameters()) + assert dtype_counts_before == dtype_counts_after + + with tempfile.TemporaryDirectory() as tmp_dir: + trainer = Trainer( + model=model, + train_dataset=self.data["train"], + args=TrainingArguments( + fp16=True, # <= this is required for the error to be raised + max_steps=3, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + trainer.train() # does not raise + + @pytest.mark.single_gpu_tests + def test_load_model_using_float16_with_amp_raises(self): + # Same as previous tests, but loading the adapter with PeftModel.from_pretrained instead + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + ) + model = get_peft_model(model, self.config, autocast_adapter_dtype=False) + + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + model = AutoModelForCausalLM.from_pretrained(self.causal_lm_model_id, torch_dtype=torch.float16) + model = PeftModel.from_pretrained(model, tmp_dir, autocast_adapter_dtype=False, is_trainable=True) + + trainer = Trainer( + model=model, + train_dataset=self.data["train"], + args=TrainingArguments( + fp16=True, # <= this is required for the error to be raised + output_dir=tmp_dir, + max_steps=3, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + with pytest.raises(ValueError, match="Attempting to unscale FP16 gradients."): + trainer.train() + + @pytest.mark.single_gpu_tests + def test_load_model_using_float16_autocast_dtype(self): + # Same as previous tests, but loading the adapter with PeftModel.from_pretrained instead + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + ) + # Below, we 
purposefully set autocast_adapter_dtype=False so that the saved adapter uses float16. We still want + # the loaded adapter to use float32 when we load it with autocast_adapter_dtype=True. + model = get_peft_model(model, self.config, autocast_adapter_dtype=False) + # sanity check: this should have float16 adapter weights: + assert ( + model.base_model.model.model.decoder.layers[0].self_attn.v_proj.lora_A["default"].weight.dtype + == torch.float16 + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + model = AutoModelForCausalLM.from_pretrained(self.causal_lm_model_id, torch_dtype=torch.float16) + model = PeftModel.from_pretrained(model, tmp_dir, autocast_adapter_dtype=True, is_trainable=True) + # sanity check: this should NOT have float16 adapter weights: + assert ( + model.base_model.model.model.decoder.layers[0].self_attn.v_proj.lora_A["default"].weight.dtype + == torch.float32 + ) + + trainer = Trainer( + model=model, + train_dataset=self.data["train"], + args=TrainingArguments( + fp16=True, # <= this is required for the error to be raised + output_dir=tmp_dir, + max_steps=3, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + trainer.train() # does not raise + + @pytest.mark.single_gpu_tests + def test_load_adapter_using_float16_autocast_dtype(self): + # Here we test the load_adapter method with autocast_adapter_dtype. We show that autocasting is prevented when + # calling load_model(..., autocast_adapter_dtype=False) and that it is enabled when calling + # load_model(..., autocast_adapter_dtype=True) (the default). + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + torch_dtype=torch.float16, + ) + # Below, we purposefully set autocast_adapter_dtype=False so that the saved adapter uses float16. We still want + # the loaded adapter to use float32 when we load it with autocast_adapter_dtype=True. 
+ model = get_peft_model(model, self.config, autocast_adapter_dtype=False) + # sanity check: this should have float16 adapter weights: + assert ( + model.base_model.model.model.decoder.layers[0].self_attn.v_proj.lora_A["default"].weight.dtype + == torch.float16 + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + model = AutoModelForCausalLM.from_pretrained(self.causal_lm_model_id, torch_dtype=torch.float16) + # the default adapter is now in float16 + model = get_peft_model(model, self.config, autocast_adapter_dtype=False) + # sanity check: this should NOT have float16 adapter weights: + assert ( + model.base_model.model.model.decoder.layers[0].self_attn.v_proj.lora_A["default"].weight.dtype + == torch.float16 + ) + + # now load the first adapter in float16 using the adapter name "loaded16" + model.load_adapter(tmp_dir, "loaded16", autocast_adapter_dtype=False) + assert ( + model.base_model.model.model.decoder.layers[0].self_attn.v_proj.lora_A["loaded16"].weight.dtype + == torch.float16 + ) + + # now load the first adapter in float32 using the adapter name "loaded32" + model.load_adapter(tmp_dir, "loaded32", autocast_adapter_dtype=True) + assert ( + model.base_model.model.model.decoder.layers[0].self_attn.v_proj.lora_A["loaded32"].weight.dtype + == torch.float32 + ) + + # training with the default adapter, which is in float16, should raise + model.set_adapter("default") + trainer = Trainer( + model=model, + train_dataset=self.data["train"], + args=TrainingArguments( + fp16=True, # <= this is required for the error to be raised + output_dir=tmp_dir, + max_steps=3, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + with pytest.raises(ValueError, match="Attempting to unscale FP16 gradients."): + trainer.train() + + # training the model with the adapter "loaded16", which is in float16, should also raise + model.set_adapter("loaded16") + trainer = Trainer( + model=model, + 
train_dataset=self.data["train"], + args=TrainingArguments( + fp16=True, # <= this is required for the error to be raised + output_dir=tmp_dir, + max_steps=3, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + with pytest.raises(ValueError, match="Attempting to unscale FP16 gradients."): + trainer.train() + + # training the model with the adapter "loaded32", which is in float32, should not raise + model.set_adapter("loaded32") + trainer = Trainer( + model=model, + train_dataset=self.data["train"], + args=TrainingArguments( + fp16=True, # <= this is required for the error to be raised + output_dir=tmp_dir, + max_steps=3, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + trainer.train() # does not raise + + +@require_non_xpu +@require_torch_gpu +@require_aqlm +@unittest.skipUnless( + version.parse(importlib.metadata.version("transformers")) >= version.parse("4.38.0"), + "test requires `transformers>=4.38.0`", +) +class PeftAqlmGPUTests(unittest.TestCase): + r""" + AQLM + peft tests + """ + + def setUp(self): + self.causal_lm_model_id = "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf" + self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + + def tearDown(self): + r""" + Efficient mechanism to free GPU memory after each test. Based on + https://github.com/huggingface/transformers/issues/21094 + """ + clear_device_cache(garbage_collection=True) + + def _check_inference_finite(self, model, batch): + # try inference without Trainer class + training = model.training + model.eval() + output = model(**batch.to(model.device)) + assert torch.isfinite(output.logits).all() + model.train(training) + + @pytest.mark.single_gpu_tests + def test_causal_lm_training_aqlm(self): + r""" + Test the CausalLM training on a single GPU device. The test would simply fail if the adapters are not set + correctly. 
+ """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map="cuda", + torch_dtype="auto", + ) + + model = prepare_model_for_kbit_training(model) + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + logging_steps=1, + output_dir=tmp_dir, + fp16=True, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + +@require_non_xpu +@require_torch_gpu +@require_hqq +@unittest.skipUnless( + version.parse(importlib.metadata.version("transformers")) >= version.parse("4.36.1"), + "test requires `transformers>=4.36.1`", +) +class PeftHqqGPUTests(unittest.TestCase): + r""" + HQQ + peft tests + """ + + def setUp(self): + self.causal_lm_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + + def tearDown(self): + r""" + Efficient mechanism to free GPU memory after each test. 
Based on + https://github.com/huggingface/transformers/issues/21094 + """ + clear_device_cache(garbage_collection=True) + + @pytest.mark.single_gpu_tests + @parameterized.expand([False, True]) + def test_causal_lm_training_hqq(self, use_dora): + r""" + Test the CausalLM training on a single GPU device. The test would simply fail if the adapters are not set + correctly. + """ + + from transformers import HqqConfig + + with tempfile.TemporaryDirectory() as tmp_dir: + device = "cuda" + compute_dtype = torch.float16 + + quant_config = HqqConfig(nbits=4, group_size=64) + + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map=device, + torch_dtype=compute_dtype, + quantization_config=quant_config, + ) + + model = prepare_model_for_kbit_training(model) + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + use_dora=use_dora, + ) + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + logging_steps=1, + output_dir=tmp_dir, + fp16=True, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.single_gpu_tests + def test_hqq_lora_model_outputs(self): + # check that the outputs generated by HQQ with LoRA are similar to those without HQQ + from transformers import HqqConfig + + device = "cuda" + 
compute_dtype = torch.float16 + min_correlation = 0.96 + + # first load the model without HQQ + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map=device, + torch_dtype=compute_dtype, + ) + config = LoraConfig( + target_modules=["q_proj", "v_proj"], + task_type="CAUSAL_LM", + init_lora_weights=False, + ) + torch.manual_seed(0) + model = get_peft_model(model, config).eval() + inputs = self.tokenizer("The meaning of unit tests is", return_tensors="pt").to(model.device) + + with torch.inference_mode(): + output_normal = model(**inputs).logits + assert torch.isfinite(output_normal).all() + + del model + clear_device_cache(garbage_collection=True) + + # now load with HQQ + quant_config = HqqConfig(nbits=4, group_size=64) + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map=device, + torch_dtype=compute_dtype, + quantization_config=quant_config, + ) + torch.manual_seed(0) + model = get_peft_model(model, config).eval() + with torch.inference_mode(): + output_hqq = model(**inputs).logits + + # check that outputs of HQQ are highly correlated; there are outliers, so don't check for equality + cc_matrix = torch.corrcoef(torch.stack((output_normal.float().flatten(), output_hqq.float().flatten()))) + assert cc_matrix.min() > min_correlation + + # check that outputs are the same after merging + cc_matrix = torch.corrcoef(torch.stack((output_normal.float().flatten(), output_hqq.float().flatten()))) + assert cc_matrix.min() > min_correlation + + # check outputs are the same after unmerging + model.unmerge_adapter() + with torch.inference_mode(): + output_unmerged = model(**inputs).logits + cc_matrix = torch.corrcoef(torch.stack((output_normal.float().flatten(), output_unmerged.float().flatten()))) + assert cc_matrix.min() > min_correlation + + # check that the results are the same after saving and loading + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + del model + 
clear_device_cache(garbage_collection=True) + + quant_config = HqqConfig(nbits=4, group_size=64) + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map=device, + torch_dtype=compute_dtype, + quantization_config=quant_config, + ) + model = PeftModel.from_pretrained(model, tmp_dir) + with torch.inference_mode(): + output_loaded = model(**inputs).logits + + # for loading, we expect high precision, so check for equality and not just correlation + atol, rtol = 1e-6, 1e-6 + assert torch.allclose(output_hqq, output_loaded, atol=atol, rtol=rtol) + + # check that outputs are the same after merge_and_unload + model = model.merge_and_unload() + with torch.inference_mode(): + output_merged_unloaded = model(**inputs).logits + cc_matrix = torch.corrcoef( + torch.stack((output_normal.float().flatten(), output_merged_unloaded.float().flatten())) + ) + assert cc_matrix.min() > min_correlation + + +@require_non_cpu +@require_auto_awq +class PeftAwqGPUTests(unittest.TestCase): + r""" + Awq + peft tests + + Note that AWQ is no longer being maintained: + + https://github.com/casper-hansen/AutoAWQ/blob/88e4c76b20755db275574e6a03c83c84ba3bece5/README.md + + It is therefore expected that more tests will start failing in the future. If this happens, remove AWQ support from + PEFT. + """ + + def setUp(self): + self.causal_lm_model_id = "peft-internal-testing/opt-125m-awq" + self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + + def tearDown(self): + r""" + Efficient mechanism to free accelerator memory after each test. 
Based on + https://github.com/huggingface/transformers/issues/21094 + """ + clear_device_cache(garbage_collection=True) + + def _check_inference_finite(self, model, batch): + # try inference without Trainer class + training = model.training + model.eval() + output = model(**batch.to(model.device)) + assert torch.isfinite(output.logits).all() + model.train(training) + + @pytest.mark.single_gpu_tests + def test_causal_lm_training_awq(self): + r""" + Test the CausalLM training on a single accelerator. The test would simply fail if the adapters are not set + correctly. + """ + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map="auto", + ) + + model = prepare_model_for_kbit_training(model) + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + # TODO: deal correctly with this case in transformers + model._is_quantized_training_enabled = True + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + logging_steps=1, + output_dir=tmp_dir, + fp16=True, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.multi_gpu_tests + # TODO remove marker if/once issue is resolved, most likely requiring a fix in AutoAWQ: + # 
https://github.com/casper-hansen/AutoAWQ/issues/754 + @pytest.mark.xfail( + condition=is_torch_version(">=", "2.7.0"), + reason="Multi-GPU test currently not working with AutoAWQ and PyTorch 2.7+", + strict=True, + ) + @require_torch_multi_accelerator + def test_causal_lm_training_multi_accelerator(self): + r""" + Test the CausalLM training on a multi-accelerator device. The test would simply fail if the adapters are not + set correctly. + """ + device_map = { + "model.decoder.embed_tokens": 0, + "lm_head": 0, + "model.decoder.embed_positions": 0, + "model.decoder.project_out": 0, + "model.decoder.project_in": 0, + "model.decoder.layers.0": 0, + "model.decoder.layers.1": 0, + "model.decoder.layers.2": 0, + "model.decoder.layers.3": 0, + "model.decoder.layers.4": 0, + "model.decoder.layers.5": 0, + "model.decoder.layers.6": 1, + "model.decoder.layers.7": 1, + "model.decoder.layers.8": 1, + "model.decoder.layers.9": 1, + "model.decoder.layers.10": 1, + "model.decoder.layers.11": 1, + "model.decoder.final_layer_norm": 1, + } + + with tempfile.TemporaryDirectory() as tmp_dir: + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map=device_map, + ) + + assert set(model.hf_device_map.values()) == set(range(device_count)) + assert {p.device.index for p in model.parameters()} == set(range(device_count)) + + model = prepare_model_for_kbit_training(model) + + setattr(model, "model_parallel", True) + setattr(model, "is_parallelizable", True) + + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, 
+ learning_rate=2e-4, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + +@require_non_xpu +@require_torch_gpu +@require_eetq +class PeftEetqGPUTests(unittest.TestCase): + r""" + EETQ + peft tests + """ + + def setUp(self): + self.causal_lm_model_id = "facebook/opt-125m" + self.tokenizer = AutoTokenizer.from_pretrained(self.causal_lm_model_id) + + def tearDown(self): + r""" + Efficient mechanism to free GPU memory after each test. Based on + https://github.com/huggingface/transformers/issues/21094 + """ + clear_device_cache(garbage_collection=True) + + def _check_inference_finite(self, model, batch): + # try inference without Trainer class + training = model.training + model.eval() + output = model(**batch.to(model.device)) + assert torch.isfinite(output.logits).all() + model.train(training) + + @pytest.mark.single_gpu_tests + def test_causal_lm_training_eetq(self): + r""" + Test the CausalLM training on a single GPU device. The test would simply fail if the adapters are not set + correctly. 
+ """ + from transformers import EetqConfig + + with tempfile.TemporaryDirectory() as tmp_dir: + quantization_config = EetqConfig("int8") + + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, device_map="auto", quantization_config=quantization_config + ) + + model = prepare_model_for_kbit_training(model) + + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + @pytest.mark.multi_gpu_tests + @require_torch_multi_gpu + def test_causal_lm_training_multi_gpu_eetq(self): + r""" + Test the CausalLM training on a multi-GPU device. The test would simply fail if the adapters are not set + correctly. 
+ """ + from transformers import EetqConfig + + with tempfile.TemporaryDirectory() as tmp_dir: + quantization_config = EetqConfig("int8") + + model = AutoModelForCausalLM.from_pretrained( + self.causal_lm_model_id, + device_map=DEVICE_MAP_MAP[self.causal_lm_model_id], + quantization_config=quantization_config, + ) + + assert set(model.hf_device_map.values()) == set(range(device_count)) + assert {p.device.index for p in model.parameters()} == set(range(device_count)) + + model = prepare_model_for_kbit_training(model) + + setattr(model, "model_parallel", True) + setattr(model, "is_parallelizable", True) + + config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM", + ) + + model = get_peft_model(model, config) + + data = load_dataset_english_quotes() + data = data.map(lambda samples: self.tokenizer(samples["quote"]), batched=True) + + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=2, + max_steps=3, + learning_rate=2e-4, + logging_steps=1, + output_dir=tmp_dir, + ), + data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False), + ) + model.config.use_cache = False + trainer.train() + + model.cpu().save_pretrained(tmp_dir) + + assert "adapter_config.json" in os.listdir(tmp_dir) + assert SAFETENSORS_WEIGHTS_NAME in os.listdir(tmp_dir) + + # assert loss is not None + assert trainer.state.log_history[-1]["train_loss"] is not None + + +@require_non_cpu +@require_torchao +class PeftTorchaoGPUTests(unittest.TestCase): + r""" + torchao + peft tests + """ + + supported_quant_types = [ + "int8_weight_only", + "int8_dynamic_activation_int8_weight", + # int4_weight_only raises an error: + # RuntimeError: derivative for aten::_weight_int4pack_mm is not implemented + # "int4_weight_only", + ] + + def setUp(self): + self.causal_lm_model_id = "facebook/opt-125m" 
@parameterized.expand(supported_quant_types)
@pytest.mark.single_gpu_tests
def test_causal_lm_training_single_gpu_torchao(self, quant_type):
    """Train a LoRA adapter on a torchao-quantized causal LM placed on device 0.

    The base model is loaded with ``TorchAoConfig(quant_type=quant_type)``, passed through
    ``prepare_model_for_kbit_training`` and wrapped with LoRA on ``q_proj``/``v_proj``. It is then trained with
    ``max_steps=3`` and saved; the test checks that the adapter files were written and that the last log entry
    carries a ``train_loss``.

    Args:
        quant_type: torchao quantization type, supplied by ``parameterized.expand``.
    """
    from transformers import TorchAoConfig

    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    with tempfile.TemporaryDirectory() as output_dir:
        base_model = AutoModelForCausalLM.from_pretrained(
            self.causal_lm_model_id,
            device_map=0,
            quantization_config=TorchAoConfig(quant_type=quant_type),
        )
        peft_model = get_peft_model(prepare_model_for_kbit_training(base_model), lora_config)

        quotes = load_dataset_english_quotes()
        quotes = quotes.map(lambda samples: self.tokenizer(samples["quote"]), batched=True)

        training_args = TrainingArguments(
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            warmup_steps=2,
            max_steps=3,
            learning_rate=2e-4,
            logging_steps=1,
            output_dir=output_dir,
        )
        trainer = Trainer(
            model=peft_model,
            train_dataset=quotes["train"],
            args=training_args,
            data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
        )
        trainer.model.config.use_cache = False
        trainer.train()

        peft_model.save_pretrained(output_dir)

        # both the adapter config and the safetensors weights must have been written
        for expected_file in ("adapter_config.json", SAFETENSORS_WEIGHTS_NAME):
            assert expected_file in os.listdir(output_dir)

        # a training loss must have been logged
        assert trainer.state.log_history[-1]["train_loss"] is not None
@parameterized.expand(supported_quant_types)
@pytest.mark.multi_gpu_tests
@require_torch_multi_accelerator
def test_causal_lm_training_multi_accelerator_torchao(self, quant_type):
    """Train a LoRA adapter on a torchao-quantized causal LM that is split across devices 0 and 1.

    The model is loaded in bfloat16 with an explicit device map, and the test first asserts that both the
    reported device map and the actual parameter devices cover ``range(device_count)``. It then trains with
    ``max_steps=3``, saves the adapter and checks the written files and the logged ``train_loss``.

    Args:
        quant_type: torchao quantization type, supplied by ``parameterized.expand``.
    """
    from transformers import TorchAoConfig

    # Embeddings, projections, lm_head and decoder layers 0-5 live on device 0;
    # decoder layers 6-11 and the final layer norm live on device 1.
    placement = {
        "model.decoder.embed_tokens": 0,
        "lm_head": 0,
        "model.decoder.embed_positions": 0,
        "model.decoder.project_out": 0,
        "model.decoder.project_in": 0,
    }
    placement.update({f"model.decoder.layers.{layer_idx}": layer_idx // 6 for layer_idx in range(12)})
    placement["model.decoder.final_layer_norm"] = 1

    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    with tempfile.TemporaryDirectory() as output_dir:
        model = AutoModelForCausalLM.from_pretrained(
            self.causal_lm_model_id,
            device_map=placement,
            quantization_config=TorchAoConfig(quant_type=quant_type),
            torch_dtype=torch.bfloat16,
        )

        # the model must really be spread over all available devices
        all_devices = set(range(device_count))
        assert set(model.hf_device_map.values()) == all_devices
        assert {param.device.index for param in model.parameters()} == all_devices

        model = prepare_model_for_kbit_training(model)
        model.model_parallel = True
        model.is_parallelizable = True
        model = get_peft_model(model, lora_config)

        quotes = load_dataset_english_quotes()
        quotes = quotes.map(lambda samples: self.tokenizer(samples["quote"]), batched=True)

        training_args = TrainingArguments(
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            warmup_steps=2,
            max_steps=3,
            learning_rate=2e-4,
            logging_steps=1,
            output_dir=output_dir,
        )
        trainer = Trainer(
            model=model,
            train_dataset=quotes["train"],
            args=training_args,
            data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
        )
        trainer.model.config.use_cache = False
        trainer.train()

        model.save_pretrained(output_dir)

        for expected_file in ("adapter_config.json", SAFETENSORS_WEIGHTS_NAME):
            assert expected_file in os.listdir(output_dir)

        # a training loss must have been logged
        assert trainer.state.log_history[-1]["train_loss"] is not None
@pytest.mark.single_gpu_tests
def test_torchao_merge_layers_int8_weight_only(self):
    """Check merging, unmerging and unloading a LoRA adapter on an ``int8_weight_only`` torchao model.

    The logits produced with the active adapter are the reference. After ``merge_adapter``, after
    ``unmerge_adapter`` and after ``merge_and_unload`` the logits must match that reference within a loose
    tolerance. After merging and after unmerging, the weights of all ``base_layer`` modules must still be
    ``AffineQuantizedTensor`` instances.
    """
    from torchao.dtypes import AffineQuantizedTensor
    from transformers import TorchAoConfig

    def base_layers_still_quantized(peft_model):
        # True if the weight of every module whose name contains "base_layer" is still a torchao quantized tensor
        return all(
            isinstance(module.weight, AffineQuantizedTensor)
            for name, module in peft_model.named_modules()
            if "base_layer" in name
        )

    torch.manual_seed(0)
    token_ids = torch.arange(10).view(-1, 1).to(0)

    model = AutoModelForCausalLM.from_pretrained(
        self.causal_lm_model_id,
        device_map=0,
        quantization_config=TorchAoConfig(quant_type="int8_weight_only"),
    ).eval()
    logits_base = model(token_ids)[0]

    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        init_lora_weights=False,
    )
    model = get_peft_model(model, lora_config)
    model.eval()
    logits_adapter = model(token_ids)[0]

    # precision is quite low, so high atol and rtol are needed
    tolerances = {"atol": 1e-1, "rtol": 1e-1}

    # sanity check: adding the adapter changed the outputs
    assert not torch.allclose(logits_adapter, logits_base, **tolerances)

    model.merge_adapter()
    logits_merged = model(token_ids)[0]
    assert base_layers_still_quantized(model)

    model.unmerge_adapter()
    logits_unmerged = model(token_ids)[0]
    assert base_layers_still_quantized(model)

    model = model.merge_and_unload()
    logits_merged_unloaded = model(token_ids)[0]

    for candidate in (logits_merged, logits_unmerged, logits_merged_unloaded):
        assert torch.allclose(logits_adapter, candidate, **tolerances)
# dtypes passed to the ``parameterized.expand(PRECISIONS)`` autocast tests (one dtype per test case)
PRECISIONS = [torch.float32, torch.float16, torch.bfloat16]

# LoRA hyper-parameters unpacked into ``LoraConfig(**LORA_PARAMS, ...)`` by the autocast tests
LORA_PARAMS = {
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.05,
}


class SimpleModel(torch.nn.Module):
    """Toy model: embedding (1000 x 768) -> LayerNorm(768) -> Linear(768, 256)."""

    def __init__(self):
        super().__init__()

        self.embedding_layer = torch.nn.Embedding(1000, 768)
        self.layer_norm = torch.nn.LayerNorm(768)
        self.linear_transform = torch.nn.Linear(768, 256)

    def forward(self, input_ids):
        """Map integer ids in ``[0, 1000)`` to 256 features per id.

        For ``input_ids`` of shape ``(batch, seq)`` the result has shape ``(batch, seq, 256)``.
        """
        embedded_output = self.embedding_layer(input_ids)
        norm_output = self.layer_norm(embedded_output)
        linear_output = self.linear_transform(norm_output)

        return linear_output


class SimpleConv2DModel(torch.nn.Module):
    """Toy model: embedding (1000 x 768) -> LayerNorm(768) -> Conv2d(1, 256, kernel 3x3, stride 1, padding 1)."""

    def __init__(self):
        super().__init__()

        self.embedding_layer = torch.nn.Embedding(1000, 768)
        self.layer_norm = torch.nn.LayerNorm(768)
        self.conv2d_transform = torch.nn.Conv2d(1, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))

    def forward(self, input_ids):
        """Run the embedded ids through the convolution as a single-channel image.

        For ``input_ids`` of shape ``(batch, seq)`` the result has shape ``(batch, 256, seq, 768)``.
        """
        embedded_output = self.embedding_layer(input_ids)
        norm_output = self.layer_norm(embedded_output)

        # Conv2d needs a channel axis: insert a singleton channel dimension at index 1,
        # turning (batch, seq, 768) into (batch, 1, seq, 768)
        norm_output = norm_output.unsqueeze(1)
        conv_output = self.conv2d_transform(norm_output)

        # squeeze(1) only drops axis 1 when it has size 1. For batched input axis 1 holds the 256 output
        # channels, so the tensor is returned unchanged; the call is kept to preserve existing behavior.
        conv_output = conv_output.squeeze(1)

        return conv_output
class TestFSDPWrap:
    """
    Test that we can successfully initialize an FSDP instance of the module.

    This is a very simple test, as it does not perform actual FSDP training. Here we just ensure that the FSDP instance
    can be created. This can fail for several reasons, e.g. int dtype from BNB or inconsistent requires_grad settings
    due to the auto wrap policy.

    """

    @pytest.mark.single_gpu_tests
    @require_bitsandbytes
    def test_bnb_4bit_wrap_fsdp(self):
        """Wrap a 4-bit bitsandbytes model carrying a DoRA adapter in FSDP and check that this does not raise.

        A single-process default process group is created for the FSDP constructor. The group and the rendezvous
        environment variables are cleaned up again when the test finishes, so they cannot leak into later tests.
        """
        from unittest import mock

        from torch.distributed import destroy_process_group

        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            # float32 must be used, or else FSDP will complain about mixed int and float dtypes
            bnb_4bit_compute_dtype=torch.float32,
            bnb_4bit_quant_storage=torch.float32,
            bnb_4bit_use_double_quant=True,
        )
        model = AutoModelForCausalLM.from_pretrained(
            "facebook/opt-125m",
            quantization_config=quant_config,
            torch_dtype=torch.float32,
        )
        # model = prepare_model_for_kbit_training(model)
        config = LoraConfig(
            target_modules=["q_proj", "v_proj"],
            task_type="CAUSAL_LM",
            use_dora=True,
        )
        model = get_peft_model(model, config)

        # patch.dict restores os.environ on exit, so the rendezvous settings stay local to this test
        with mock.patch.dict(os.environ, {"MASTER_ADDR": "localhost", "MASTER_PORT": "29501"}):
            init_process_group(world_size=1, rank=0)
            try:
                # check that this does not raise:
                FSDP(
                    model,
                    auto_wrap_policy=fsdp_auto_wrap_policy(model),
                    use_orig_params=False,
                    sync_module_states=True,
                )
            finally:
                # tear the default process group down so that later tests start without one
                destroy_process_group()

    def test_fsdp_auto_wrap_policy_does_not_raise_on_custom_model(self):
        """Building the auto wrap policy for a plain ``torch.nn.Module`` must not raise."""
        # See #2167
        # Avoid raising on custom models since Trainer uses fsdp_auto_wrap_policy automatically for PEFT + FSDP
        fsdp_auto_wrap_policy(SimpleModel())  # does not raise
class TestPTuningReproducibility:
    """Check that a p-tuning model gives the same outputs before saving and after loading the checkpoint."""

    device = infer_device()

    @require_non_cpu
    @require_deterministic_for_xpu
    def test_p_tuning_exactly_reproducible_after_loading(self, tmp_path):
        """Compare logits and generated tokens of a p-tuning model before and after a save/load round trip."""
        # See: https://github.com/huggingface/peft/issues/2043#issuecomment-2321522577
        # Ensure that after loading a p-tuning checkpoint, results are exactly reproducible (before the patch, they
        # were only _almost_ identical).

        # The model must be sufficiently large for the effect to be measurable, which is why this test is not run
        # on CPU.
        model_id = "facebook/opt-125m"
        generation_kwargs = {"min_new_tokens": 10, "max_new_tokens": 10}
        inputs = torch.arange(10).view(-1, 1).to(self.device)

        def logits_and_generation(peft_model):
            # forward pass and generation share one inference_mode context
            with torch.inference_mode():
                logits = peft_model(inputs).logits
                generated = peft_model.generate(inputs, **generation_kwargs)
            return logits, generated

        torch.manual_seed(0)
        model = AutoModelForCausalLM.from_pretrained(model_id).to(self.device)
        model = get_peft_model(
            model,
            PromptEncoderConfig(task_type="CAUSAL_LM", num_virtual_tokens=20, encoder_hidden_size=128),
        ).eval()
        output_peft, gen_peft = logits_and_generation(model)

        # write the checkpoint, then drop the model and free device memory before loading it again
        model.save_pretrained(tmp_path)
        del model
        clear_device_cache(garbage_collection=True)

        model = AutoModelForCausalLM.from_pretrained(model_id).to(self.device)
        model = PeftModel.from_pretrained(model, tmp_path)
        output_loaded, gen_loaded = logits_and_generation(model)

        torch.testing.assert_close(output_loaded, output_peft)
        torch.testing.assert_close(gen_loaded, gen_peft)
@require_non_cpu
@pytest.mark.parametrize("device_model, device_sd", [("cpu", infer_device()), (infer_device(), "cpu")])
def test_low_cpu_mem_usage_model_model_on_gpu_state_dict_on_cpu_works(self, device_model, device_sd):
    """Load a LoRA state dict with ``low_cpu_mem_usage=True`` when model and state dict are on different devices.

    A reference PEFT model provides the expected logits and the adapter state dict. The state dict is moved to
    ``device_sd`` and loaded into a fresh base model on ``device_model``. The logits must match the reference and
    all parameters must end up on ``device_model``.

    Args:
        device_model: device type the model lives on.
        device_sd: device type the state dict tensors are moved to before loading.
    """
    # specifically test diverging devices for the model and state_dict
    raw_inputs = {"input_ids": torch.randint(0, 100, (1, 10)), "attention_mask": torch.ones(1, 10)}
    inputs = {name: tensor.to(device_model) for name, tensor in raw_inputs.items()}

    reference_model = AutoModelForCausalLM.from_pretrained(self.model_id).to(device_model)
    lora_config = LoraConfig(init_lora_weights=False, target_modules="all-linear")
    reference_model = get_peft_model(reference_model, lora_config)
    reference_model.eval()
    logits_not_low_cpu_mem = reference_model(**inputs).logits

    # remap the state dict so that it can be correctly loaded, and move weights to the other device
    prefix = "base_model.model."
    peft_model_state_dict = {
        key[len(prefix) :]: weight.to(device_sd)
        for key, weight in get_peft_model_state_dict(reference_model).items()
    }

    del reference_model

    model = AutoModelForCausalLM.from_pretrained(self.model_id).to(device_model)
    model.eval()
    inject_adapter_in_model(lora_config, model, low_cpu_mem_usage=True)
    load_result = set_peft_model_state_dict(model, peft_model_state_dict, low_cpu_mem_usage=True)

    # sanity check: all lora keys are matched
    for key_group in (load_result.missing_keys, load_result.unexpected_keys):
        assert not any("lora" in key for key in key_group)

    logits_low_cpu_mem = model(**inputs).logits

    assert torch.allclose(logits_low_cpu_mem, logits_not_low_cpu_mem)
    assert {param.device.type for param in model.parameters()} == {device_model}
class TestEvaInitializationGPU:
    """GPU tests for the Eva initialization method."""

    # Constants for test configuration
    COSINE_SIMILARITY_THRESHOLD = 0.75
    NUM_SEEDS = 3
    BATCH_SIZE = 4
    MAX_LENGTH = 256
    LORA_DIM = 8
    LORA_ALPHA = 1
    DEVICE = infer_device()

    @pytest.fixture
    def tokenizer(self):
        """GPT-2 tokenizer that uses the EOS token for padding."""
        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
        tokenizer.pad_token = tokenizer.eos_token
        return tokenizer

    @pytest.fixture
    def dataset(self, tokenizer):
        """Tokenized dataset built by concatenating quotes into longer texts.

        Quotes are appended to a running text; once that text has at least ``MAX_LENGTH`` characters it is emitted
        as one example and a new text is started. Text accumulated after the last emitted example is not included.
        Each example is tokenized with padding and truncation to ``MAX_LENGTH`` tokens.
        """
        dataset = load_dataset_english_quotes()["train"]
        # concatenate examples
        examples = []
        example = ""
        for data in dataset:
            if len(example) >= self.MAX_LENGTH:
                examples.append(example)
                example = ""
            example = example + " " + data["quote"]
        dataset = Dataset.from_dict({"text": examples})
        # tokenize
        dataset = dataset.map(
            lambda x: tokenizer(x["text"], padding="max_length", truncation=True, max_length=self.MAX_LENGTH),
            batched=True,
            remove_columns=dataset.column_names,
        )
        dataset.set_format(type="torch")
        return dataset

    @pytest.fixture
    def model(self):
        """GPT-2 truncated to its first two transformer blocks, moved to ``DEVICE``."""
        model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
        model.transformer.h = model.transformer.h[:2]  # truncate to 2 layers
        return model.to(self.DEVICE)

    @pytest.fixture
    def model_bnb(self):
        """4-bit bitsandbytes GPT-2 truncated to two transformer blocks and prepared for k-bit training."""
        bnb_config = BitsAndBytesConfig(load_in_4bit=True)
        model = AutoModelForCausalLM.from_pretrained(
            "openai-community/gpt2",
            quantization_config=bnb_config,
            attn_implementation="eager",  # gpt2 doesn't support flash attention
        )
        model.transformer.h = model.transformer.h[:2]  # truncate to 2 layers
        model = prepare_model_for_kbit_training(model)
        return model

    @pytest.fixture
    def model_fixture(self, request):
        """Resolve the fixture named by an indirect parametrization (``request.param``)."""
        return request.getfixturevalue(request.param)

    @pytest.fixture
    def peft_config(self):
        """LoRA config targeting ``c_attn`` with Eva initialization (``rho=2``)."""
        return LoraConfig(
            r=self.LORA_DIM,
            lora_alpha=self.LORA_ALPHA,
            target_modules=["c_attn"],
            init_lora_weights="eva",
            eva_config=EvaConfig(rho=2),
        )

    def is_bnb_model(self, model):
        """Return True if the model config carries a ``quantization_config`` attribute."""
        return hasattr(model.config, "quantization_config")

    @staticmethod
    def collate_fn(examples):
        """Stack the per-example tensors of a batch into one tensor per key (keys taken from the first example)."""
        return {k: torch.stack([v[k] for v in examples], dim=0) for k in examples[0].keys()}

    @require_non_cpu
    @require_bitsandbytes
    @pytest.mark.single_gpu_tests
    @pytest.mark.parametrize("model_fixture", ["model", "model_bnb"], indirect=True)
    def test_eva_initialization_consistency(self, model_fixture, dataset, peft_config):
        """Test that Eva-initialized ``lora_A`` weights are consistent across differently shuffled datasets.

        For each seed the dataset is shuffled, a fresh copy of the model is wrapped with LoRA and initialized via
        ``initialize_lora_eva_weights``. For every pair of seeds the absolute row-wise cosine similarity of the
        ``lora_A`` weights is computed per layer; its mean per layer must exceed ``COSINE_SIMILARITY_THRESHOLD``.
        """
        state_dicts = []
        for seed in range(self.NUM_SEEDS):
            dataloader = DataLoader(
                dataset.shuffle(seed=seed),
                batch_size=self.BATCH_SIZE,
                collate_fn=self.collate_fn,
                shuffle=False,
            )
            peft_model = get_peft_model(deepcopy(model_fixture), peft_config)
            initialize_lora_eva_weights(peft_model, dataloader)
            state_dicts.append(
                {k: v.cpu() for k, v in peft_model.state_dict().items() if "lora_A.default.weight" in k}
            )

        cos_sims = defaultdict(list)
        for i, j in itertools.combinations(range(self.NUM_SEEDS), 2):
            for k, v1 in state_dicts[i].items():
                v2 = state_dicts[j][k]
                # the two weights may differ in their number of rows; compare the rows both have
                min_size = min(v1.size(0), v2.size(0))
                cos_sims[k].extend(torch.cosine_similarity(v1[:min_size], v2[:min_size], dim=1).abs().tolist())

        mean_cosine_similarities = {k: torch.tensor(v).mean() for k, v in cos_sims.items()}
        for layer_name, mean_cosine_similarity in mean_cosine_similarities.items():
            assert mean_cosine_similarity > self.COSINE_SIMILARITY_THRESHOLD, (
                f"Mean absolute cosine similarity {mean_cosine_similarity:.4f} for {layer_name} "
                f"is not greater than {self.COSINE_SIMILARITY_THRESHOLD}"
            )
@pytest.mark.multi_gpu_tests
class TestPrefixTuning:
    """Check that prefix tuning works with models whose layers are spread over several devices (issue 2134)."""

    device = infer_device()

    def _check_generate_with_prefix_tuning(self, auto_model_cls, model_id, device_map, task_type):
        """Load a model with ``device_map`` and check that ``generate`` works before and after adding prefix tuning.

        Args:
            auto_model_cls: transformers auto class used to load the model.
            model_id: checkpoint to load for both tokenizer and model.
            device_map: mapping from module name to device index.
            task_type: task type passed to ``PrefixTuningConfig``.
        """
        # padding_side is the tokenizer option that selects left padding
        tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
        inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to(self.device)

        model = auto_model_cls.from_pretrained(model_id, device_map=device_map)
        # sanity check, as the test passes trivially for a single device
        assert len({p.device for p in model.parameters()}) > 1
        # sanity check: this should work without peft
        model.generate(**inputs)  # does not raise

        peft_config = PrefixTuningConfig(num_virtual_tokens=10, task_type=task_type)
        model = get_peft_model(model, peft_config)
        model.generate(**inputs)  # does not raise

    @require_torch_multi_accelerator
    def test_prefix_tuning_multiple_devices_decoder_model(self):
        """Prefix tuning on a decoder-only model split over two devices."""
        # See issue 2134
        device_map = {
            "model.embed_tokens": 0,
            "model.layers.0": 0,
            "model.layers.1": 1,
            "model.norm": 1,
            "model.rotary_emb": 1,
            "lm_head": 1,
        }
        self._check_generate_with_prefix_tuning(
            AutoModelForCausalLM,
            "hf-internal-testing/tiny-random-MistralForCausalLM",
            device_map,
            "CAUSAL_LM",
        )

    @require_torch_multi_accelerator
    def test_prefix_tuning_multiple_devices_encoder_decoder_model(self):
        """Prefix tuning on an encoder-decoder model split over two devices."""
        # See issue 2134
        device_map = {
            "shared": 0,
            "encoder.embed_tokens": 0,
            "encoder.block.0": 0,
            "encoder.block.1": 0,
            "encoder.block.2": 1,
            "encoder.block.3": 1,
            "encoder.block.4": 1,
            "encoder.final_layer_norm": 1,
            "decoder.embed_tokens": 0,
            "decoder.block.0": 0,
            "decoder.block.1": 0,
            "decoder.block.2": 1,
            "decoder.block.3": 1,
            "decoder.block.4": 1,
            "decoder.final_layer_norm": 1,
            "lm_head": 0,
        }
        self._check_generate_with_prefix_tuning(
            AutoModelForSeq2SeqLM,
            "hf-internal-testing/tiny-random-T5Model",
            device_map,
            "SEQ_2_SEQ_LM",
        )
def check_hotswap(self, do_hotswap, ranks, alpha_scalings):
    """
    Check adapter swapping on a model compiled with ``torch.compile``.

    Two LoRA adapters are created and saved. The first one is loaded into a fresh model, which is then compiled
    and checked against the uncompiled reference output. The second adapter is then brought in either by
    hotswapping (``do_hotswap=True``) or by loading it as an additional adapter and activating it
    (``do_hotswap=False``). The latter should trigger recompilation; the calling tests enable
    ``error_on_recompile`` in the dynamo config so that recompilation surfaces as an error.

    Args:
        do_hotswap: whether to swap adapters in place with ``hotswap_adapter``.
        ranks: pair with the LoRA ranks of the first and second adapter.
        alpha_scalings: pair with the ``lora_alpha`` values of the first and second adapter.
    """
    model_id = "hf-internal-testing/tiny-random-OPTForCausalLM"
    tol = 1e-4

    def same_output(expected, actual):
        # outputs are compared with identical absolute and relative tolerance
        return torch.allclose(expected, actual, atol=tol, rtol=tol)

    torch.manual_seed(0)
    inputs = torch.arange(10).view(-1, 1).to(self.torch_device)
    model = AutoModelForCausalLM.from_pretrained(model_id).to(self.torch_device)
    rank0, rank1 = ranks
    alpha0, alpha1 = alpha_scalings

    # note that the 2nd adapter targeting a subset of the 1st adapter is okay, but not the other way round
    config0 = LoraConfig(init_lora_weights=False, r=rank0, lora_alpha=alpha0, target_modules=["q_proj", "v_proj"])
    config1 = LoraConfig(init_lora_weights=False, r=rank1, lora_alpha=alpha1, target_modules=["q_proj"])

    # reference outputs of both adapters from the uncompiled model
    model = get_peft_model(model, config0, adapter_name="adapter0").eval()
    with torch.inference_mode():
        output0 = model(inputs).logits

    model.add_adapter("adapter1", config1)
    model.set_adapter("adapter1")
    with torch.inference_mode():
        output1 = model(inputs).logits

    # sanity check: the two adapters produce different outputs
    assert not same_output(output0, output1)

    with tempfile.TemporaryDirectory() as tmp_dirname:
        model.save_pretrained(tmp_dirname)
        del model

        adapter0_path = os.path.join(tmp_dirname, "adapter0")
        adapter1_path = os.path.join(tmp_dirname, "adapter1")

        model = AutoModelForCausalLM.from_pretrained(model_id).to(self.torch_device)
        model = PeftModel.from_pretrained(model, adapter0_path).eval()
        if do_hotswap:
            prepare_model_for_compiled_hotswap(model, config=model.peft_config, target_rank=max(ranks))
        model = torch.compile(model, mode="reduce-overhead")
        output_after0 = model(inputs).logits
        assert same_output(output0, output_after0)

        # swap and check that we get the output from adapter1
        if do_hotswap:
            hotswap_adapter(model, adapter1_path, adapter_name="default")
        else:
            model.load_adapter(adapter1_path, adapter_name="other")
            model.set_adapter("other")

        # we need to call forward to potentially trigger recompilation
        output_after1 = model(inputs).logits
        assert same_output(output1, output_after1)

        # we need to call forward a third time since cudagraphs are not recorded in the first call
        if do_hotswap:
            hotswap_adapter(model, adapter0_path, adapter_name="default")
            output_after2 = model(inputs).logits
            assert same_output(output0, output_after2)
"cross_attention_dim": 8, + "attention_head_dim": 2, + "out_channels": 4, + "in_channels": 4, + "layers_per_block": 1, + "sample_size": 16, + } + model = UNet2DConditionModel(**init_dict) + return model.to(self.torch_device) + + def get_unet_lora_config(self, lora_rank, lora_alpha, target_modules): + # from diffusers test_models_unet_2d_condition.py + # note that this only targets linear layers by default + unet_lora_config = LoraConfig( + r=lora_rank, + lora_alpha=lora_alpha, + target_modules=target_modules, + init_lora_weights=False, + use_dora=False, + ) + return unet_lora_config + + def get_dummy_input(self): + pipeline_inputs = { + "prompt": "A painting of a squirrel eating a burger", + "num_inference_steps": 5, + "guidance_scale": 6.0, + "output_type": "np", + "return_dict": False, + } + return pipeline_inputs + + def set_lora_device(self, model, adapter_names, device): + # copied from diffusers LoraBaseMixin.set_lora_device + for module in model.modules(): + if isinstance(module, BaseTunerLayer): + for adapter_name in adapter_names: + module.lora_A[adapter_name].to(device) + module.lora_B[adapter_name].to(device) + # this is a param, not a module, so device placement is not in-place -> re-assign + if hasattr(module, "lora_magnitude_vector") and module.lora_magnitude_vector is not None: + if adapter_name in module.lora_magnitude_vector: + module.lora_magnitude_vector[adapter_name] = module.lora_magnitude_vector[adapter_name].to( + device + ) + + def check_hotswap_diffusion(self, ranks, alpha_scalings, target_modules): + """ + Check that hotswapping works on a pipeline. 
+ + This is essentially the same test as: + https://github.com/huggingface/diffusers/blob/d7dd924ece56cddf261cd8b9dd901cbfa594c62c/tests/pipelines/test_pipelines.py#L2264 + + Steps: + - create 2 LoRA adapters and save them + - load the first adapter + - hotswap the second adapter + - check that the outputs are correct + - optionally compile the model + + Note: We set rank == alpha here because save_lora_adapter does not save the alpha scalings, thus the test would + fail if the values are different. Since rank != alpha does not matter for the purpose of this test, this is + fine. + """ + from diffusers import StableDiffusionPipeline + + # create 2 adapters with different ranks and alphas + dummy_input = self.get_dummy_input() + pipeline = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sd-pipe").to(torch_device) + rank0, rank1 = ranks + alpha0, alpha1 = alpha_scalings + max_rank = max([rank0, rank1]) + lora_config0 = self.get_unet_lora_config(rank0, alpha0, target_modules) + lora_config1 = self.get_unet_lora_config(rank1, alpha1, target_modules) + + torch.manual_seed(0) + pipeline.unet.add_adapter(lora_config0, adapter_name="adapter0") + output0_before = pipeline(**dummy_input, generator=torch.manual_seed(0))[0] + + torch.manual_seed(1) + pipeline.unet.add_adapter(lora_config1, adapter_name="adapter1") + pipeline.unet.set_adapter("adapter1") + output1_before = pipeline(**dummy_input, generator=torch.manual_seed(0))[0] + + # sanity check + tol = 1e-3 + assert not np.allclose(output0_before, output1_before, atol=tol, rtol=tol) + assert not (output0_before == 0).all() + assert not (output1_before == 0).all() + + with tempfile.TemporaryDirectory() as tmp_dirname: + # save the adapter checkpoints + sd0 = get_peft_model_state_dict(pipeline.unet, adapter_name="adapter0") + StableDiffusionPipeline.save_lora_weights( + save_directory=os.path.join(tmp_dirname, "adapter0"), safe_serialization=True, unet_lora_layers=sd0 + ) + sd1 = 
get_peft_model_state_dict(pipeline.unet, adapter_name="adapter1") + StableDiffusionPipeline.save_lora_weights( + save_directory=os.path.join(tmp_dirname, "adapter1"), safe_serialization=True, unet_lora_layers=sd1 + ) + del pipeline + + # load the first adapter + pipeline = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sd-pipe").to(torch_device) + # no need to prepare if the model is not compiled or if the ranks are identical + pipeline.enable_lora_hotswap(target_rank=max_rank) + + file_name0 = os.path.join(tmp_dirname, "adapter0", "pytorch_lora_weights.safetensors") + file_name1 = os.path.join(tmp_dirname, "adapter1", "pytorch_lora_weights.safetensors") + + pipeline.load_lora_weights(file_name0) + pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead") + + output0_after = pipeline(**dummy_input, generator=torch.manual_seed(0))[0] + + # sanity check: still same result + assert np.allclose(output0_before, output0_after, atol=tol, rtol=tol) + + # hotswap the 2nd adapter + pipeline.load_lora_weights(file_name1, hotswap=True, adapter_name="default_0") + output1_after = pipeline(**dummy_input, generator=torch.manual_seed(0))[0] + + # sanity check: since it's the same LoRA, the results should be identical + assert np.allclose(output1_before, output1_after, atol=tol, rtol=tol) + + # we need to call forward third time since cudagraphs are not recorded in first call. 
+ pipeline.load_lora_weights(file_name0, hotswap=True, adapter_name="default_0") + output2_after = pipeline(**dummy_input, generator=torch.manual_seed(0))[0] + assert np.allclose(output0_before, output2_after, atol=tol, rtol=tol) + + @pytest.mark.skipif(not is_diffusers_available(), reason="Test requires diffusers to be installed") + # it is important to check hotswapping small to large ranks and large to small ranks + @pytest.mark.parametrize("ranks", [(11, 11), (7, 13), (13, 7)]) + @pytest.mark.parametrize( + "target_modules", + [ + ["to_q", "to_k", "to_v", "to_out.0"], # Linear layers + ["conv", "conv1", "conv2"], # Conv2d layers + ["to_q", "conv"], # mix of Linear and Conv2d + ], + ) + def test_hotswapping_compiled_diffusers_model_does_not_trigger_recompilation(self, ranks, target_modules): + # here we set three configs to ensure no recompilation or cudagraph re-record occurs: + # 1. error_on_recompile: raise an error on recompilation + # 2. inline_inbuilt_nn_modules: needed to raise an error on static input address changes instead of re-recording + # 3. triton.cudagraph_support_input_mutation: same as above + dynamo_config_ctx = torch._dynamo.config.patch(error_on_recompile=True, inline_inbuilt_nn_modules=False) + inductor_config_ctx = torch._inductor.config.patch("triton.cudagraph_support_input_mutation", False) + with dynamo_config_ctx, inductor_config_ctx: + self.check_hotswap_diffusion(ranks=ranks, alpha_scalings=ranks, target_modules=target_modules) + + +# Test: 4-bit load + Arrow + generate +class TestArrowQuantized: + @pytest.fixture(scope="class") + def workdir(self, tmp_path_factory): + """Create and return a temp directory path for this class (no chdir).""" + wd = tmp_path_factory.mktemp("arrow_workdir") + return Path(wd) + + def _create_and_save_adapter_opt(self, out_dir: Path, rank: int = 4): + """ + Build a randomly initialized LoRA adapter for OPT-125M and save into `out_dir`. 
We construct a model from + CONFIG (no pretrained weights) to avoid slow downloads here. + """ + model_id = "facebook/opt-125m" + # Target all linear layers so the adapter matches whatever we later quantize/load. + lora_cfg = LoraConfig( + r=rank, + target_modules="all-linear", + task_type="CAUSAL_LM", + init_lora_weights=False, + ) + # Load the adapter on the model and save it + with hub_online_once(model_id): + model = AutoModelForCausalLM.from_pretrained(model_id) + peft_model = get_peft_model(model, lora_cfg) + peft_model.save_pretrained(out_dir) + + @pytest.fixture(scope="class") + def ts_adapters_opt(self, workdir: Path): + """ + Build 3 locally-saved task-specific adapters for OPT-125M and return their absolute paths. + """ + paths = [] + for i in range(3): + sub = workdir / f"ts_expert_{i}" + self._create_and_save_adapter_opt(sub) + paths.append(str(sub)) + return paths + + @require_bitsandbytes + @pytest.mark.single_gpu_tests + def test_arrow_4bit_opt125m_load_and_generate_with_local_adapters(self, ts_adapters_opt): + # Skip if CUDA or bitsandbytes isn’t available + if not torch.cuda.is_available(): + pytest.skip("CUDA required for 4-bit bitsandbytes test.") + + model_id = "facebook/opt-125m" + + # Quantization config (nf4, bf16 compute) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_use_double_quant=False, + ) + + with hub_online_once(model_id): + # Load quantized base model + base_model = AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.bfloat16, + device_map="auto", + quantization_config=bnb_config, + ) + with hub_online_once(model_id + "tokenizer"): + tok = AutoTokenizer.from_pretrained(model_id, use_fast=True) + + # Build Arrow model from the locally created adapters + arrow_cfg = ArrowConfig(top_k=2, router_temperature=1.0, rng_seed=42) + model = create_arrow_model( + base_model=base_model, + task_specific_adapter_paths=ts_adapters_opt, # 
local dirs (each has adapter_config.json) + arrow_config=arrow_cfg, + ).eval() + + # Quick generate smoke test + inputs = tok("Hello world", return_tensors="pt") + inputs = {k: v.to(model.device) for k, v in inputs.items()} + with torch.no_grad(): + out = model.generate(**inputs, max_new_tokens=8) + + assert out is not None + assert out.shape[0] == 1 # batch size 1 diff --git a/peft/tests/test_helpers.py b/peft/tests/test_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..501bd146a2900cd3266cd5bed1cfe747da308811 --- /dev/null +++ b/peft/tests/test_helpers.py @@ -0,0 +1,473 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pytest +import torch +from diffusers import StableDiffusionPipeline +from torch import nn +from transformers import AutoModelForCausalLM, AutoTokenizer + +from peft import LoraConfig, get_peft_model +from peft.helpers import check_if_peft_model, disable_input_dtype_casting, rescale_adapter_scale +from peft.tuners.lora.layer import LoraLayer +from peft.utils import infer_device + + +class TestCheckIsPeftModel: + def test_valid_hub_model(self): + result = check_if_peft_model("peft-internal-testing/gpt2-lora-random") + assert result is True + + def test_invalid_hub_model(self): + result = check_if_peft_model("gpt2") + assert result is False + + def test_nonexisting_hub_model(self): + result = check_if_peft_model("peft-internal-testing/non-existing-model") + assert result is False + + def test_local_model_valid(self, tmp_path): + model = AutoModelForCausalLM.from_pretrained("gpt2") + config = LoraConfig() + model = get_peft_model(model, config) + model.save_pretrained(tmp_path / "peft-gpt2-valid") + result = check_if_peft_model(tmp_path / "peft-gpt2-valid") + assert result is True + + def test_local_model_invalid(self, tmp_path): + model = AutoModelForCausalLM.from_pretrained("gpt2") + model.save_pretrained(tmp_path / "peft-gpt2-invalid") + result = check_if_peft_model(tmp_path / "peft-gpt2-invalid") + assert result is False + + def test_local_model_broken_config(self, tmp_path): + with open(tmp_path / "adapter_config.json", "w") as f: + f.write('{"foo": "bar"}') + + result = check_if_peft_model(tmp_path) + assert result is False + + def test_local_model_non_default_name(self, tmp_path): + model = AutoModelForCausalLM.from_pretrained("gpt2") + config = LoraConfig() + model = get_peft_model(model, config, adapter_name="other") + model.save_pretrained(tmp_path / "peft-gpt2-other") + + # no default adapter here + result = check_if_peft_model(tmp_path / "peft-gpt2-other") + assert result is False + + # with adapter name + result = check_if_peft_model(tmp_path / 
"peft-gpt2-other" / "other") + assert result is True + + +class TestScalingAdapters: + @pytest.fixture(scope="class") + def tokenizer(self): + return AutoTokenizer.from_pretrained("facebook/opt-125m") + + def get_scale_from_modules(self, model): + layer_to_scale_map = {} + for name, module in model.named_modules(): + if isinstance(module, LoraLayer): + layer_to_scale_map[name] = module.scaling + + return layer_to_scale_map + + def test_rescale_adapter_scale(self, tokenizer): + model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") + lora_config = LoraConfig( + r=4, + lora_alpha=4, + target_modules=["k_proj", "v_proj"], + lora_dropout=0.1, + bias="none", + init_lora_weights=False, + ) + + model = get_peft_model(model, lora_config) + model.eval() + inputs = tokenizer("hello world", return_tensors="pt") + + with torch.no_grad(): + logits_before_scaling = model(**inputs).logits + + scales_before_scaling = self.get_scale_from_modules(model) + + with rescale_adapter_scale(model=model, multiplier=0.5): + scales_during_scaling = self.get_scale_from_modules(model) + for key in scales_before_scaling.keys(): + assert scales_before_scaling[key] != scales_during_scaling[key] + + with torch.no_grad(): + logits_during_scaling = model(**inputs).logits + + assert not torch.allclose(logits_before_scaling, logits_during_scaling) + + scales_after_scaling = self.get_scale_from_modules(model) + for key in scales_before_scaling.keys(): + assert scales_before_scaling[key] == scales_after_scaling[key] + + with torch.no_grad(): + logits_after_scaling = model(**inputs).logits + + assert torch.allclose(logits_before_scaling, logits_after_scaling) + + def test_wrong_scaling_datatype(self): + model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") + lora_config = LoraConfig( + r=4, + lora_alpha=4, + target_modules=["k_proj", "v_proj"], + lora_dropout=0.1, + bias="none", + init_lora_weights=False, + ) + + model = get_peft_model(model, lora_config) + + # we expect a type 
error here because of wrong datatype of multiplier + multiplier = "a" + with pytest.raises(TypeError, match=f"Argument multiplier should be of type float, got {type(multiplier)}"): + with rescale_adapter_scale(model=model, multiplier=multiplier): + pass + + def test_not_lora_model(self): + model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") + + # we expect a value error here because the model + # does not have lora layers + with pytest.raises(ValueError, match="scaling is only supported for models with `LoraLayer`s"): + with rescale_adapter_scale(model=model, multiplier=0.5): + pass + + def test_scaling_set_to_zero(self, tokenizer): + base_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") + inputs = tokenizer("hello world", return_tensors="pt") + + base_model.eval() + + with torch.no_grad(): + logits_base_model = base_model(**inputs).logits + + lora_config = LoraConfig( + r=4, + lora_alpha=4, + target_modules=["k_proj", "v_proj"], + lora_dropout=0.1, + bias="none", + init_lora_weights=False, + ) + lora_model = get_peft_model(base_model, lora_config) + lora_model.eval() + + with rescale_adapter_scale(model=lora_model, multiplier=0.0): + with torch.no_grad(): + logits_lora_model = lora_model(**inputs).logits + + assert torch.allclose(logits_base_model, logits_lora_model) + + def test_diffusers_pipeline(self): + model_id = "hf-internal-testing/tiny-sd-pipe" + pipeline = StableDiffusionPipeline.from_pretrained(model_id) + + text_encoder_kwargs = { + "r": 8, + "lora_alpha": 32, + "target_modules": ["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + "lora_dropout": 0.0, + "bias": "none", + } + unet_kwargs = { + "r": 8, + "lora_alpha": 32, + "target_modules": ["proj_in", "proj_out", "to_k", "to_q", "to_v", "to_out.0", "ff.net.0.proj", "ff.net.2"], + "lora_dropout": 0.0, + "bias": "none", + } + + # Instantiate text_encoder adapter + config_text_encoder = LoraConfig(**text_encoder_kwargs) + pipeline.text_encoder = 
get_peft_model(pipeline.text_encoder, config_text_encoder) + + # Instantiate unet adapter + config_unet = LoraConfig(**unet_kwargs) + pipeline.unet = get_peft_model(pipeline.unet, config_unet) + + text_scales_before_scaling = self.get_scale_from_modules(pipeline.text_encoder) + unet_scales_before_scaling = self.get_scale_from_modules(pipeline.unet) + + with ( + rescale_adapter_scale(model=pipeline.text_encoder, multiplier=0.5), + rescale_adapter_scale(model=pipeline.unet, multiplier=0.5), + ): + text_scales_during_scaling = self.get_scale_from_modules(pipeline.text_encoder) + unet_scales_during_scaling = self.get_scale_from_modules(pipeline.unet) + for key in text_scales_before_scaling.keys(): + assert text_scales_before_scaling[key] != text_scales_during_scaling[key] + for key in unet_scales_before_scaling.keys(): + assert unet_scales_before_scaling[key] != unet_scales_during_scaling[key] + + text_scales_fter_scaling = self.get_scale_from_modules(pipeline.text_encoder) + unet_scales_after_scaling = self.get_scale_from_modules(pipeline.unet) + for key in text_scales_before_scaling.keys(): + assert text_scales_before_scaling[key] == text_scales_fter_scaling[key] + for key in unet_scales_before_scaling.keys(): + assert unet_scales_before_scaling[key] == unet_scales_after_scaling[key] + + def test_transformers_pipeline(self, tmp_path, tokenizer): + # this uses a transformers model that loads the adapter directly + model_id = "facebook/opt-125m" + model = AutoModelForCausalLM.from_pretrained(model_id) + config = LoraConfig(init_lora_weights=False) + model = get_peft_model(model, config) + model.save_pretrained(tmp_path / "opt-lora") + del model + + # load directly into transformers model + model = AutoModelForCausalLM.from_pretrained(model_id) + model.load_adapter(tmp_path / "opt-lora") + + inputs = tokenizer("hello world", return_tensors="pt") + + model = model.eval() + + with torch.no_grad(): + logits_before_scaling = model(**inputs).logits + scales_before_scaling = 
self.get_scale_from_modules(model) + + with rescale_adapter_scale(model=model, multiplier=0.5): + scales_during_scaling = self.get_scale_from_modules(model) + for key in scales_before_scaling.keys(): + assert scales_before_scaling[key] != scales_during_scaling[key] + with torch.no_grad(): + logits_during_scaling = model(**inputs).logits + assert not torch.allclose(logits_before_scaling, logits_during_scaling) + scales_after_scaling = self.get_scale_from_modules(model) + + for key in scales_before_scaling.keys(): + assert scales_before_scaling[key] == scales_after_scaling[key] + + with torch.no_grad(): + logits_after_scaling = model(**inputs).logits + + assert torch.allclose(logits_before_scaling, logits_after_scaling) + + def test_multi_adapters(self, tokenizer): + model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") + lora_config = LoraConfig( + r=4, + lora_alpha=4, + target_modules=["k_proj", "v_proj"], + lora_dropout=0.1, + bias="none", + init_lora_weights=False, + ) + model = get_peft_model(model, lora_config) + inputs = tokenizer("hello world", return_tensors="pt") + + # add another adapter and activate it + model.add_adapter("other", lora_config) + model.set_adapter("other") + + scales_before_scaling = self.get_scale_from_modules(model) + model.eval() + with torch.no_grad(): + logits_before = model(**inputs).logits + + with rescale_adapter_scale(model=model, multiplier=0.5): + scales_during_scaling = self.get_scale_from_modules(model) + for key in scales_before_scaling.keys(): + assert scales_before_scaling[key] != scales_during_scaling[key] + + with torch.no_grad(): + logits_during = model(**inputs).logits + + assert not torch.allclose(logits_before, logits_during) + + scales_after_scaling = self.get_scale_from_modules(model) + for key in scales_before_scaling.keys(): + assert scales_before_scaling[key] == scales_after_scaling[key] + + with torch.no_grad(): + logits_after = model(**inputs).logits + + assert torch.allclose(logits_before, 
logits_after) + + def test_rank_alpha_pattern(self, tokenizer): + model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") + lora_config = LoraConfig( + r=4, + lora_alpha=4, + target_modules=["k_proj", "v_proj"], + lora_dropout=0.1, + bias="none", + init_lora_weights=False, + rank_pattern={"k_proj": 2}, + alpha_pattern={"k_proj": 8}, + ) + + model = get_peft_model(model, lora_config) + model.eval() + inputs = tokenizer("hello world", return_tensors="pt") + + with torch.no_grad(): + logits_before_scaling = model(**inputs).logits + + scales_before_scaling = self.get_scale_from_modules(model) + + with rescale_adapter_scale(model=model, multiplier=0.5): + scales_during_scaling = self.get_scale_from_modules(model) + for key in scales_before_scaling.keys(): + assert scales_before_scaling[key] != scales_during_scaling[key] + + with torch.no_grad(): + logits_during_scaling = model(**inputs).logits + + assert not torch.allclose(logits_before_scaling, logits_during_scaling) + + scales_after_scaling = self.get_scale_from_modules(model) + for key in scales_before_scaling.keys(): + assert scales_before_scaling[key] == scales_after_scaling[key] + + with torch.no_grad(): + logits_after_scaling = model(**inputs).logits + + assert torch.allclose(logits_before_scaling, logits_after_scaling) + + def test_merging_adapter(self, tokenizer): + model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") + lora_config = LoraConfig( + r=4, + lora_alpha=4, + target_modules=["k_proj", "v_proj"], + lora_dropout=0.1, + bias="none", + init_lora_weights=False, + ) + + model = get_peft_model(model, lora_config) + model.eval() + inputs = tokenizer("hello world", return_tensors="pt") + + with rescale_adapter_scale(model=model, multiplier=0.5): + with torch.no_grad(): + logits_unmerged_scaling = model(**inputs).logits + model = model.merge_and_unload() + + with torch.no_grad(): + logits_merged_scaling = model(**inputs).logits + + assert torch.allclose(logits_merged_scaling, 
logits_unmerged_scaling, atol=1e-4, rtol=1e-4) + + +class TestDisableInputDtypeCasting: + """Test the context manager `disable_input_dtype_casting` that temporarily disables input dtype casting + in the model. + + The test works as follows: + + We create a simple MLP and convert it to a PeftModel. The model dtype is set to float16. Then a pre-forward hook is + added that casts the model parameters to float32. Moreover, a post-forward hook is added that casts the weights + back to float16. The input dtype is float32. + + Without the disable_input_dtype_casting context, what would happen is that PEFT detects that the input dtype is + float32 but the weight dtype is float16, so it casts the input to float16. Then the pre-forward hook casts the + weight to float32, which results in a RuntimeError. + + With the disable_input_dtype_casting context, the input dtype is left as float32 and there is no error. We also add + a hook to record the dtype of the result from the LoraLayer to ensure that it is indeed float32. 
+ + """ + + device = infer_device() + dtype_record = [] + + @torch.no_grad() + def cast_params_to_fp32_pre_hook(self, module, input): + for param in module.parameters(recurse=False): + param.data = param.data.float() + return input + + @torch.no_grad() + def cast_params_to_fp16_hook(self, module, input, output): + for param in module.parameters(recurse=False): + param.data = param.data.half() + return output + + def record_dtype_hook(self, module, input, output): + self.dtype_record.append(output[0].dtype) + + @pytest.fixture + def inputs(self): + return torch.randn(4, 10, device=self.device, dtype=torch.float32) + + @pytest.fixture + def base_model(self): + class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.lin1 = nn.Linear(20, 2, bias=bias) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = self.lin0(X) + X = self.lin1(X) + X = self.sm(X) + return X + + return MLP() + + @pytest.fixture + def model(self, base_model): + config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + model = get_peft_model(base_model, config).to(device=self.device, dtype=torch.float16) + # Register hooks on the submodule that holds parameters + for module in model.modules(): + if sum(p.numel() for p in module.parameters()) > 0: + module.register_forward_pre_hook(self.cast_params_to_fp32_pre_hook) + module.register_forward_hook(self.cast_params_to_fp16_hook) + if isinstance(module, LoraLayer): + module.register_forward_hook(self.record_dtype_hook) + return model + + def test_disable_input_dtype_casting_active(self, model, inputs): + self.dtype_record.clear() + with disable_input_dtype_casting(model, active=True): + model(inputs) + assert self.dtype_record == [torch.float32] + + def test_no_disable_input_dtype_casting(self, model, inputs): + msg = r"expected m.*1 and m.*2 to have the same dtype" + with pytest.raises(RuntimeError, match=msg): + model(inputs) + + def 
test_disable_input_dtype_casting_inactive(self, model, inputs): + msg = r"expected m.*1 and m.*2 to have the same dtype" + with pytest.raises(RuntimeError, match=msg): + with disable_input_dtype_casting(model, active=False): + model(inputs) + + def test_disable_input_dtype_casting_inactive_after_existing_context(self, model, inputs): + # this is to ensure that when the context is left, we return to the previous behavior + with disable_input_dtype_casting(model, active=True): + model(inputs) + + # after the context exited, we're back to the error + msg = r"expected m.*1 and m.*2 to have the same dtype" + with pytest.raises(RuntimeError, match=msg): + model(inputs) diff --git a/peft/tests/test_hub_features.py b/peft/tests/test_hub_features.py new file mode 100644 index 0000000000000000000000000000000000000000..f705167c08f26d0ec7df854b6e4b06d0e4a5c085 --- /dev/null +++ b/peft/tests/test_hub_features.py @@ -0,0 +1,236 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy + +import pytest +import torch +from huggingface_hub import ModelCard +from transformers import AutoModelForCausalLM + +from peft import AutoPeftModelForCausalLM, BoneConfig, LoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model + +from .testing_utils import hub_online_once + + +PEFT_MODELS_TO_TEST = [("peft-internal-testing/test-lora-subfolder", "test")] + + +class PeftHubFeaturesTester: + # TODO remove when/if Hub is more stable + @pytest.mark.xfail(reason="Test is flaky on CI", raises=ValueError) + def test_subfolder(self): + r""" + Test if subfolder argument works as expected + """ + for model_id, subfolder in PEFT_MODELS_TO_TEST: + config = PeftConfig.from_pretrained(model_id, subfolder=subfolder) + + model = AutoModelForCausalLM.from_pretrained( + config.base_model_name_or_path, + ) + model = PeftModel.from_pretrained(model, model_id, subfolder=subfolder) + + assert isinstance(model, PeftModel) + + +class TestLocalModel: + def test_local_model_saving_no_warning(self, recwarn, tmp_path): + # When the model is saved, the library checks for vocab changes by + # examining `config.json` in the model path. + # However, previously, those checks only covered huggingface hub models. + # This test makes sure that the local `config.json` is checked as well. + # If `save_pretrained` could not find the file, it will issue a warning. 
class TestBaseModelRevision:
    """Checks around the base model ``revision`` that is stored in the PEFT config."""

    def test_save_and_load_base_model_revision(self, tmp_path):
        r"""
        A PeftModel built on a pinned base model revision must, after saving and re-loading through AutoPeftModel,
        report the same revision and produce the same logits
        """
        base_model_id = "peft-internal-testing/tiny-random-BertModel"
        revision = "v2.0.0"
        lora_config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.0)
        test_inputs = torch.arange(10).reshape(-1, 1)

        pinned_base = AutoModelForCausalLM.from_pretrained(base_model_id, revision=revision).eval()
        pinned_peft = get_peft_model(pinned_base, lora_config, revision=revision)
        logits_pinned = pinned_peft(test_inputs).logits

        # sanity check: the "main" revision has to give different logits, otherwise the comparison below is vacuous
        main_base = AutoModelForCausalLM.from_pretrained(base_model_id, revision="main").eval()
        # work on a copy of the config: re-using the same object would change the `revision` of the first config
        # (and hence of the first model) in-place
        main_config = copy.deepcopy(lora_config)
        main_config.revision = "main"
        main_peft = get_peft_model(main_base, main_config, revision="main")
        logits_main = main_peft(test_inputs).logits
        assert not torch.allclose(logits_main, logits_pinned)

        # round trip: after save + load, the revision and the output must match the pinned model
        save_dir = tmp_path / "peft_model_revision"
        pinned_peft.save_pretrained(save_dir)
        reloaded = AutoPeftModelForCausalLM.from_pretrained(save_dir).eval()

        assert reloaded.peft_config["default"].revision == revision

        logits_reloaded = reloaded(test_inputs).logits
        assert torch.allclose(logits_pinned, logits_reloaded)

    # TODO remove when/if Hub is more stable
    @pytest.mark.xfail(reason="Test is flaky on CI", raises=ValueError)
    def test_load_different_peft_and_base_model_revision(self, tmp_path):
        r"""
        Load an AutoPeftModel from the hub whose PEFT revision differs from the base model revision and check that the
        config keeps the base model's values
        """
        peft_model_id = "peft-internal-testing/tiny-random-BertModel-lora"
        peft_model_revision = "v1.2.3"
        expected_base_model_id = "hf-internal-testing/tiny-random-BertModel"
        expected_base_revision = None

        peft_model = AutoPeftModelForCausalLM.from_pretrained(peft_model_id, revision=peft_model_revision).eval()
        loaded_config = peft_model.peft_config["default"]

        assert loaded_config.base_model_name_or_path == expected_base_model_id
        assert loaded_config.revision == expected_base_revision
"lora"], + [], + "text-generation", + ), + ], + ) + @pytest.mark.parametrize( + "pre_tags", + [ + ["tag1", "tag2"], + [], + ], + ) + def test_model_card_has_expected_tags( + self, model_id, peft_config, tags, excluded_tags, pipeline_tag, pre_tags, tmp_path + ): + """Make sure that PEFT sets the tags in the model card automatically and correctly. + This is important so that a) the models are searchable on the Hub and also 2) some features depend on it to + decide how to deal with them (e.g., inference). + + Makes sure that the base model tags are still present (if there are any). + """ + with hub_online_once(model_id): + base_model = AutoModelForCausalLM.from_pretrained(model_id) + + if pre_tags: + base_model.add_model_tags(pre_tags) + + peft_model = get_peft_model(base_model, peft_config) + save_path = tmp_path / "adapter" + + peft_model.save_pretrained(save_path) + + model_card = ModelCard.load(save_path / "README.md") + assert set(tags).issubset(set(model_card.data.tags)) + + if excluded_tags: + assert set(excluded_tags).isdisjoint(set(model_card.data.tags)) + + if pre_tags: + assert set(pre_tags).issubset(set(model_card.data.tags)) + + if pipeline_tag: + assert model_card.data.pipeline_tag == pipeline_tag + + @pytest.fixture + def custom_model_cls(self): + class MyNet(torch.nn.Module): + def __init__(self): + super().__init__() + self.l1 = torch.nn.Linear(10, 20) + self.l2 = torch.nn.Linear(20, 1) + + def forward(self, X): + return self.l2(self.l1(X)) + + return MyNet + + def test_custom_models_dont_have_transformers_tag(self, custom_model_cls, tmp_path): + base_model = custom_model_cls() + peft_config = LoraConfig(target_modules="all-linear") + peft_model = get_peft_model(base_model, peft_config) + + peft_model.save_pretrained(tmp_path) + + model_card = ModelCard.load(tmp_path / "README.md") + + assert model_card.data.tags is not None + assert "transformers" not in model_card.data.tags + + def test_custom_peft_type_does_not_raise(self, tmp_path): + # Passing a 
string value as peft_type value in the config is valid, so it should work. + # See https://github.com/huggingface/peft/issues/2634 + model_id = "hf-internal-testing/tiny-random-Gemma3ForCausalLM" + with hub_online_once(model_id): + base_model = AutoModelForCausalLM.from_pretrained(model_id) + peft_config = LoraConfig() + + # We simulate a custom PEFT type by using a string value of an existing method. This skips the need for + # registering a new method but tests the case where we pass a string value instead of an enum. + peft_type = "LORA" + peft_config.peft_type = peft_type + + peft_model = get_peft_model(base_model, peft_config) + peft_model.save_pretrained(tmp_path) diff --git a/peft/tests/test_incremental_pca.py b/peft/tests/test_incremental_pca.py new file mode 100644 index 0000000000000000000000000000000000000000..8240899d1b9310ade41495febb3c65242e6d9957 --- /dev/null +++ b/peft/tests/test_incremental_pca.py @@ -0,0 +1,188 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def test_incremental_pca(iris):
    """Incremental PCA on dense arrays explains the same share of variance as a full SVD-based PCA."""
    n_components = 2
    feature_columns = ("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm")
    X = torch.tensor([iris[column] for column in feature_columns]).T
    n_rows = X.shape[0]

    ipca = IncrementalPCA(n_components=n_components, batch_size=n_rows // 3)
    ipca.fit(X)
    X_transformed = ipca.transform(X)

    # reference PCA through a full SVD of the centered data
    centered = X - torch.mean(X, dim=0)
    _, singular_values, Vh = torch.linalg.svd(centered)
    # sign convention: make the largest-magnitude entry of each component positive
    dominant_cols = torch.argmax(torch.abs(Vh), dim=1)
    row_signs = torch.sign(Vh[range(Vh.shape[0]), dominant_cols])
    Vh *= row_signs.view(-1, 1)
    variance = singular_values**2 / (X.size(0) - 1)
    variance_ratio = variance / variance.sum()

    assert X_transformed.shape == (n_rows, 2)
    expected_ratio = variance_ratio[:n_components].sum().item()
    actual_ratio = ipca.explained_variance_ratio_.sum().item()
    assert_close(actual_ratio, expected_ratio, rtol=1e-3, atol=1e-3)
def test_incremental_pca_validation():
    """Invalid ``n_components`` values are rejected with a descriptive ``ValueError``."""
    X = torch.tensor([[0, 1, 0], [1, 0, 0]])
    n_samples, n_features = X.shape

    # n_components must not exceed n_features
    n_components = 4
    too_many_for_features = (
        f"n_components={n_components} invalid for n_features={n_features}, "
        "need more rows than columns for IncrementalPCA processing"
    )
    with pytest.raises(ValueError, match=too_many_for_features):
        IncrementalPCA(n_components, batch_size=10).fit(X)

    # n_components must not exceed n_samples of the batch either
    n_components = 3
    too_many_for_samples = (
        f"n_components={n_components} must be less or equal to the batch number of samples {n_samples}"
    )
    with pytest.raises(ValueError, match=too_many_for_samples):
        IncrementalPCA(n_components=n_components).partial_fit(X)
def test_incremental_pca_batch_signs():
    """The signs of ``components_`` do not depend on the batch size used for fitting."""
    n_samples, n_features = 100, 3
    X = torch.randn(n_samples, n_features)

    # one fit per batch size, all on the same data
    components_per_batch_size = [
        IncrementalPCA(n_components=None, batch_size=batch_size).fit(X).components_
        for batch_size in torch.arange(10, 20)
    ]

    # compare every fit with its successor
    for idx in range(len(components_per_batch_size) - 1):
        current = components_per_batch_size[idx]
        following = components_per_batch_size[idx + 1]
        assert_close(torch.sign(current), torch.sign(following), rtol=1e-6, atol=1e-6)
def test_incremental_pca_lowrank(iris):
    """Fitting with ``lowrank=True`` yields the same components as the default mode."""
    feature_columns = ("SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm")
    X = torch.tensor([iris[column] for column in feature_columns]).T
    shared_kwargs = {"n_components": 2, "batch_size": X.shape[0] // 3}

    # the default fit runs first, then the low-rank one, on identical data and settings
    ipca = IncrementalPCA(**shared_kwargs)
    ipca.fit(X)

    ipcalr = IncrementalPCA(lowrank=True, **shared_kwargs)
    ipcalr.fit(X)

    assert_close(ipca.components_, ipcalr.components_, rtol=1e-7, atol=1e-7)
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import itertools +import math +import platform +import re +import warnings +from collections import defaultdict +from contextlib import contextmanager +from copy import deepcopy +from unittest.mock import patch + +import pytest +import torch +from datasets import Dataset +from huggingface_hub import snapshot_download +from safetensors.torch import load_file +from scipy import stats +from torch import nn +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoTokenizer + +from peft import ( + AdaLoraConfig, + C3AConfig, + EvaConfig, + IA3Config, + LoftQConfig, + LoKrConfig, + LoraConfig, + PeftMixedModel, + PeftModel, + PeftModelForCausalLM, + PeftModelForFeatureExtraction, + PeftModelForQuestionAnswering, + PeftModelForSeq2SeqLM, + PeftModelForSequenceClassification, + PeftModelForTokenClassification, + PeftWarning, + PrefixTuningConfig, + PromptTuningConfig, + RoadConfig, + VBLoRAConfig, + VeraConfig, + WaveFTConfig, + get_eva_state_dict, + get_peft_model, + initialize_lora_eva_weights, + inject_adapter_in_model, + set_peft_model_state_dict, +) +from peft.mapping import PEFT_TYPE_TO_PREFIX_MAPPING +from peft.tuners.lora.config import CordaConfig +from peft.tuners.lora.corda import preprocess_corda +from peft.tuners.lora.layer import LoraLayer +from peft.utils import infer_device +from peft.utils.hotswap import hotswap_adapter, prepare_model_for_compiled_hotswap + +from .testing_utils import load_dataset_english_quotes, require_deterministic_for_xpu + + +try: + from huggingface_hub.utils import 
def get_uniform(self, amin, amax, size=(10000,)):
    """Draw ``size`` samples from a uniform distribution constructed with bounds ``amin`` and ``amax``."""
    return torch.distributions.uniform.Uniform(amin, amax).sample(size)
def test_lora_linear_init_gaussian(self):
    """With ``init_lora_weights="gaussian"``, LoRA A of a linear layer is normally distributed and B is zero."""
    torch.manual_seed(0)

    config = LoraConfig(target_modules=["linear"], init_lora_weights="gaussian")
    peft_model = get_peft_model(self.get_model(), config)
    weight_A = peft_model.linear.lora_A["default"].weight
    weight_B = peft_model.linear.lora_B["default"].weight
    observed = weight_A.detach().flatten().cpu().numpy()

    # Kolmogorov-Smirnov test against samples from the expected normal distribution: must not be rejected
    normal_reference = self.get_normal(0.0, 1 / config.r).flatten().cpu().numpy()
    assert stats.kstest(observed, normal_reference).pvalue > 0.5

    # the same test against a uniform distribution over the observed value range: must be rejected
    uniform_reference = self.get_uniform(weight_A.min().item(), weight_A.max().item()).flatten().cpu().numpy()
    assert stats.kstest(observed, uniform_reference).pvalue < 0.05

    # weight B has to be exactly zero
    assert (weight_B == 0.0).all()
def test_lora_embedding_false(self):
    """With ``init_lora_weights=False`` the embedding adapter must not start out as an identity transformation."""
    torch.manual_seed(0)

    model = self.get_model()
    config = LoraConfig(target_modules=["embed"], init_lora_weights=False)
    model = get_peft_model(model, config)
    # For embeddings the roles are swapped compared to linear layers: the default initialization zeroes
    # ``lora_embedding_A`` and draws ``lora_embedding_B`` from a normal distribution (this is what the default and
    # gaussian embedding tests assert). Checking B, as this test did before, passes no matter how
    # ``init_lora_weights`` is set; A is the weight that actually tells the two settings apart.
    weight_A = model.embed.lora_embedding_A["default"]

    # with init_lora_weights=False, weight A should *not* be zero. We don't care so much about the actual values
    # as long as they are not zero, in order to avoid identity transformation.
    assert not torch.allclose(weight_A, torch.zeros_like(weight_A))
def test_lora_init_orthogonal(self):
    """Orthogonal init gives non-zero A and B whose product ``B @ A`` is (numerically) zero."""
    torch.manual_seed(0)

    config = LoraConfig(target_modules=["linear"], init_lora_weights="orthogonal")
    peft_model = get_peft_model(self.get_model(), config)
    lora_layer = peft_model.linear

    weight_A = lora_layer.lora_A["default"].weight
    weight_B = lora_layer.lora_B["default"].weight

    # neither factor may be all zeros ...
    for weight in (weight_A, weight_B):
        assert not torch.allclose(weight, torch.zeros_like(weight))
    # ... yet their product has to vanish
    largest_entry = (weight_B @ weight_A).abs().max()
    assert largest_entry < 1e-6
def test_lora_scaling_default(self):
    """Without rsLoRA, every adapted layer type uses the scaling factor ``lora_alpha / r``."""
    torch.manual_seed(0)

    target_names = ["linear", "embed", "conv2d"]
    # check scaling factor use_rslora=False
    config = LoraConfig(target_modules=target_names, lora_alpha=3, r=16, use_rslora=False)
    peft_model = get_peft_model(self.get_model(), config)

    expected_scaling = config.lora_alpha / config.r

    for layer_name in ("linear", "embed", "conv2d"):
        adapted_layer = getattr(peft_model, layer_name)
        assert adapted_layer.scaling["default"] == expected_scaling
def test_lora_olora_linear_init_default(self, data):
    """Right after wrapping with OLoRA initialization, the linear output equals the base model's output."""
    base = self.get_model()
    expected = base(data)[0]

    # only the mixed-case spelling "OLoRA" is exercised here
    olora_config = LoraConfig(init_lora_weights="OLoRA", target_modules=["linear"])
    wrapped = get_peft_model(deepcopy(base), olora_config)
    actual = wrapped(data)[0]

    assert torch.allclose(expected, actual, atol=1e-06)
def test_lora_pissa_conversion_same_output_after_loading_with_rank_pattern(self, data, tmp_path):
    """PiSSA save/load and conversion round trip when the rank of the layer comes from ``rank_pattern``.

    Three checkpoints are written below ``tmp_path``: the freshly initialized adapter ("init-model"), the modified
    adapter saved as is ("pissa-model") and the modified adapter saved with weight conversion
    ("pissa-model-converted"). Both re-loaded models must reproduce the output of the modified adapter.
    """
    model = self.get_model()
    output_base = model(data)[0]

    # use rank_pattern here; note that since there is only a single linear layer, r is completely overridden
    config = LoraConfig(init_lora_weights="pissa", target_modules=["linear"], r=8, rank_pattern={"linear": 32})
    peft_model = get_peft_model(deepcopy(model), config)
    # save the initial model; init_lora_weights is switched to True only for this save and restored right after
    peft_model.peft_config["default"].init_lora_weights = True
    peft_model.save_pretrained(tmp_path / "init-model")
    peft_model.peft_config["default"].init_lora_weights = "pissa"

    # modify the weights, or else the adapter performs an identity transformation
    peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0
    output_pissa = peft_model(data)[0]

    # sanity check: the modified adapter must actually change the output
    tol = 1e-06
    assert not torch.allclose(output_base, output_pissa, atol=tol, rtol=tol)

    # save the model normally
    peft_model.save_pretrained(tmp_path / "pissa-model")
    model_loaded = PeftModel.from_pretrained(deepcopy(model), tmp_path / "pissa-model")
    output_loaded = model_loaded(data)[0]

    assert torch.allclose(output_pissa, output_loaded, atol=tol, rtol=tol)
    # sanity check: the config still reports r=8, while the rank of the linear layer is the 32 from rank_pattern
    assert model_loaded.peft_config["default"].r == 8
    assert model_loaded.base_model.model.linear.lora_A["default"].weight.shape[0] == 32
    # sanity check: the base model weights were indeed changed
    assert not torch.allclose(
        model.linear.weight, model_loaded.base_model.model.linear.base_layer.weight, atol=tol, rtol=tol
    )

    # save the model with conversion, pointing to the checkpoint of the initial adapter saved above
    peft_model.save_pretrained(
        tmp_path / "pissa-model-converted", path_initial_model_for_weight_conversion=tmp_path / "init-model"
    )
    model_converted = PeftModel.from_pretrained(deepcopy(model), tmp_path / "pissa-model-converted")
    output_converted = model_converted(data)[0]

    assert torch.allclose(output_pissa, output_converted, atol=tol, rtol=tol)
    # ranks should be double of what they were initially: r goes from 8 to 16, the layer's rank from 32 to 64
    assert model_converted.peft_config["default"].r == 16
    assert model_converted.base_model.model.linear.lora_A["default"].weight.shape[0] == 64
    # base model weights should be the same as the initial model
    assert torch.allclose(
        model.linear.weight, model_converted.base_model.model.linear.base_layer.weight, atol=tol, rtol=tol
    )
target_modules=["linear"], alpha_pattern={"linear": 5}) + peft_model = get_peft_model(deepcopy(model), config) + # save the initial model + peft_model.peft_config["default"].init_lora_weights = True + peft_model.save_pretrained(tmp_path / "init-model") + peft_model.peft_config["default"].init_lora_weights = "pissa" + + # modify the weights, or else the adapter performs an identity transformation + peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0 + output_pissa = peft_model(data)[0] + + # sanity check + tol = 1e-06 + assert not torch.allclose(output_base, output_pissa, atol=tol, rtol=tol) + + # save the model normally + peft_model.save_pretrained(tmp_path / "pissa-model") + model_loaded = PeftModel.from_pretrained(deepcopy(model), tmp_path / "pissa-model") + output_loaded = model_loaded(data)[0] + + assert torch.allclose(output_pissa, output_loaded, atol=tol, rtol=tol) + # sanity check: ranks should still be 8 as initially + assert model_loaded.peft_config["default"].r == 8 + assert model_loaded.base_model.model.linear.lora_A["default"].weight.shape[0] == 8 + assert model_loaded.base_model.model.linear.scaling["default"] == 5 / 8 + # sanity check: the base model weights were indeed changed + assert not torch.allclose( + model.linear.weight, model_loaded.base_model.model.linear.base_layer.weight, atol=tol, rtol=tol + ) + + # save the model with conversion + peft_model.save_pretrained( + tmp_path / "pissa-model-converted", path_initial_model_for_weight_conversion=tmp_path / "init-model" + ) + model_converted = PeftModel.from_pretrained(deepcopy(model), tmp_path / "pissa-model-converted") + output_converted = model_converted(data)[0] + + assert torch.allclose(output_pissa, output_converted, atol=tol, rtol=tol) + # rank should be double of what it was initially + assert model_converted.peft_config["default"].r == 16 + assert model_converted.base_model.model.linear.lora_A["default"].weight.shape[0] == 16 + assert 
model_converted.base_model.model.linear.scaling["default"] == 10 / 16 + # base model weights should be the same as the initial model + assert torch.allclose( + model.linear.weight, model_converted.base_model.model.linear.base_layer.weight, atol=tol, rtol=tol + ) + + def test_lora_pissa_conversion_same_output_after_loading_with_rslora(self, data, tmp_path): + model = self.get_model() + output_base = model(data)[0] + + config = LoraConfig(init_lora_weights="pissa", target_modules=["linear"], r=8, use_rslora=True) + peft_model = get_peft_model(deepcopy(model), config) + # save the initial model + peft_model.peft_config["default"].init_lora_weights = True + peft_model.save_pretrained(tmp_path / "init-model") + peft_model.peft_config["default"].init_lora_weights = "pissa" + + # modify the weights, or else the adapter performs an identity transformation + peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0 + output_pissa = peft_model(data)[0] + + # sanity check + tol = 1e-06 + assert not torch.allclose(output_base, output_pissa, atol=tol, rtol=tol) + + # save the model normally + peft_model.save_pretrained(tmp_path / "pissa-model") + model_loaded = PeftModel.from_pretrained(deepcopy(model), tmp_path / "pissa-model") + output_loaded = model_loaded(data)[0] + + assert torch.allclose(output_pissa, output_loaded, atol=tol, rtol=tol) + # sanity check: ranks should still be 8 as initially + assert model_loaded.peft_config["default"].r == 8 + assert model_loaded.base_model.model.linear.lora_A["default"].weight.shape[0] == 8 + assert model_loaded.base_model.model.linear.scaling["default"] == 8 / (8**0.5) + # sanity check: the base model weights were indeed changed + assert not torch.allclose( + model.linear.weight, model_loaded.base_model.model.linear.base_layer.weight, atol=tol, rtol=tol + ) + + # save the model with conversion + peft_model.save_pretrained( + tmp_path / "pissa-model-converted", path_initial_model_for_weight_conversion=tmp_path / "init-model" + ) + 
def test_pissa_rank_pattern_and_rslora_raises(self, tmp_path):
    """Saving with conversion must raise when PiSSA is combined with rslora and ``rank_pattern``.

    The scale is not stored in the state_dict, so the correct scale cannot be determined when rslora is
    used together with a rank or alpha pattern.
    """
    init_dir = tmp_path / "init-model"
    config = LoraConfig(
        init_lora_weights="pissa",
        target_modules=["linear"],
        r=8,
        rank_pattern={"linear": 2},
        use_rslora=True,
    )
    peft_model = get_peft_model(self.get_model(), config)
    peft_model.save_pretrained(init_dir)

    expected_error = re.escape("Passing `path_initial_model_for_weight_conversion` to `save_pretrained`")
    with pytest.raises(ValueError, match=expected_error):
        peft_model.save_pretrained(tmp_path / "pissa-model", path_initial_model_for_weight_conversion=init_dir)
"init-model") + + msg = re.escape("Passing `path_initial_model_for_weight_conversion` to `save_pretrained`") + with pytest.raises(ValueError, match=msg): + peft_model.save_pretrained( + tmp_path / "pissa-model", path_initial_model_for_weight_conversion=tmp_path / "init-model" + ) + + def test_olora_conversion_same_output_after_loading(self, data, tmp_path): + model = self.get_model() + output_base = model(data)[0] + + config = LoraConfig(init_lora_weights="olora", target_modules=["linear"], r=8) + peft_model = get_peft_model(deepcopy(model), config) + # save the initial model + peft_model.save_pretrained(tmp_path / "init-model") + + # modify the weights, or else the adapter performs an identity transformation + peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0 + output_olora = peft_model(data)[0] + + # sanity check + tol = 1e-06 + assert not torch.allclose(output_base, output_olora, atol=tol, rtol=tol) + + # save the model normally + peft_model.save_pretrained(tmp_path / "olora-model") + model_loaded = PeftModel.from_pretrained(deepcopy(model), tmp_path / "olora-model") + output_loaded = model_loaded(data)[0] + + assert torch.allclose(output_olora, output_loaded, atol=tol, rtol=tol) + # sanity check: ranks should still be 8 as initially + assert model_loaded.peft_config["default"].r == 8 + assert model_loaded.base_model.model.linear.lora_A["default"].weight.shape[0] == 8 + # sanity check: the base model weights were indeed changed + assert not torch.allclose( + model.linear.weight, model_loaded.base_model.model.linear.base_layer.weight, atol=tol, rtol=tol + ) + + # save the model with conversion + peft_config_keys_before = list(peft_model.peft_config.keys()) + peft_config_dict_before = peft_model.peft_config["default"].to_dict() + peft_model.save_pretrained( + tmp_path / "olora-model-converted", path_initial_model_for_weight_conversion=tmp_path / "init-model" + ) + peft_config_keys_after = list(peft_model.peft_config.keys()) + peft_config_dict_after 
= peft_model.peft_config["default"].to_dict() + assert peft_config_keys_before == peft_config_keys_after + assert peft_config_dict_before == peft_config_dict_after + + model_converted = PeftModel.from_pretrained(deepcopy(model), tmp_path / "olora-model-converted") + output_converted = model_converted(data)[0] + + assert torch.allclose(output_olora, output_converted, atol=tol, rtol=tol) + # rank should be double of what it was initially + assert model_converted.peft_config["default"].r == 16 + assert model_converted.base_model.model.linear.lora_A["default"].weight.shape[0] == 16 + # base model weights should be the same as the initial model + assert torch.allclose( + model.linear.weight, model_converted.base_model.model.linear.base_layer.weight, atol=tol, rtol=tol + ) + + def test_olora_conversion_same_output_after_loading_with_rank_pattern(self, data, tmp_path): + # same as above, but using rank_pattern + model = self.get_model() + output_base = model(data)[0] + + # use rank_pattern here; note that since there is only a single linear layer, r is completely overridden + config = LoraConfig(init_lora_weights="olora", target_modules=["linear"], r=8, rank_pattern={"linear": 32}) + peft_model = get_peft_model(deepcopy(model), config) + # save the initial model + peft_model.save_pretrained(tmp_path / "init-model") + + # modify the weights, or else the adapter performs an identity transformation + peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0 + output_olora = peft_model(data)[0] + + # sanity check + tol = 1e-06 + assert not torch.allclose(output_base, output_olora, atol=tol, rtol=tol) + + # save the model normally + peft_model.save_pretrained(tmp_path / "olora-model") + model_loaded = PeftModel.from_pretrained(deepcopy(model), tmp_path / "olora-model") + output_loaded = model_loaded(data)[0] + + assert torch.allclose(output_olora, output_loaded, atol=tol, rtol=tol) + # sanity check: ranks should still be 8 as initially + assert 
model_loaded.peft_config["default"].r == 8 + assert model_loaded.base_model.model.linear.lora_A["default"].weight.shape[0] == 32 + # sanity check: the base model weights were indeed changed + assert not torch.allclose( + model.linear.weight, model_loaded.base_model.model.linear.base_layer.weight, atol=tol, rtol=tol + ) + + # save the model with conversion + peft_model.save_pretrained( + tmp_path / "olora-model-converted", path_initial_model_for_weight_conversion=tmp_path / "init-model" + ) + model_converted = PeftModel.from_pretrained(deepcopy(model), tmp_path / "olora-model-converted") + output_converted = model_converted(data)[0] + + assert torch.allclose(output_olora, output_converted, atol=tol, rtol=tol) + # rank should be double of what it was initially + assert model_converted.peft_config["default"].r == 16 + assert model_converted.base_model.model.linear.lora_A["default"].weight.shape[0] == 64 + # base model weights should be the same as the initial model + assert torch.allclose( + model.linear.weight, model_converted.base_model.model.linear.base_layer.weight, atol=tol, rtol=tol + ) + + def test_olora_conversion_same_output_after_loading_with_alpha_pattern(self, data, tmp_path): + # same as above, but using alpha_pattern + model = self.get_model() + output_base = model(data)[0] + + # use alpha_pattern here; note that since there is only a single linear layer, lora_alpha is completely + # overridden + config = LoraConfig(init_lora_weights="olora", target_modules=["linear"], alpha_pattern={"linear": 5}) + peft_model = get_peft_model(deepcopy(model), config) + # save the initial model + peft_model.save_pretrained(tmp_path / "init-model") + + # modify the weights, or else the adapter performs an identity transformation + peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0 + output_olora = peft_model(data)[0] + + # sanity check + tol = 1e-06 + assert not torch.allclose(output_base, output_olora, atol=tol, rtol=tol) + + # save the model normally + 
def test_olora_conversion_same_output_after_loading_with_rslora(self, data, tmp_path):
    """OLoRA save/convert round trip with ``use_rslora=True``.

    Checks that a normally saved and a converted checkpoint both reproduce the output of the modified
    adapter, and that the scaling of the converted adapter matches the original scaling.
    """
    tol = 1e-06
    expected_scale = 8 / (8**0.5)
    init_dir = tmp_path / "init-model"
    plain_dir = tmp_path / "olora-model"
    converted_dir = tmp_path / "olora-model-converted"

    model = self.get_model()
    reference_output = model(data)[0]

    config = LoraConfig(init_lora_weights="olora", target_modules=["linear"], r=8, use_rslora=True)
    peft_model = get_peft_model(deepcopy(model), config)
    # keep a copy of the untouched adapter; it is needed later for the conversion
    peft_model.save_pretrained(init_dir)

    # without changing the weights the adapter would be an identity transformation
    peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0
    output_olora = peft_model(data)[0]

    # sanity check: the modified adapter really changes the output
    assert not torch.allclose(reference_output, output_olora, atol=tol, rtol=tol)

    # --- regular save and reload ---
    peft_model.save_pretrained(plain_dir)
    model_loaded = PeftModel.from_pretrained(deepcopy(model), plain_dir)
    loaded_linear = model_loaded.base_model.model.linear
    output_loaded = model_loaded(data)[0]

    assert torch.allclose(output_olora, output_loaded, atol=tol, rtol=tol)
    # sanity check: rank and scaling are unchanged by a plain save/load
    assert model_loaded.peft_config["default"].r == 8
    assert loaded_linear.lora_A["default"].weight.shape[0] == 8
    assert loaded_linear.scaling["default"] == expected_scale
    # sanity check: the base weight of the reloaded model differs from the original weight
    assert not torch.allclose(model.linear.weight, loaded_linear.base_layer.weight, atol=tol, rtol=tol)

    # --- save with conversion and reload ---
    peft_model.save_pretrained(converted_dir, path_initial_model_for_weight_conversion=init_dir)
    model_converted = PeftModel.from_pretrained(deepcopy(model), converted_dir)
    converted_linear = model_converted.base_model.model.linear
    output_converted = model_converted(data)[0]

    assert torch.allclose(output_olora, output_converted, atol=tol, rtol=tol)
    # the rank is doubled by the conversion
    assert model_converted.peft_config["default"].r == 16
    assert converted_linear.lora_A["default"].weight.shape[0] == 16
    # the scale is the same as before, up to floating point imprecision
    assert converted_linear.scaling["default"] == pytest.approx(expected_scale)
    # the converted adapter sits on top of the original base weight again
    assert torch.allclose(model.linear.weight, converted_linear.base_layer.weight, atol=tol, rtol=tol)
"use_rslora": True}, False), + ({"init_lora_weights": "olora", "target_modules": ["linear"], "use_rslora": True}, False), + ({"init_lora_weights": "pissa", "target_modules": ["linear"], "rank_pattern": {"linear": 8}}, False), + ( + {"init_lora_weights": "pissa_niter_3", "target_modules": ["linear"], "rank_pattern": {"linear": 8}}, + False, + ), + ({"init_lora_weights": "olora", "target_modules": ["linear"], "rank_pattern": {"linear": 8}}, False), + ({"init_lora_weights": "pissa", "target_modules": ["linear"], "alpha_pattern": {"linear": 8}}, False), + ( + {"init_lora_weights": "pissa_niter_3", "target_modules": ["linear"], "alpha_pattern": {"linear": 8}}, + False, + ), + ({"init_lora_weights": "olora", "target_modules": ["linear"], "alpha_pattern": {"linear": 8}}, False), + # warning + ( + { + "init_lora_weights": "pissa", + "target_modules": ["linear"], + "use_rslora": True, + "rank_pattern": {"linear": 8}, + }, + True, + ), + ( + { + "init_lora_weights": "pissa_niter_3", + "target_modules": ["linear"], + "use_rslora": True, + "rank_pattern": {"linear": 8}, + }, + True, + ), + ( + { + "init_lora_weights": "olora", + "target_modules": ["linear"], + "use_rslora": True, + "rank_pattern": {"linear": 8}, + }, + True, + ), + ( + { + "init_lora_weights": "pissa", + "target_modules": ["linear"], + "use_rslora": True, + "alpha_pattern": {"linear": 8}, + }, + True, + ), + ( + { + "init_lora_weights": "pissa_niter_3", + "target_modules": ["linear"], + "use_rslora": True, + "alpha_pattern": {"linear": 8}, + }, + True, + ), + ( + { + "init_lora_weights": "olora", + "target_modules": ["linear"], + "use_rslora": True, + "alpha_pattern": {"linear": 8}, + }, + True, + ), + ( + { + "init_lora_weights": "pissa", + "target_modules": ["linear"], + "use_rslora": True, + "rank_pattern": {"linear": 8}, + "alpha_pattern": {"linear": 8}, + }, + True, + ), + ( + { + "init_lora_weights": "pissa_niter_3", + "target_modules": ["linear"], + "use_rslora": True, + "rank_pattern": {"linear": 8}, + 
@pytest.mark.parametrize("init_method", ["pissa", "olora"])
@pytest.mark.parametrize("pissa_olora_loaded_first", [False, True])
def test_load_pissa_olora_with_other_adapter_warns(self, init_method, pissa_olora_loaded_first, recwarn, tmp_path):
    """Loading a PiSSA/OLoRA adapter next to another adapter must emit a warning.

    Since PiSSA/OLoRA modifies the base weights, it should not be combined with other adapters. The check
    is done for both loading orders. See #2184.

    Args:
        init_method: ``"pissa"`` or ``"olora"``, used as ``init_lora_weights`` of the second adapter.
        pissa_olora_loaded_first: whether the PiSSA/OLoRA adapter is loaded before the plain adapter.
        recwarn: pytest fixture that records the warnings emitted during the test.
        tmp_path: pytest fixture providing a temporary directory for the saved adapters.
    """
    model_id = "hf-internal-testing/tiny-random-OPTForCausalLM"
    plain_path = tmp_path / "adapter0"
    special_path = tmp_path / "adapter1"

    # create an adapter without PiSSA/OLoRA
    model = AutoModelForCausalLM.from_pretrained(model_id)
    model = get_peft_model(model, LoraConfig(init_lora_weights=True))
    model.save_pretrained(plain_path)
    del model

    # create an adapter with PiSSA/OLoRA
    model = AutoModelForCausalLM.from_pretrained(model_id)
    model = get_peft_model(model, LoraConfig(init_lora_weights=init_method))
    model.save_pretrained(special_path)
    del model

    # decide the loading order
    if pissa_olora_loaded_first:
        path0, path1 = special_path, plain_path
    else:
        path0, path1 = plain_path, special_path

    model = AutoModelForCausalLM.from_pretrained(model_id)
    model = PeftModel.from_pretrained(model, path0)
    # do not rebind `model` here: the return value of load_adapter is not used by this test
    model.load_adapter(path1, adapter_name="other")

    if init_method == "pissa":
        msg = "PiSSA changes the base weights of the model and should thus not be used with other adapters"
    else:
        msg = "OLoRA changes the base weights of the model and should thus not be used with other adapters"
    assert any(str(w.message).startswith(msg) for w in recwarn.list)
def test_lora_rslora_scaling_pattern(self):
    """With ``use_rslora=True`` the scaling is ``alpha / sqrt(rank)``, honoring rank and alpha patterns.

    For each layer, the alpha comes from ``alpha_pattern`` if the layer is listed there and from
    ``lora_alpha`` otherwise; the rank comes from ``rank_pattern`` if listed and from ``r`` otherwise.
    """
    torch.manual_seed(0)

    config = LoraConfig(
        target_modules=["linear", "embed", "conv2d"],
        rank_pattern={"embed": 9, "conv2d": 16},
        alpha_pattern={"linear": 11, "conv2d": 13},
        lora_alpha=17,
        r=25,
        use_rslora=True,
    )
    model = get_peft_model(self.get_model(), config)

    for layer_name in ("linear", "embed", "conv2d"):
        alpha = config.alpha_pattern.get(layer_name, config.lora_alpha)
        rank = config.rank_pattern.get(layer_name, config.r)
        assert getattr(model, layer_name).scaling["default"] == alpha / (rank**0.5)
If + # modules_to_save/trainable_token_indices is also "foobar", there would be a match. + # Note: Theoretically, a lot more PEFT methods support modules_to_save, so would have to be tested, but the code + # path is the same for all of them, so only testing LoRA. + model = self.get_model() + + config = LoraConfig( + target_modules=["linear"], + modules_to_save=["foobar"], + ) + msg = ( + "You are trying to target a module with that is a child of " + ". This is almost certainly not the intended behavior. Please " + "ensure that the adapter name, 'foobar', does not conflict with any of the targeted modules." + ) + with pytest.raises(ValueError, match=msg): + get_peft_model(model, config, adapter_name="foobar") + + def test_trainable_token_indices_targets_lora_layer_raises(self): + # Same test as test_modules_to_save_targets_lora_layer_raises, but using trainable_token_indices + model = self.get_model() + + # check scaling factor use_rslora=True with rank and alpha pattern + config = LoraConfig(target_modules=["embed"], trainable_token_indices={"foobar": [1, 2, 3]}) + msg = ( + "You are trying to target a module with that is a child " + "of . This is almost certainly not the intended behavior. Please " + "ensure that the adapter name, 'foobar', does not conflict with any of the targeted modules." 
@require_deterministic_for_xpu
def test_lora_use_dora_linear(self, data):
    """A freshly initialized DoRA adapter on the linear layer must not change the model output."""
    torch.manual_seed(0)
    base_model = self.get_model()
    baseline, _, _ = base_model(data)

    # wrap the linear layer with a DoRA adapter using the default initialization
    dora_model = get_peft_model(base_model, LoraConfig(target_modules=["linear"], use_dora=True))

    # first with the adapter disabled, then with the adapter active
    with dora_model.disable_adapter():
        disabled, _, _ = dora_model(data)
    enabled, _, _ = dora_model(data)

    assert torch.allclose(baseline, disabled)
    assert torch.allclose(baseline, enabled)
def test_mha_exposes_attributes(self, mha_cls):
    """The LoRA-wrapped MultiheadAttention must expose the attributes of the original module.

    Each attribute is read from the plain module first and compared with the wrapped module afterwards.
    The kind of comparison depends on the attribute:

    - ``"eq"``: compared with ``==``
    - ``"tensor"``: compared with ``torch.allclose``, or both sides must be ``None``
    - ``"same"``: must be the very same object
    """
    checks = [
        ("embed_dim", "eq"),
        ("kdim", "eq"),
        ("vdim", "eq"),
        ("_qkv_same_embed_dim", "eq"),
        ("num_heads", "eq"),
        ("dropout", "eq"),
        ("batch_first", "eq"),
        ("head_dim", "eq"),
        ("in_proj_weight", "tensor"),
        ("in_proj_bias", "tensor"),
        ("out_proj", "same"),
        ("bias_k", "tensor"),
        ("bias_v", "tensor"),
        ("add_zero_attn", "eq"),
    ]

    model = mha_cls()
    # record the reference values before the module is wrapped
    expected = {attr: getattr(model.mha, attr) for attr, _ in checks}

    peft_model = get_peft_model(model, LoraConfig(target_modules=["mha"]))
    wrapped_mha = peft_model.base_model.mha

    for attr, how in checks:
        reference = expected[attr]
        actual = getattr(wrapped_mha, attr)
        if how == "same":
            assert actual is reference
        elif how == "tensor" and reference is not None:
            assert torch.allclose(actual, reference)
        elif how == "tensor":
            assert actual is None
        else:
            assert actual == reference
key_padding_mask = torch.randint(0, 2, (10, 10)) + query = torch.rand(10, 10, 10) + merged_mask0, mask_type0 = model.mha.merge_masks(attn_mask, key_padding_mask, query) + merged_mask1, mask_type1 = peft_model.base_model.mha.merge_masks(attn_mask, key_padding_mask, query) + + assert torch.allclose(merged_mask0, merged_mask1) + assert mask_type0 == mask_type1 + + @pytest.mark.parametrize("bias", ["none", "all", "lora_only", "invalid"]) + def test_lora_with_bias_argument(self, bias): + model = self.get_model() + config = LoraConfig(target_modules=["linear", "conv2d"], bias=bias) + + if bias == "invalid": + with pytest.raises(NotImplementedError): + get_peft_model(model, config) + return + + model = get_peft_model(model, config) # does not raise + for name, param in model.named_parameters(): + if not name.endswith("bias"): + continue + if bias == "none": + assert param.requires_grad is False + elif bias == "all": + assert param.requires_grad is True + elif bias == "lora_only": + # only layers targeted with target_modules + # the parentheses are required: `is` binds tighter than `or`, without them conv2d biases always pass + assert param.requires_grad is (("linear" in name) or ("conv2d" in name)) + + def test_lora_with_bias_extra_params(self): + # lora with lora_bias=True + model = self.get_model() + config = LoraConfig(target_modules=["linear", "conv2d"], lora_bias=False) + model_no_bias = get_peft_model(model, config) + + model = self.get_model() + config = LoraConfig(target_modules=["linear", "conv2d"], lora_bias=True) + model_bias = get_peft_model(model, config) + + # check that bias for LoRA B is set + assert model_no_bias.base_model.model.linear.lora_B["default"].bias is None + assert model_bias.base_model.model.linear.lora_B["default"].bias.shape == (1000,) + assert model_no_bias.base_model.model.conv2d.lora_B["default"].bias is None + assert model_bias.base_model.model.conv2d.lora_B["default"].bias.shape == (100,) + + # check that the same params are present except for the extra bias term + params_no_bias = {name for name, _ in model_no_bias.named_parameters()} + 
params_bias = {name for name, _ in model_bias.named_parameters()} + extra_params = { + "base_model.model.linear.lora_B.default.bias", + "base_model.model.conv2d.lora_B.default.bias", + } + assert params_bias - params_no_bias == extra_params + assert params_no_bias.issubset(params_bias) + + def test_lora_with_bias_embedding_raises(self): + # lora with lora_bias=True is not supported for embedding layers + model = self.get_model() + config = LoraConfig(target_modules=["embed"], lora_bias=True) + msg = "lora_bias=True is not supported for Embedding" + with pytest.raises(ValueError, match=msg): + get_peft_model(model, config) + + @pytest.mark.parametrize( + "extra_kwargs", + [ + {"use_dora": True}, + {"init_lora_weights": "eva"}, + {"init_lora_weights": "gaussian"}, + {"init_lora_weights": "loftq", "loftq_config": LoftQConfig()}, + {"init_lora_weights": "olora"}, + {"init_lora_weights": "pissa"}, + {"init_lora_weights": "pissa_niter_3"}, + {"init_lora_weights": "orthogonal"}, + ], + ) + def test_lora_with_bias_incompatible_arguments(self, extra_kwargs): + # some arguments don't work in conjunction with lora_bias and should raise + # just check the common chunk of the error message + msg = "The argument lora_bias=True is" + with pytest.raises(ValueError, match=msg): + LoraConfig(target_modules=["linear"], lora_bias=True, **extra_kwargs) + + def test_lora_linear_with_bias_when_base_layer_has_no_bias_warns(self): + model = self.get_model(bias=False) + config = LoraConfig(target_modules=["linear"], lora_bias=True) + msg = re.escape("`lora_bias=True` was passed but the targeted layer of type Linear has no bias") + with pytest.warns(PeftWarning, match=msg): + get_peft_model(model, config) + + def test_lora_conv2d_with_bias_when_base_layer_has_no_bias_warns(self): + model = self.get_model(bias=False) + config = LoraConfig(target_modules=["conv2d"], lora_bias=True) + msg = re.escape("`lora_bias=True` was passed but the targeted layer of type Conv2d has no bias") + with 
pytest.warns(PeftWarning, match=msg): + get_peft_model(model, config) + + def test_lora_incompatible_mamba_modules(self): + # Ensure LoRA raises an error when applying to forbidden modules + # ('out_proj', 'conv1d') in Mamba-based architectures like Falcon-Mamba tiny. + model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-mamba-tiny-dev") + + config = LoraConfig( + task_type="CAUSAL_LM", + target_modules=["out_proj", "conv1d"], # Forbidden modules for Mamba-based models + ) + msg = "is incompatible with Mamba-based models" + with pytest.raises(ValueError, match=msg): + get_peft_model(model, config) + + def get_model_conv2d_groups(self): + class ModelConv2DGroups(nn.Module): + """For testing when groups argument is used in conv layer""" + + def __init__(self): + super().__init__() + self.conv2d = nn.Conv2d(16, 32, 3, padding=1, groups=2) + self.relu = nn.ReLU() + self.flat = nn.Flatten() + self.lin0 = nn.Linear(12800, 2) + self.sm = nn.LogSoftmax(dim=-1) + self.dtype = torch.float + + def forward(self, X): + # This is ignoring input since main usage is for checking raising of error when peft is applied + X = torch.arange(9 * 16 * 20 * 20).view([9, 16, 20, 20]).to(self.conv2d.weight.device) + X = X.to(self.dtype) + X = self.conv2d(X) + X = self.relu(X) + X = self.flat(X) + X = self.lin0(X) + X = self.sm(X) + return X + + return ModelConv2DGroups().eval().to(self.torch_device) + + @pytest.mark.parametrize( + "config_cls, config_kwargs", + [ + pytest.param(LoraConfig, {"r": 8, "target_modules": ["conv2d"]}, id="lora with rank divisible by groups"), + pytest.param(LoraConfig, {"r": 2, "target_modules": ["conv2d"]}, id="lora with rank equal to groups"), + pytest.param( + LoraConfig, {"r": 1, "target_modules": ["conv2d"]}, id="lora with rank not divisible by groups" + ), + pytest.param( + LoraConfig, + {"r": 8, "target_modules": ["conv2d"], "use_dora": True}, + id="dora with rank divisible by groups", + ), + pytest.param( + LoraConfig, + {"r": 2, "target_modules": 
["conv2d"], "use_dora": True}, + id="dora with rank equal to groups", + ), + pytest.param( + LoraConfig, + {"r": 1, "target_modules": ["conv2d"], "use_dora": True}, + id="dora with rank not divisible by groups", + ), + ], + ) + def test_error_raised_if_rank_not_divisible_by_groups(self, config_cls, config_kwargs): + # This test checks if error is raised when rank is not divisible by groups for conv layer since + # currently, support is limited to conv layers where the rank is divisible by groups in lora and dora + base_model = self.get_model_conv2d_groups() + peft_config = config_cls(**config_kwargs) + r = config_kwargs["r"] + base_layer = base_model.conv2d + groups = base_layer.groups + if r % groups != 0: + with pytest.raises( + ValueError, + match=( + f"Targeting a {base_layer.__class__.__name__} with groups={base_layer.groups} and rank {r}. " + "Currently, support is limited to conv layers where the rank is divisible by groups. " + "Either choose a different rank or do not target this specific layer." + ), + ): + peft_model = get_peft_model(base_model, peft_config) + else: + # No error should be raised + peft_model = get_peft_model(base_model, peft_config) + + def test_target_module_and_target_parameter_on_same_layer(self): + # When targeting an nn.Parameter with LoRA using target_parameters, ensure that this is not already another LoRA + # layer (i.e. avoid double wrapping). + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(10, 10) + + base_model = MyModule() + config = LoraConfig(target_modules=["linear"], target_parameters=["linear.weight"]) + msg = "Trying to wrap an `nn.Parameter` of layer 'linear' of type Linear, which is not a valid target." 
+ with pytest.raises(ValueError, match=msg): + get_peft_model(base_model, config) + + @pytest.mark.parametrize("target_parameters", [["linear"], ["foobar"], ["foobar.weight"], ["foo", "bar"]]) + @pytest.mark.parametrize("target_modules", [None, [], ""]) + def test_valid_no_target_module_nor_target_parameter_match_raises(self, target_parameters, target_modules): + model = self.get_model() + config = LoraConfig(target_modules=target_modules, target_parameters=target_parameters) + msg = re.escape( + "No `target_modules` passed but also no `target_parameters` found. Please check the values for " + "these arguments." + ) + with pytest.raises(ValueError, match=msg): + get_peft_model(model, config) + + def test_target_parameters_wrong_type_raises(self): + # Check that target_parameters being a string raises a useful error message -- this is an easy mistake to make + # because strings are allowed for target_modules + model = self.get_model() + msg = "`target_parameters` must be a list of strings or None." 
+ with pytest.raises(TypeError, match=msg): + LoraConfig(target_parameters="linear.weight") + + def test_valid_target_parameters_invalid_target_modules_warns(self): + model = self.get_model() + config = LoraConfig(target_modules=["foobar"], target_parameters=["linear.weight"]) + msg = re.escape("target_modules={'foobar'} were set but no module was matched.") + with pytest.warns(RuntimeWarning, match=msg): + get_peft_model(model, config) + + def test_valid_target_modules_invalid_target_parameters_warns(self): + model = self.get_model() + config = LoraConfig(target_modules=["linear"], target_parameters=["foobar.weight"]) + msg = re.escape("target_parameters=['foobar.weight'] were set but no parameter was matched.") + with pytest.warns(RuntimeWarning, match=msg): + get_peft_model(model, config) + + def test_adding_multiple_adapters_with_target_parameters_raises(self): + model = self.get_model() + config = LoraConfig(target_modules=[], target_parameters=["linear.weight"]) + model = get_peft_model(model, config) + msg = re.escape("only one LoRA adapter per model with `target_parameters` is allowed") + with pytest.raises(ValueError, match=msg): + model.add_adapter(adapter_name="other", peft_config=config) + + def test_loading_loading_adapters_with_target_parameters_raises(self, tmp_path): + model = self.get_model() + config = LoraConfig(target_modules=[], target_parameters=["linear.weight"]) + model = get_peft_model(model, config) + model.save_pretrained(tmp_path) + + model = self.get_model() + model = PeftModel.from_pretrained(model, tmp_path) + msg = re.escape("only one LoRA adapter per model with `target_parameters` is allowed") + with pytest.raises(ValueError, match=msg): + model.load_adapter(tmp_path, adapter_name="other") + + def test_multiple_configs_with_bias_raises(self, tmp_path): + # There cannot be more than one config with bias != "none". 
+ # Note: This would need to be tested for all PEFT methods that support the bias parameter, but as this method + # comes from BaseTuner, it's fine to only check LoRA. + model = self.get_model() + config0 = LoraConfig(target_modules=["linear"], bias="all") + model = get_peft_model(model, config0) + + config1 = LoraConfig(target_modules=["linear"], bias="lora_only") + msg = "supports only 1 adapter with bias. When using multiple adapters" + with pytest.raises(ValueError, match=msg): + model.add_adapter("other", config1) + + # the invalid peft config was not added + assert len(model.peft_config) == 1 + + # it's okay to add a config with bias="none" (the default) + config2 = LoraConfig(target_modules=["linear"], bias="none") + model.add_adapter("other", config2) # does not raise + + +class TestLokrInitialization: + torch_device = infer_device() + + def get_model(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + # Choose a large weight so that averages are close to expected values. 
+ self.linear = nn.Linear(1000, 1000) + self.conv2d = nn.Conv2d(100, 100, 3) + + def forward(self, x): + x_4d = x.flatten().reshape(1, 100, 10, 10) + return self.linear(x), self.conv2d(x_4d) + + return MyModule().eval().to(self.torch_device) + + @pytest.fixture + def data(self): + return torch.rand(10, 1000).to(self.torch_device) + + @require_deterministic_for_xpu + def test_lokr_linear_init_default(self, data): + torch.manual_seed(0) + + model = self.get_model() + output_before = model(data)[0] + config = LoKrConfig(target_modules=["linear"]) + model = get_peft_model(model, config) + output_after = model(data)[0] + + assert torch.allclose(output_before, output_after) + + def test_lokr_linear_init_false(self, data): + torch.manual_seed(0) + + model = self.get_model() + output_before = model(data)[0] + config = LoKrConfig(target_modules=["linear"], init_weights=False) + model = get_peft_model(model, config) + output_after = model(data)[0] + + assert not torch.allclose(output_before, output_after) + + @require_deterministic_for_xpu + def test_lokr_linear_init_lycoris(self, data): + torch.manual_seed(0) + + model = self.get_model() + output_before = model(data)[0] + config = LoKrConfig(target_modules=["linear"], init_weights="lycoris") + model = get_peft_model(model, config) + output_after = model(data)[0] + + assert torch.allclose(output_before, output_after) + + def test_lokr_conv2d_init_default(self, data): + torch.manual_seed(0) + + model = self.get_model() + output_before = model(data)[1] + config = LoKrConfig(target_modules=["conv2d"]) + model = get_peft_model(model, config) + output_after = model(data)[1] + + assert torch.allclose(output_before, output_after) + + def test_lokr_conv2d_init_false(self, data): + torch.manual_seed(0) + + model = self.get_model() + output_before = model(data)[1] + config = LoKrConfig(target_modules=["conv2d"], init_weights=False) + model = get_peft_model(model, config) + output_after = model(data)[1] + + assert not 
torch.allclose(output_before, output_after) + + def test_lokr_conv2d_init_lycoris(self, data): + torch.manual_seed(0) + + model = self.get_model() + output_before = model(data)[1] + config = LoKrConfig(target_modules=["conv2d"], init_weights="lycoris") + model = get_peft_model(model, config) + output_after = model(data)[1] + + assert torch.allclose(output_before, output_after) + + +class TestAdaLoraInitialization: + torch_device = infer_device() + + def test_adalora_target_modules_set(self): + config = AdaLoraConfig(target_modules=["linear", "embed", "conv2d"], total_step=1) + assert config.target_modules == {"linear", "embed", "conv2d"} + + def test_adalora_use_dora_raises(self): + with pytest.raises(ValueError, match="ADALORA does not support DoRA"): + AdaLoraConfig(use_dora=True, total_step=1) + + def test_adalora_loftq_config_raises(self): + with pytest.raises(ValueError, match="ADALORA does not support LOFTQ"): + AdaLoraConfig(init_lora_weights="loftq", loftq_config={"loftq": "config"}, total_step=1) + + def get_model(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + # choose a large weight so that averages are close to expected values + self.linear = nn.Linear(1000, 1000) + + def forward(self, x): + return self.linear(x) + + return MyModule().eval().to(self.torch_device) + + @pytest.fixture + def data(self): + return torch.rand(10, 1000).to(self.torch_device) + + @require_deterministic_for_xpu + def test_adalora_default_init_identity(self, data): + # default is True + torch.manual_seed(0) + + model = self.get_model() + output_before = model(data) + config = AdaLoraConfig(target_modules=["linear"], total_step=1) + model = get_peft_model(model, config) + output_after = model(data) + assert torch.allclose(output_before, output_after) + + +class TestPromptTuningInitialization: + torch_device = infer_device() + + def get_model(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + # choose a large weight so 
that averages are close to expected values + self.linear = nn.Linear(1000, 1000) + self.embed = nn.Embedding(1000, 1000) + self.conv2d = nn.Conv2d(100, 100, 3) + + def forward(self, x): + x_int = (100 * x).int() + x_4d = x.flatten().reshape(1, 100, 10, 10) + return self.linear(x), self.embed(x_int), self.conv2d(x_4d) + + return MyModule().eval().to(self.torch_device) + + def test_use_prompt_tuning_init_text_raises(self): + with pytest.raises(ValueError, match="When prompt_tuning_init='TEXT', tokenizer_name_or_path can't be None"): + PromptTuningConfig(prompt_tuning_init="TEXT", prompt_tuning_init_text="prompt tuning init text") + with pytest.raises(ValueError, match="When prompt_tuning_init='TEXT', prompt_tuning_init_text can't be None"): + PromptTuningConfig(prompt_tuning_init="TEXT", tokenizer_name_or_path="t5-base") + + +class TestVeraInitialization: + torch_device = infer_device() + + def get_model(self): + class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.lin1 = nn.Linear(20, 2, bias=bias) + + def forward(self, X): + X = self.lin0(X) + X = self.lin1(X) + return X + + return MLP().to(self.torch_device) + + def test_vera_mixing_save_projection_raises(self): + # it is unclear what the right thing to do would be if some adapters save the projection weights and some don't + # so we better raise an error + + config0 = VeraConfig(target_modules=["lin0"], init_weights=False, save_projection=True) + model = self.get_model() + model = get_peft_model(model, config0) + config1 = VeraConfig(target_modules=["lin0"], init_weights=False, save_projection=False) + msg = re.escape( + "VeRA projection weights must be saved for all adapters or none, but got multiple different values: " + "[False, True]" + ) + with pytest.raises(ValueError, match=msg): + model.add_adapter("other", config1) + + def test_vera_add_second_adapter_with_incompatible_input_shape(self): + config0 = 
VeraConfig(target_modules=["lin0"], r=8) + config1 = VeraConfig(target_modules=["lin1"]) + + base_model = self.get_model() + lin0_in_feat = base_model.lin0.in_features + lin1_in_feat = base_model.lin1.in_features + model = get_peft_model(base_model, config0) + # not full message but enough to identify the error + msg = f"vera_A has a size of {lin0_in_feat} but {lin1_in_feat} or greater is required" + with pytest.raises(ValueError, match=msg): + model.add_adapter("other", config1) + + def test_vera_add_second_adapter_with_higher_rank(self): + rank0 = 123 + rank1 = 456 + config0 = VeraConfig(target_modules=["lin0"], r=rank0) + # second adapter has higher rank + config1 = VeraConfig(target_modules=["lin0"], r=rank1) + + model = get_peft_model(self.get_model(), config0) + # not full message but enough to identify the error + msg = f"vera_A has a size of {rank0} but {rank1} or greater is required" + with pytest.raises(ValueError, match=msg): + model.add_adapter("other", config1) + + +class TestVBLoraInitialization: + torch_device = infer_device() + + def get_model(self): + class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.lin0 = nn.Linear(10, 30, bias=bias) + self.lin1 = nn.Linear(30, 2, bias=bias) + + def forward(self, X): + X = self.lin0(X) + X = self.lin1(X) + return X + + return MLP().to(self.torch_device) + + def test_vblora_with_incompatible_vector_length_with_in_features(self): + vector_length = 3 + model = self.get_model() + config = VBLoRAConfig(target_modules=["lin0"], vector_length=vector_length) + msg = f"`in_features` {model.lin0.in_features} must be divisible by `vector_length` {vector_length}" + with pytest.raises(ValueError, match=msg): + get_peft_model(model, config) + + def test_vblora_with_incompatible_vector_length_with_out_features(self): + vector_length = 3 + model = self.get_model() + config = VBLoRAConfig(target_modules=["lin1"], vector_length=vector_length) + msg = f"`out_features` {model.lin1.out_features} must 
be divisible by `vector_length` {vector_length}" + with pytest.raises(ValueError, match=msg): + get_peft_model(model, config) + + +class TestC3AInitialization: + torch_device = infer_device() + + def get_model(self): + class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.lin0 = nn.Linear(10, 30, bias=bias) + self.lin1 = nn.Linear(30, 2, bias=bias) + + def forward(self, X): + X = self.lin0(X) + X = self.lin1(X) + return X + + return MLP().to(self.torch_device) + + def test_c3a_with_incompatible_block_size_with_in_features(self): + block_size = 3 + model = self.get_model() + config = C3AConfig(target_modules=["lin0"], block_size=block_size) + msg = f"The block size should be a factor of the input size. However, the input size is {model.lin0.in_features} and the block size is {block_size}" + with pytest.raises(ValueError, match=msg): + get_peft_model(model, config) + + def test_c3a_with_incompatible_block_size_with_out_features(self): + block_size = 3 + model = self.get_model() + config = C3AConfig(target_modules=["lin1"], block_size=block_size) + msg = f"The block size should be a factor of the output size. However, the output size is {model.lin1.out_features} and the block size is {block_size}" + with pytest.raises(ValueError, match=msg): + get_peft_model(model, config) + + +class TestWaveFTInitialization: + """Test class to check the initialization of WaveFT adapters.""" + + torch_device = infer_device() + + def get_model(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + # Choose a large weight so that averages are close to expected values. 
+ self.linear = nn.Linear(1000, 1000) + self.conv2d = nn.Conv2d(100, 100, 3) + + def forward(self, x): + x_4d = x.flatten().reshape(1, 100, 10, 10) + return self.linear(x), self.conv2d(x_4d) + + return MyModule().eval().to(self.torch_device) + + @pytest.fixture + def data(self): + return torch.rand(10, 1000).to(self.torch_device) + + @require_deterministic_for_xpu + def test_waveft_linear_init_default(self, data): + # Default initialization should result in no change to output (zeros initialization) + torch.manual_seed(0) + + model = self.get_model() + output_before = model(data)[0] + config = WaveFTConfig(target_modules=["linear"], n_frequency=100, init_weights=True) + model = get_peft_model(model, config) + output_after = model(data)[0] + + assert torch.allclose(output_before, output_after, atol=1e-6) + + def test_waveft_linear_init_false(self, data): + # With init_weights=False, output should change (random initialization) + torch.manual_seed(0) + + model = self.get_model() + output_before = model(data)[0] + config = WaveFTConfig(target_modules=["linear"], n_frequency=100, init_weights=False) + model = get_peft_model(model, config) + output_after = model(data)[0] + + assert not torch.allclose(output_before, output_after, atol=1e-6) + + @require_deterministic_for_xpu + def test_waveft_linear_with_scaling(self, data): + # Test that scaling parameter affects output correctly + torch.manual_seed(0) + + model = self.get_model() + output_before = model(data)[0] + config = WaveFTConfig(target_modules=["linear"], n_frequency=100, init_weights=False, scaling=10.0) + model = get_peft_model(model, config) + output_after = model(data)[0] + + assert not torch.allclose(output_before, output_after, atol=1e-6) + + @require_deterministic_for_xpu + def test_waveft_different_wavelet_families(self, data): + # Test different wavelet families + torch.manual_seed(0) + + model1 = self.get_model() + config1 = WaveFTConfig(target_modules=["linear"], n_frequency=100, wavelet_family="db1", 
init_weights=False) + model1 = get_peft_model(model1, config1) + output1 = model1(data)[0] + + torch.manual_seed(0) + model2 = self.get_model() + config2 = WaveFTConfig(target_modules=["linear"], n_frequency=100, wavelet_family="sym2", init_weights=False) + model2 = get_peft_model(model2, config2) + output2 = model2(data)[0] + + # Different wavelet families should produce different results + assert not torch.allclose(output1, output2, atol=1e-6) + + @require_deterministic_for_xpu + def test_waveft_use_idwt_flag(self, data): + # Test use_idwt flag + torch.manual_seed(0) + + model1 = self.get_model() + config1 = WaveFTConfig(target_modules=["linear"], n_frequency=100, use_idwt=True, init_weights=False) + model1 = get_peft_model(model1, config1) + output1 = model1(data)[0] + + torch.manual_seed(0) + model2 = self.get_model() + config2 = WaveFTConfig(target_modules=["linear"], n_frequency=100, use_idwt=False, init_weights=False) + model2 = get_peft_model(model2, config2) + output2 = model2(data)[0] + + # Different use_idwt settings should produce different results + assert not torch.allclose(output1, output2, atol=1e-6) + + def test_waveft_non_positive_n_frequency_raises(self): + # Test that n_frequency <= 0 raises appropriate error + model = self.get_model() + + # Test with n_frequency = 0 + n_frequency = 0 + msg = f"`n_frequency` should be a positive integer value but the value passed is {n_frequency}" + with pytest.raises(ValueError, match=re.escape(msg)): + config = WaveFTConfig(target_modules=["linear"], n_frequency=n_frequency) + get_peft_model(model, config) + + # Test with negative n_frequency + n_frequency = -1 + msg = f"`n_frequency` should be a positive integer value but the value passed is {n_frequency}" + with pytest.raises(ValueError, match=re.escape(msg)): + config = WaveFTConfig(target_modules=["linear"], n_frequency=n_frequency) + get_peft_model(model, config) + + def test_waveft_excessive_n_frequency_raises(self): + # Test that n_frequency > 
in_features * out_features raises appropriate error + model = self.get_model() + + # The model has a linear layer with 1000 in_features and 1000 out_features + # So the maximum n_frequency should be 1000 * 1000 = 1,000,000 + max_allowed = 1000 * 1000 + n_frequency = max_allowed + 1 + msg = ( + f"`n_frequency` should be less than or equal to the product of the input and output dimensions " + f"but the value passed is {n_frequency} and the product is {max_allowed}" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + config = WaveFTConfig(target_modules=["linear"], n_frequency=n_frequency) + get_peft_model(model, config) + + def test_waveft_n_frequency_pattern(self, data): + # Test n_frequency_pattern functionality + torch.manual_seed(0) + + model = self.get_model() + config = WaveFTConfig( + target_modules=["linear"], n_frequency=50, n_frequency_pattern={"linear": 100}, init_weights=True + ) + model = get_peft_model(model, config) + + # Check that the pattern was applied + waveft_layer = model.base_model.model.linear + assert hasattr(waveft_layer, "waveft_n_frequency") + assert waveft_layer.waveft_n_frequency["default"] == 100 + + def test_waveft_layers_pattern_without_layers_to_transform_raises(self): + # Test that when layers_pattern is specified, layers_to_transform must also be specified + msg = "When `layers_pattern` is specified, `layers_to_transform` must also be specified." + with pytest.raises(ValueError, match=re.escape(msg)): + WaveFTConfig(target_modules=["linear"], layers_pattern=["layers"], layers_to_transform=None) + + def test_waveft_invalid_wavelet_family_raises(self): + # Test that invalid wavelet families raise appropriate errors + invalid_family = "invalid_wavelet" + msg = f"Wavelet family {invalid_family} not supported. 
Supported wavelet families are:" + with pytest.raises(ValueError, match=re.escape(msg)): + WaveFTConfig(target_modules=["linear"], wavelet_family=invalid_family) + + +class TestRoadInitialization: + torch_device = infer_device() + + def get_model(self): + class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.lin0 = nn.Linear(10, 30, bias=bias) + self.lin1 = nn.Linear(30, 2, bias=bias) + + def forward(self, X): + X = self.lin0(X) + X = self.lin1(X) + return X + + return MLP().to(self.torch_device) + + def get_conv2d_model(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + # choose a large weight so that averages are close to expected values + self.linear = nn.Linear(1000, 1000) + self.embed = nn.Embedding(1000, 1000) + self.conv2d = nn.Conv2d(100, 100, 3) + + def forward(self, x): + x_int = (100 * x).int() + x_4d = x.flatten().reshape(1, 100, 10, 10) + return self.linear(x), self.embed(x_int), self.conv2d(x_4d) + + return MyModule().eval().to(self.torch_device) + + def test_road_default_initialization(self): + torch.manual_seed(0) + model = self.get_model() + config = RoadConfig(target_modules=["lin0"], group_size=2) + model = get_peft_model(model, config) + weight_alpha = model.lin0.road_alpha["default"].data + weight_theta = model.lin0.road_theta["default"].data + # the comparisons must be asserted, otherwise the test passes no matter how the weights are initialized + # default init: alpha is expected to be all ones and theta all zeros + assert torch.allclose(weight_alpha, torch.ones_like(weight_alpha)) + assert torch.allclose(weight_theta, torch.zeros_like(weight_theta)) + + def test_road_with_odd_group_size(self): + group_size = 3 # odd values are not allowed + msg = f"The group_size must be divisible by 2 when using RoadLayer, but got {group_size}." + with pytest.raises(ValueError, match=re.escape(msg)): + RoadConfig(group_size=group_size) + + def test_road_with_too_large_group_size(self): + group_size = 64 # larger than out_features + msg = ( + f"The out_features of the base layer must be divisible by group_size ({group_size}) when using RoadLayer." 
+ ) + model = self.get_model() + config = RoadConfig(target_modules=["lin0"], group_size=group_size) + with pytest.raises(ValueError, match=re.escape(msg)): + get_peft_model(model, config) + + def test_road_with_incompatible_group_size_with_out_features(self): + group_size = 4 # even, but 30 does not divide by 4 + model = self.get_model() + config = RoadConfig(target_modules=["lin0"], group_size=group_size) + msg = ( + f"The out_features of the base layer must be divisible by group_size ({group_size}) when using RoadLayer." + ) + with pytest.raises(ValueError, match=re.escape(msg)): + get_peft_model(model, config) + + def test_road_with_conv2d_layer(self): + model = self.get_conv2d_model() + config = RoadConfig(target_modules=["conv2d"], group_size=2) + msg = "Target module Conv2d(100, 100, kernel_size=(3, 3), stride=(1, 1)) is not supported. Currently, only the following modules are supported: `torch.nn.Linear`." + with pytest.raises(ValueError, match=re.escape(msg)): + get_peft_model(model, config) + + +class TestNoInfiniteRecursionDeepspeed: + # see #1892 for details + classes = [ + PeftModel, + PeftMixedModel, + PeftModelForSequenceClassification, + PeftModelForQuestionAnswering, + PeftModelForTokenClassification, + PeftModelForCausalLM, + PeftModelForSeq2SeqLM, + PeftModelForFeatureExtraction, + ] + + @pytest.fixture + def wrap_init(self): + # emulates the wrapper from DeepSpeed + import functools + + def decorator(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + hasattr(self, "abc") # any hasattr will do + f(self, *args, **kwargs) + + return wrapper + + return decorator + + @pytest.fixture + def model(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(10, 10) + # to emulate LMs: + self.prepare_inputs_for_generation = None + self._prepare_encoder_decoder_kwargs_for_generation = None + + return MyModule() + + @pytest.mark.parametrize("cls", classes) + def test_no_infinite_recursion(self, 
cls, model, wrap_init): + original_init = cls.__init__ + try: + cls.__init__ = wrap_init(cls.__init__) + # this would trigger an infinite loop before the fix in 1892 + cls(model, LoraConfig(target_modules=["linear"])) + finally: + # ensure there are no side effects of this test + cls.__init__ = original_init + + +class TestLoadAdapterOfflineMode: + base_model = "hf-internal-testing/tiny-random-OPTForCausalLM" + peft_model_id = "peft-internal-testing/tiny-OPTForCausalLM-lora" + + # make sure that PEFT honors offline mode + @contextmanager + def hub_offline_ctx(self): + # this is required to simulate offline mode, setting the env var dynamically inside the test does not work + # because the value is checked only once at the start of the session + + if reset_sessions is None: + # this means we're using huggingface_hub >= 1.0.0, there is no need to call reset_sessions() anymore + with patch("huggingface_hub.constants.HF_HUB_OFFLINE", True): + yield + else: + # in huggingface_hub < 1.0.0, it's necessary to reset the session + # TODO: remove once huggingface_hub < 1.0.0 is no longer supported + try: + with patch("huggingface_hub.constants.HF_HUB_OFFLINE", True): + reset_sessions() + yield + finally: + # always reset the sessions once the patch is undone, even if the code inside the context raised, + # so that a failing test cannot leave later tests in offline mode + reset_sessions() + + def test_load_from_hub_then_offline_model(self): + # this uses LoRA but it's the same mechanism for other methods + base_model = AutoModelForCausalLM.from_pretrained(self.base_model) + + # first ensure that the adapter model has been downloaded + PeftModel.from_pretrained(base_model, self.peft_model_id) + + del base_model + + base_model = AutoModelForCausalLM.from_pretrained(self.base_model) + with self.hub_offline_ctx(): + # does not raise + PeftModel.from_pretrained(base_model, self.peft_model_id) + + @pytest.fixture + def changed_default_cache_dir(self, tmp_path, monkeypatch): + # ensure that this test does not interact with other tests that may use the HF cache + monkeypatch.setattr("huggingface_hub.constants.HF_HOME", tmp_path) + 
monkeypatch.setattr("huggingface_hub.constants.HF_HUB_CACHE", tmp_path / "hub") + monkeypatch.setattr("huggingface_hub.constants.HF_TOKEN_PATH", tmp_path / "token") + + def load_checkpoints(self, cache_dir): + # download model and lora checkpoint to a specific cache dir + snapshot_download(self.base_model, cache_dir=cache_dir) + snapshot_download(self.peft_model_id, cache_dir=cache_dir) + + def test_load_checkpoint_offline_non_default_cache_dir(self, changed_default_cache_dir, tmp_path): + # See #2373 for context + self.load_checkpoints(tmp_path) + with self.hub_offline_ctx(): + base_model = AutoModelForCausalLM.from_pretrained(self.base_model, cache_dir=tmp_path) + PeftModel.from_pretrained(base_model, self.peft_model_id, cache_dir=tmp_path) + + +class TestCustomModelConfigWarning: + # Check potential warnings when the user provided base_model_name_or_path is overridden by PEFT. See #2001 for + # context. We use LoRA for this test but the same applies to other methods + @pytest.fixture + def custom_module(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.lin = nn.Linear(10, 10) + + return MyModule() + + def test_no_warning_by_default_transformers_model(self, recwarn): + # first a sanity test that there is no warning by default when using a model from transformers + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-OPTForCausalLM") + get_peft_model(model, LoraConfig()) + for warning in recwarn.list: + assert "renamed" not in str(warning.message) + + def test_no_warning_by_default_custom_model(self, custom_module, recwarn): + # same as above but with a custom model + get_peft_model(custom_module, LoraConfig(target_modules=["lin"])) + for warning in recwarn.list: + assert "renamed" not in str(warning.message) + + def test_warning_name_transformers_model(self, recwarn): + # The base_model_name_or_path provided by the user is overridden. 
class TestLowCpuMemUsage:
    """Test for the low CPU memory usage option for loading PEFT models.

    Note that we have `test_load_model_low_cpu_mem_usage` in the custom model and stable diffusion tests. Those are
    broad tests (i.e. testing all the supported PEFT methods) but not very deep (only testing if loading works and the
    device is correctly set). The test class here goes deeper but only tests LoRA, as checking all PEFT methods would
    be too much.

    """

    # CPU is always tested; the device reported by infer_device() is added when it is not the CPU
    _device = infer_device()
    devices = ["cpu"] if _device == "cpu" else ["cpu", _device]

    model_id = "hf-internal-testing/tiny-random-OPTForCausalLM"

    @staticmethod
    def _device_types(module):
        # device type strings (e.g. "cpu", "meta") collected over all parameters of the module
        return {param.device.type for param in module.parameters()}

    @staticmethod
    def _assert_equivalent(reference, low_cpu_mem):
        # both arguments are (device type set, logits) pairs; the two loading modes have to agree on each part
        ref_devices, ref_logits = reference
        low_devices, low_logits = low_cpu_mem
        assert low_devices == ref_devices
        assert torch.allclose(low_logits, ref_logits, atol=1e-6, rtol=1e-6)

    def get_model(self):
        """Return a freshly loaded copy of the tiny OPT base model."""
        return AutoModelForCausalLM.from_pretrained(self.model_id)

    @pytest.fixture(scope="class")
    def lora_config(self):
        """LoRA config targeting all linear layers, created with ``init_lora_weights=False``."""
        return LoraConfig(init_lora_weights=False, target_modules="all-linear")

    @pytest.fixture(scope="class")
    def lora_path(self, tmp_path_factory, lora_config):
        """Save a LoRA adapter (seeded with 0) into a temporary directory and return that directory."""
        torch.manual_seed(0)
        target_dir = tmp_path_factory.mktemp("lora")
        peft_model = get_peft_model(self.get_model(), lora_config)
        peft_model.save_pretrained(target_dir)
        return target_dir

    @pytest.fixture(scope="class")
    def inputs(self):
        """Random token ids plus an all-ones attention mask, both for a batch of 1 and a sequence of 10."""
        return {"input_ids": torch.randint(0, 100, (1, 10)), "attention_mask": torch.ones(1, 10)}

    @pytest.mark.parametrize("device", devices)
    def test_from_pretrained_low_cpu_mem_usage_works(self, device, inputs, lora_path):
        """``PeftModel.from_pretrained`` yields the same devices and logits with and without the option."""
        batch = {key: tensor.to(device) for key, tensor in inputs.items()}
        outcomes = []
        # first pass: default loading, second pass: low_cpu_mem_usage=True
        for load_kwargs in ({}, {"low_cpu_mem_usage": True}):
            base = self.get_model().to(device)
            peft_model = PeftModel.from_pretrained(base, lora_path, torch_device=device, **load_kwargs).eval()
            outcomes.append((self._device_types(peft_model), peft_model(**batch).logits))
            # release the model before the next one is created
            del base, peft_model

        self._assert_equivalent(*outcomes)

    @pytest.mark.parametrize("device", devices)
    def test_load_adapter_low_cpu_mem_usage_works(self, device, inputs, lora_path, lora_config):
        """``load_adapter`` yields the same devices and logits with and without the option."""
        batch = {key: tensor.to(device) for key, tensor in inputs.items()}
        outcomes = []
        for load_kwargs in ({}, {"low_cpu_mem_usage": True}):
            base = self.get_model().to(device)
            # same seed for both passes so that the "default" adapter is initialized identically
            torch.manual_seed(0)
            peft_model = get_peft_model(base, lora_config)
            peft_model.load_adapter(lora_path, adapter_name="other", torch_device=device, **load_kwargs)
            peft_model.set_adapter("other")
            peft_model.eval()
            outcomes.append((self._device_types(peft_model), peft_model(**batch).logits))
            del base, peft_model

        self._assert_equivalent(*outcomes)

    @pytest.mark.parametrize("device", devices)
    def test_get_peft_model_low_cpu_mem_usage_works(self, device, inputs):
        """With the option set, ``get_peft_model`` leaves the LoRA weights on the meta device."""
        # when calling get_peft_model, the PEFT weights will not be initialized on device but remain on meta
        base = self.get_model().to(device)
        peft_model = get_peft_model(base, LoraConfig(target_modules="all-linear"), low_cpu_mem_usage=True)

        lora_devices = {param.device for name, param in peft_model.named_parameters() if "lora_" in name}
        assert lora_devices == {torch.device("meta")}

    @pytest.mark.parametrize("device", devices)
    def test_get_peft_model_with_task_type_low_cpu_mem_usage_works(self, device, inputs):
        """Like the test without task type, but the config additionally sets ``task_type="CAUSAL_LM"``."""
        base = self.get_model().to(device)
        task_config = LoraConfig(target_modules="all-linear", task_type="CAUSAL_LM")
        peft_model = get_peft_model(base, task_config, low_cpu_mem_usage=True)

        lora_devices = {param.device for name, param in peft_model.named_parameters() if "lora_" in name}
        assert lora_devices == {torch.device("meta")}

    @pytest.mark.parametrize("device", devices)
    def test_inject_adapter_low_cpu_mem_usage_works(self, device, inputs, lora_path, lora_config):
        """``inject_adapter_in_model`` + ``set_peft_model_state_dict`` match the regular ``load_adapter`` path."""
        # external libs like transformers and diffusers use inject_adapter_in_model, let's check that this also works
        reference_model = self.get_model().to(device)
        batch = {key: tensor.to(device) for key, tensor in inputs.items()}

        torch.manual_seed(0)
        reference_model = get_peft_model(reference_model, lora_config)
        reference_model.load_adapter(lora_path, adapter_name="other", torch_device=device)
        reference_model.set_adapter("other")
        reference_model.eval()
        reference = (self._device_types(reference_model), reference_model(**batch).logits)

        del reference_model

        torch.manual_seed(0)
        injected_model = self.get_model().to(device)
        inject_adapter_in_model(lora_config, injected_model, low_cpu_mem_usage=True)
        # at this stage, lora weights are still on meta device
        assert self._device_types(injected_model) == {"meta", device}

        # the saved keys start with the PeftModel wrapper prefix, which the bare injected model does not have
        prefix = "base_model.model."
        saved_state = load_file(lora_path / "adapter_model.safetensors")
        remapped_state = {key[len(prefix) :]: tensor.to(device) for key, tensor in saved_state.items()}
        load_result = set_peft_model_state_dict(injected_model, remapped_state, low_cpu_mem_usage=True)
        # sanity check: no unexpected keys
        assert not load_result.unexpected_keys

        injected_model.eval()
        low_cpu_mem = (self._device_types(injected_model), injected_model(**batch).logits)

        self._assert_equivalent(reference, low_cpu_mem)

    ############################
    # tests for PeftMixedModel #
    ############################

    @pytest.mark.parametrize("device", devices)
    def test_mixed_model_from_pretrained_low_cpu_mem_usage_works(self, device, inputs, lora_path):
        """``PeftMixedModel.from_pretrained`` yields the same devices and logits with and without the option."""
        batch = {key: tensor.to(device) for key, tensor in inputs.items()}
        outcomes = []
        for load_kwargs in ({}, {"low_cpu_mem_usage": True}):
            base = self.get_model().to(device)
            mixed_model = PeftMixedModel.from_pretrained(base, lora_path, torch_device=device, **load_kwargs).eval()
            outcomes.append((self._device_types(mixed_model), mixed_model(**batch).logits))
            del base, mixed_model

        self._assert_equivalent(*outcomes)

    @pytest.mark.parametrize("device", devices)
    def test_mixed_model_load_adapter_low_cpu_mem_usage_works(self, device, inputs, lora_path, lora_config):
        """Load a second adapter on top of a pretrained PEFT model, with and without the option."""
        # NOTE(review): despite the test name and the section header, the model is built with
        # PeftModel.from_pretrained, not PeftMixedModel.from_pretrained - confirm whether this is intended
        batch = {key: tensor.to(device) for key, tensor in inputs.items()}
        outcomes = []
        for load_kwargs in ({}, {"low_cpu_mem_usage": True}):
            base = self.get_model().to(device)
            torch.manual_seed(0)
            peft_model = PeftModel.from_pretrained(base, lora_path)
            peft_model.load_adapter(lora_path, adapter_name="other", torch_device=device, **load_kwargs)
            peft_model.set_adapter("other")
            peft_model.eval()
            outcomes.append((self._device_types(peft_model), peft_model(**batch).logits))
            del base, peft_model

        self._assert_equivalent(*outcomes)
class TestNamingConflictWarning:
    """
    Tests for warnings related to naming conflicts between adapter names and tuner prefixes. References: Issue 2252
    """

    @pytest.fixture(autouse=True)
    def setup(self):
        """Create a default LoRA config, look up its tuner prefix and load the tiny OPT base model."""
        self.peft_config = LoraConfig()
        self.prefix = PEFT_TYPE_TO_PREFIX_MAPPING[self.peft_config.peft_type]
        self.base_model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-OPTForCausalLM")

    def _conflict_message(self, adapter_name):
        # text of the naming conflict warning for the given adapter name
        return f"Adapter name '{adapter_name}' should not be contained in the prefix '{self.prefix}'."

    @staticmethod
    def _was_warned(recwarn, text):
        # True if any recorded warning contains the given text
        return any(text in str(record.message) for record in recwarn.list)

    def _save_and_reload_model(self, model, adapter_name, tmp_path):
        """Save only ``adapter_name`` below ``tmp_path`` and load a PEFT model back from that sub-directory."""
        model.save_pretrained(tmp_path, selected_adapters=[adapter_name])
        del model
        adapter_dir = tmp_path / adapter_name
        reloaded_base_model = AutoModelForCausalLM.from_pretrained(adapter_dir)
        return PeftModel.from_pretrained(reloaded_base_model, adapter_dir)

    def test_no_warning_without_naming_conflict_get_peft_model(self, recwarn):
        """get_peft_model with a non-conflicting adapter name must not warn."""
        safe_name = "adapter"
        get_peft_model(self.base_model, self.peft_config, adapter_name=safe_name)
        assert not self._was_warned(recwarn, self._conflict_message(safe_name))

    def test_no_warning_without_naming_conflict_add_adapter(self, recwarn):
        """Adding a second, non-conflicting adapter must not warn."""
        first_name = "adapter"
        second_name = "other_adapter"
        peft_model = get_peft_model(self.base_model, self.peft_config, adapter_name=first_name)
        peft_model.add_adapter(second_name, self.peft_config)
        assert not self._was_warned(recwarn, self._conflict_message(second_name))

    def test_no_warning_without_naming_conflict_save_and_load(self, recwarn, tmp_path):
        """Saving and reloading a model with a non-conflicting adapter name must not warn."""
        safe_name = "adapter"
        peft_model = get_peft_model(self.base_model, self.peft_config, adapter_name=safe_name)
        self._save_and_reload_model(peft_model, safe_name, tmp_path)
        assert not self._was_warned(recwarn, self._conflict_message(safe_name))

    def test_warning_naming_conflict_get_peft_model(self, recwarn):
        """get_peft_model must warn when the adapter name is contained in the tuner prefix."""
        # the prefix without its last character is a substring of the prefix
        clashing_name = self.prefix[:-1]
        get_peft_model(self.base_model, self.peft_config, adapter_name=clashing_name)
        assert self._was_warned(recwarn, self._conflict_message(clashing_name))

    def test_warning_naming_conflict_add_adapter(self, recwarn):
        """add_adapter must warn when the new adapter name is contained in the tuner prefix."""
        # here the first character is dropped instead of the last one
        clashing_name = self.prefix[1:]
        safe_name = "adapter"
        peft_model = get_peft_model(self.base_model, self.peft_config, adapter_name=safe_name)
        peft_model.add_adapter(clashing_name, self.peft_config)
        assert self._was_warned(recwarn, self._conflict_message(clashing_name))

    def test_warning_naming_conflict_save_and_load(self, recwarn, tmp_path):
        """The warning must be recorded when a model with a conflicting adapter name is saved and reloaded."""
        clashing_name = self.prefix[:-1]
        peft_model = get_peft_model(self.base_model, self.peft_config, adapter_name=clashing_name)
        self._save_and_reload_model(peft_model, clashing_name, tmp_path)
        assert self._was_warned(recwarn, self._conflict_message(clashing_name))
@pytest.mark.parametrize("corda_method", ("ipm", "kpm"))
def test_lora_corda_linear_init_default(self, data, tmp_path, corda_method):
    """CorDA init on a linear layer: identity at start, and the same result when re-run from either cache file."""
    tol = 1e-06
    svd_cache = tmp_path / "corda_cache.pt"
    covariance_cache = tmp_path / "covariance_cache.pt"

    original_model = self.get_model()
    model = deepcopy(original_model)
    output_base = model(data)[0]

    # the first preprocessing run is given both cache locations and actually runs the model on the data
    config = LoraConfig(
        init_lora_weights="corda",
        target_modules=["linear"],
        corda_config=CordaConfig(
            cache_file=svd_cache,
            covariance_file=covariance_cache,
            corda_method=corda_method,
        ),
    )
    preprocess_corda(model, config, run_model=lambda: model(data), hooked_model=model)
    peft_model = get_peft_model(model, config)

    # check that the freshly initialized adapter performs an identity transformation
    assert torch.allclose(output_base, peft_model(data)[0], atol=tol)

    # modify the weights, or else the adapter performs an identity transformation
    peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0
    output_corda = peft_model(data)[0]

    # sanity check: after the modification the output has to differ from the base output
    assert not torch.allclose(output_base, output_corda, atol=tol, rtol=tol)

    # repeat on a fresh copy without run_model, first with only the SVD cache file and then with only the
    # covariance file configured; in both cases the output should be the same as before
    for cache_kwargs in ({"cache_file": svd_cache}, {"covariance_file": covariance_cache}):
        model = deepcopy(original_model)
        config = LoraConfig(
            init_lora_weights="corda",
            target_modules=["linear"],
            corda_config=CordaConfig(corda_method=corda_method, **cache_kwargs),
        )
        preprocess_corda(model, config)
        peft_model = get_peft_model(model, config)
        peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0
        assert torch.allclose(output_corda, peft_model(data)[0], atol=tol)
"covariance_cache.pt", corda_method=corda_method), + ) + preprocess_corda(model, config) + peft_model = get_peft_model(model, config) + peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0 + assert torch.allclose(output_corda, peft_model(data)[0], atol=1e-06) + + @pytest.mark.parametrize("corda_method", ("ipm", "kpm")) + def test_lora_corda_linear_init_default_with_rank_pattern(self, data, tmp_path, corda_method): + original_model = self.get_model() + model = deepcopy(original_model) + output_base = model(data)[0] + + corda_config = CordaConfig( + cache_file=tmp_path / "corda_cache.pt", + covariance_file=tmp_path / "covariance_cache.pt", + corda_method=corda_method, + ) + config = LoraConfig( + rank_pattern={"linear": 8, "embed": 16, "conv2d": 32}, + init_lora_weights="corda", + target_modules=["linear"], + corda_config=corda_config, + ) + preprocess_corda( + model, + config, + run_model=lambda: model(data), + ) + peft_model = get_peft_model(model, config) + + # check if adapter performs an identity transformantion + assert torch.allclose(output_base, peft_model(data)[0], atol=1e-06) + + # modify the weights, or else the adapter performs an identity transformation + peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0 + output_corda = peft_model(data)[0] + + # sanity check + tol = 1e-06 + assert not torch.allclose(output_base, output_corda, atol=tol, rtol=tol) + + # if load SVD result from cache, the output should be the same + model = deepcopy(original_model) + config = LoraConfig( + rank_pattern={"linear": 8, "embed": 16, "conv2d": 32}, + init_lora_weights="corda", + target_modules=["linear"], + corda_config=CordaConfig(cache_file=tmp_path / "corda_cache.pt", corda_method=corda_method), + ) + preprocess_corda(model, config) + peft_model = get_peft_model(model, config) + peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0 + assert torch.allclose(output_corda, peft_model(data)[0], atol=1e-06) + + # if load covariance from 
cache, the output should be the same + model = deepcopy(original_model) + config = LoraConfig( + rank_pattern={"linear": 8, "embed": 16, "conv2d": 32}, + init_lora_weights="corda", + target_modules=["linear"], + corda_config=CordaConfig(covariance_file=tmp_path / "covariance_cache.pt", corda_method=corda_method), + ) + preprocess_corda(model, config) + peft_model = get_peft_model(model, config) + peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0 + assert torch.allclose(output_corda, peft_model(data)[0], atol=1e-06) + + @pytest.mark.parametrize("corda_method", ("ipm", "kpm")) + def test_lora_corda_conversion_same_output_after_loading(self, data, tmp_path, corda_method): + model = self.get_model() + output_base = model(data)[0] + + corda_config = CordaConfig(corda_method=corda_method) + config = LoraConfig(init_lora_weights="corda", target_modules=["linear"], r=8, corda_config=corda_config) + preprocess_corda(model, config, run_model=lambda: model(data), hooked_model=model) + peft_model = get_peft_model(deepcopy(model), config) + # save the initial model + peft_model.peft_config["default"].init_lora_weights = True + peft_model.save_pretrained(tmp_path / "init-model") + peft_model.peft_config["default"].init_lora_weights = "corda" + + # modify the weights, or else the adapter performs an identity transformation + peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0 + output_corda = peft_model(data)[0] + + # sanity check + tol = 1e-06 + assert not torch.allclose(output_base, output_corda, atol=tol, rtol=tol) + + # save the model normally + peft_model.save_pretrained(tmp_path / "corda-model") + model_loaded = PeftModel.from_pretrained(deepcopy(model), tmp_path / "corda-model") + output_loaded = model_loaded(data)[0] + + assert torch.allclose(output_corda, output_loaded, atol=tol, rtol=tol) + # sanity check: ranks should still be 8 as initially + assert model_loaded.peft_config["default"].r == 8 + assert 
@pytest.mark.parametrize("corda_method", ("ipm", "kpm"))
def test_lora_corda_conversion_same_output_after_loading_with_rank_pattern(self, data, tmp_path, corda_method):
    """CorDA save / load / convert round trip where the layer rank is set through ``rank_pattern``."""
    tol = 1e-06
    init_dir = tmp_path / "init-model"
    plain_dir = tmp_path / "corda-model"
    converted_dir = tmp_path / "corda-model-converted"

    model = self.get_model()
    output_base = model(data)[0]

    # use rank_pattern here; note that since there is only a single linear layer, r is completely overridden
    config = LoraConfig(
        init_lora_weights="corda",
        target_modules=["linear"],
        r=8,
        rank_pattern={"linear": 32},
        corda_config=CordaConfig(corda_method=corda_method),
    )
    preprocess_corda(model, config, run_model=lambda: model(data), hooked_model=model)
    peft_model = get_peft_model(deepcopy(model), config)

    # save the initial model; init_lora_weights is set to True only for this save and restored right after
    adapter_config = peft_model.peft_config["default"]
    adapter_config.init_lora_weights = True
    peft_model.save_pretrained(init_dir)
    adapter_config.init_lora_weights = "corda"

    # modify the weights, or else the adapter performs an identity transformation
    peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0
    output_corda = peft_model(data)[0]

    # sanity check: the modified adapter changes the output
    assert not torch.allclose(output_base, output_corda, atol=tol, rtol=tol)

    # save the model normally
    peft_model.save_pretrained(plain_dir)
    model_loaded = PeftModel.from_pretrained(deepcopy(model), plain_dir)
    loaded_linear = model_loaded.base_model.model.linear
    output_loaded = model_loaded(data)[0]

    assert torch.allclose(output_corda, output_loaded, atol=tol, rtol=tol)
    # sanity check: the config still reports r == 8, while lora_A has the 32 rows requested via rank_pattern
    assert model_loaded.peft_config["default"].r == 8
    assert loaded_linear.lora_A["default"].weight.shape[0] == 32
    # sanity check: the base model weights were indeed changed
    assert not torch.allclose(model.linear.weight, loaded_linear.base_layer.weight, atol=tol, rtol=tol)

    # save the model with conversion
    peft_model.save_pretrained(converted_dir, path_initial_model_for_weight_conversion=init_dir)
    model_converted = PeftModel.from_pretrained(deepcopy(model), converted_dir)
    converted_linear = model_converted.base_model.model.linear
    output_converted = model_converted(data)[0]

    assert torch.allclose(output_corda, output_converted, atol=tol, rtol=tol)
    # after conversion both numbers are doubled: r goes from 8 to 16 and lora_A from 32 to 64 rows
    assert model_converted.peft_config["default"].r == 16
    assert converted_linear.lora_A["default"].weight.shape[0] == 64
    # base model weights should be the same as the initial model
    assert torch.allclose(model.linear.weight, converted_linear.base_layer.weight, atol=tol, rtol=tol)
not torch.allclose( + model.linear.weight, model_loaded.base_model.model.linear.base_layer.weight, atol=tol, rtol=tol + ) + + # save the model with conversion + peft_model.save_pretrained( + tmp_path / "corda-model-converted", path_initial_model_for_weight_conversion=tmp_path / "init-model" + ) + model_converted = PeftModel.from_pretrained(deepcopy(model), tmp_path / "corda-model-converted") + output_converted = model_converted(data)[0] + + assert torch.allclose(output_corda, output_converted, atol=tol, rtol=tol) + # rank should be double of what it was initially + assert model_converted.peft_config["default"].r == 16 + assert model_converted.base_model.model.linear.lora_A["default"].weight.shape[0] == 16 + assert model_converted.base_model.model.linear.scaling["default"] == 10 / 16 + # base model weights should be the same as the initial model + assert torch.allclose( + model.linear.weight, model_converted.base_model.model.linear.base_layer.weight, atol=tol, rtol=tol + ) + + @pytest.mark.parametrize("corda_method", ("ipm", "kpm")) + def test_lora_corda_conversion_same_output_after_loading_with_rslora(self, data, tmp_path, corda_method): + model = self.get_model() + output_base = model(data)[0] + + corda_config = CordaConfig(corda_method=corda_method) + config = LoraConfig( + init_lora_weights="corda", target_modules=["linear"], r=8, use_rslora=True, corda_config=corda_config + ) + preprocess_corda(model, config, run_model=lambda: model(data), hooked_model=model) + peft_model = get_peft_model(deepcopy(model), config) + # save the initial model + peft_model.peft_config["default"].init_lora_weights = True + peft_model.save_pretrained(tmp_path / "init-model") + peft_model.peft_config["default"].init_lora_weights = "corda" + + # modify the weights, or else the adapter performs an identity transformation + peft_model.base_model.linear.lora_B["default"].weight.data *= 2.0 + output_corda = peft_model(data)[0] + + # sanity check + tol = 1e-06 + assert not 
torch.allclose(output_base, output_corda, atol=tol, rtol=tol) + + # save the model normally + peft_model.save_pretrained(tmp_path / "corda-model") + model_loaded = PeftModel.from_pretrained(deepcopy(model), tmp_path / "corda-model") + output_loaded = model_loaded(data)[0] + + assert torch.allclose(output_corda, output_loaded, atol=tol, rtol=tol) + # sanity check: ranks should still be 8 as initially + assert model_loaded.peft_config["default"].r == 8 + assert model_loaded.base_model.model.linear.lora_A["default"].weight.shape[0] == 8 + assert model_loaded.base_model.model.linear.scaling["default"] == 8 / (8**0.5) + # sanity check: the base model weights were indeed changed + assert not torch.allclose( + model.linear.weight, model_loaded.base_model.model.linear.base_layer.weight, atol=tol, rtol=tol + ) + + # save the model with conversion + peft_model.save_pretrained( + tmp_path / "corda-model-converted", path_initial_model_for_weight_conversion=tmp_path / "init-model" + ) + model_converted = PeftModel.from_pretrained(deepcopy(model), tmp_path / "corda-model-converted") + output_converted = model_converted(data)[0] + + assert torch.allclose(output_corda, output_converted, atol=tol, rtol=tol) + # rank should be double of what it was initially + assert model_converted.peft_config["default"].r == 16 + assert model_converted.base_model.model.linear.lora_A["default"].weight.shape[0] == 16 + # same scale as before with a little bit of floating point imprecision + assert model_converted.base_model.model.linear.scaling["default"] == pytest.approx(8 / (8**0.5)) + # base model weights should be the same as the initial model + assert torch.allclose( + model.linear.weight, model_converted.base_model.model.linear.base_layer.weight, atol=tol, rtol=tol + ) + + @pytest.mark.parametrize("corda_method", ("ipm", "kpm")) + def test_lora_corda_rank_pattern_and_rslora_raises(self, data, tmp_path, corda_method): + # it's not possible to determine the correct scale when using rslora with 
rank or alpha pattern, because the + # scale is not stored in the state_dict + model = self.get_model() + corda_config = CordaConfig(corda_method=corda_method) + config = LoraConfig( + init_lora_weights="corda", + target_modules=["linear"], + r=8, + rank_pattern={"linear": 2}, + use_rslora=True, + corda_config=corda_config, + ) + preprocess_corda(model, config, run_model=lambda: model(data), hooked_model=model) + peft_model = get_peft_model(model, config) + peft_model.save_pretrained(tmp_path / "init-model") + + msg = re.escape("Passing `path_initial_model_for_weight_conversion` to `save_pretrained`") + with pytest.raises(ValueError, match=msg): + peft_model.save_pretrained( + tmp_path / "corda-model", path_initial_model_for_weight_conversion=tmp_path / "init-model" + ) + + @pytest.mark.parametrize("corda_method", ("ipm", "kpm")) + def test_lora_corda_alpha_pattern_and_rslora_raises(self, data, tmp_path, corda_method): + # it's not possible to determine the correct scale when using rslora with rank or alpha pattern, because the + # scale is not stored in the state_dict + model = self.get_model() + corda_config = CordaConfig(corda_method=corda_method) + config = LoraConfig( + init_lora_weights="corda", + target_modules=["linear"], + r=8, + alpha_pattern={"linear": 2}, + use_rslora=True, + corda_config=corda_config, + ) + preprocess_corda(model, config, run_model=lambda: model(data), hooked_model=model) + peft_model = get_peft_model(model, config) + peft_model.save_pretrained(tmp_path / "init-model") + + msg = re.escape("Passing `path_initial_model_for_weight_conversion` to `save_pretrained`") + with pytest.raises(ValueError, match=msg): + peft_model.save_pretrained( + tmp_path / "corda-model", path_initial_model_for_weight_conversion=tmp_path / "init-model" + ) + + +class TestEvaInitialization: + """Tests for the EVA (Explained Variance Adaptation) initialization method. + + This test suite verifies: + 1. Consistency of initialization across different seeds + 2. 
Proper error handling for invalid inputs + 3. Compatibility with different model architectures + 4. Reproducibility of results + 5. Proper handling of edge cases + """ + + # Constants for test configuration + COSINE_SIMILARITY_THRESHOLD = 0.75 + NUM_SEEDS = 2 + BATCH_SIZE = 4 + MAX_LENGTH = 256 + LORA_DIM = 8 + LORA_ALPHA = 1 + DEVICE = infer_device() + # for caching purposes: + _dataset = load_dataset_english_quotes()["train"] + + @pytest.fixture + def tokenizer(self): + tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + + @pytest.fixture + def dataset(self, tokenizer): + # concatenate examples + examples = [] + example = "" + for data in self._dataset: + if len(example) >= self.MAX_LENGTH: + examples.append(example) + example = "" + example = example + " " + data["quote"] + dataset = Dataset.from_dict({"text": examples}) + # tokenize + dataset = dataset.map( + lambda x: tokenizer(x["text"], padding="max_length", truncation=True, max_length=self.MAX_LENGTH), + batched=True, + remove_columns=dataset.column_names, + ) + dataset.set_format(type="torch") + return dataset + + @pytest.fixture + def model(self): + model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") + model.transformer.h = model.transformer.h[:2] # truncate to 2 layers + return model.to(self.DEVICE) + + @pytest.fixture + def peft_config(self): + return LoraConfig( + r=self.LORA_DIM, + lora_alpha=self.LORA_ALPHA, + target_modules=["c_attn"], + init_lora_weights="eva", + eva_config=EvaConfig(rho=2), + ) + + @staticmethod + def collate_fn(examples): + return {k: torch.stack([v[k] for v in examples], dim=0) for k in examples[0].keys()} + + @staticmethod + def prepare_layer_inputs_fn(layer_input, model_input, layer_name): + return layer_input[0].view(-1, layer_input[0].size(-1)) + + def get_dataloader(self, dataset): + return DataLoader( + dataset, + batch_size=self.BATCH_SIZE, + collate_fn=self.collate_fn, + 
shuffle=False, + ) + + @pytest.mark.parametrize( + "prepare_layer_inputs_keys, expected_outcome", + [ + (None, "success"), + (["transformer.h.0.attn.c_attn"], "success"), + ( + ["transformer.h.0.attn.c_attn", "transformer.h.1.attn.c_attn", "transformer.h.2.attn.c_attn"], + "value_error", + ), + ], + ) + def test_eva_state_dict_prepare_inputs_mapping( + self, model, dataset, peft_config, prepare_layer_inputs_keys, expected_outcome + ): + """ + Tests for cases where prepare_layer_inputs_fn is a mapping. Checks that if not all target modules are present, + the prepare_layer_inputs_fn for the remaining modules is set to None. Also checks that if more keys than target + modules are present, a ValueError is raised. + """ + + def fn(x, *args): + return x[0].view(-1, x[0].size(-1)) + + if prepare_layer_inputs_keys is None: + prepare_layer_inputs_fn = fn + else: + prepare_layer_inputs_fn = {k: fn for k in prepare_layer_inputs_keys} + + shuffled_dataset = dataset.shuffle(seed=0) + dataloader = self.get_dataloader(shuffled_dataset) + modified_peft_config = deepcopy(peft_config) + modified_peft_config.eva_config.tau = 0 # converge immediately + if expected_outcome == "success": + sd = get_eva_state_dict( + model, + dataloader, + modified_peft_config, + prepare_model_inputs_fn=None, + prepare_layer_inputs_fn=prepare_layer_inputs_fn, + ) + assert len(sd) == 2 + assert "transformer.h.0.attn.c_attn" in sd + assert "transformer.h.1.attn.c_attn" in sd + else: + with pytest.raises( + ValueError, match="prepare_layer_inputs_fn is a mapping but the following module names were not found" + ): + get_eva_state_dict( + model, + dataloader, + modified_peft_config, + prepare_model_inputs_fn=None, + prepare_layer_inputs_fn=prepare_layer_inputs_fn, + ) + + @pytest.mark.parametrize( + "eva_config", + [EvaConfig(rho=2, adjust_scaling_factors=True)], + ) + def test_eva_state_dict_adjust_scaling_factors(self, model, dataset, peft_config, eva_config): + """ + Tests that the scaling factors are 
adjusted so that all LoRA gradients have the same scale regardless of their + rank. + """ + modified_peft_config = deepcopy(peft_config) + modified_peft_config.eva_config = eva_config + dataloader = self.get_dataloader(dataset) + peft_model = get_peft_model(deepcopy(model), modified_peft_config) + scaling_factors_before = {} + for n, m in peft_model.named_modules(): + if isinstance(m, LoraLayer): + scaling_factors_before[n] = m.scaling["default"] + initialize_lora_eva_weights(peft_model, dataloader) + for n, m in peft_model.named_modules(): + if isinstance(m, LoraLayer): + assert m.scaling["default"] == scaling_factors_before[n] + + @pytest.mark.parametrize( + "eva_config", + [ + # note: lower tau to decrease number of iterations until convergence, as tests are slow on CPU + EvaConfig(rho=2, tau=0.9), + EvaConfig(rho=1, tau=0.9), + EvaConfig(rho=1, whiten=True, tau=0.9), + EvaConfig(rho=1.0001, tau=0.9), + ], + ) + def test_eva_initialization_consistency(self, model, dataset, peft_config, eva_config): + """ + Tests that the state dict returned by `get_eva_state_dict` is consistent across different seeds based on the + cosine similarity of the svd components. 
+ """ + modified_peft_config = deepcopy(peft_config) + modified_peft_config.eva_config = eva_config + state_dicts = [] + for seed in range(self.NUM_SEEDS): + shuffled_dataset = dataset.shuffle(seed=seed) + dataloader = self.get_dataloader(shuffled_dataset) + sd = get_eva_state_dict(model, dataloader, modified_peft_config, show_progress_bar=False) + state_dicts.append(sd) + + cos_sims = defaultdict(list) + for i, j in itertools.combinations(range(self.NUM_SEEDS), 2): + for k, v1 in state_dicts[i].items(): + v2 = state_dicts[j][k] + min_size = min(v1.size(0), v2.size(0)) + cos_sims[k].extend(torch.cosine_similarity(v1[:min_size].abs(), v2[:min_size].abs(), dim=1).tolist()) + + mean_cosine_similarities = {k: torch.tensor(v).mean() for k, v in cos_sims.items()} + for layer_name, mean_cosine_similarity in mean_cosine_similarities.items(): + assert mean_cosine_similarity > self.COSINE_SIMILARITY_THRESHOLD, ( + f"Mean absolute cosine similarity {mean_cosine_similarity:.4f} " + f"is not greater than {self.COSINE_SIMILARITY_THRESHOLD}" + ) + + @pytest.mark.parametrize("has_rank_zero", [True, False]) + def test_load_eva_state_dict(self, model, dataset, peft_config, tmp_path, has_rank_zero): + """ + Tests that the `eva_state_dict` argument in `initialize_lora_eva_weights` can be used to initialize a model + with EVA weights and that the initialized model can be saved and loaded correctly. 
+ """ + dataloader = self.get_dataloader(dataset) + peft_model = get_peft_model(deepcopy(model), peft_config) + sd = get_eva_state_dict(peft_model, dataloader) + if has_rank_zero: + k = "base_model.model.transformer.h.0.attn.c_attn" + sd[k] = sd[k][:0] + initialize_lora_eva_weights(peft_model, eva_state_dict=sd) + if has_rank_zero: + assert not isinstance(peft_model.model.transformer.h[0].attn.c_attn, LoraLayer) + else: + assert isinstance(peft_model.model.transformer.h[0].attn.c_attn, LoraLayer) + peft_model.save_pretrained(tmp_path) + peft_model = PeftModel.from_pretrained(model, tmp_path, torch_device=self.DEVICE, low_cpu_mem_usage=True) + peft_model(**{k: v.to(self.DEVICE) for k, v in next(iter(dataloader)).items()}) + + def test_missing_eva_inits(self, model, dataset, peft_config): + """ + Tests that a warning is raised when some adapter modules were not initialized with EVA weights. + """ + modified_peft_config = deepcopy(peft_config) + modified_peft_config.target_modules = ["wte"] + dataloader = self.get_dataloader(dataset) + peft_model = get_peft_model(deepcopy(model), modified_peft_config) + with pytest.warns( + UserWarning, + match="the following layers were initialized with init_lora_weights=True because they were not found in the eva state_dict:*", + ): + initialize_lora_eva_weights(peft_model, dataloader) + + def test_load_eva_model(self, model, dataset, peft_config, tmp_path): + """ + Tests that a model initialized with EVA weights can be loaded correctly. 
+ """ + dataloader = self.get_dataloader(dataset) + peft_model = get_peft_model(deepcopy(model), peft_config) + initialize_lora_eva_weights(peft_model, dataloader) + peft_model.save_pretrained(tmp_path) + peft_model = PeftModel.from_pretrained(model, tmp_path, torch_device=self.DEVICE, low_cpu_mem_usage=True) + peft_model(**{k: v.to(self.DEVICE) for k, v in next(iter(dataloader)).items()}) + + def test_eva_initialization_with_invalid_dataloader(self, model, peft_config): + """Test that appropriate error is raised when dataloader is empty.""" + empty_dataset = Dataset.from_dict({"text": []}) + dataloader = self.get_dataloader(empty_dataset) + + with pytest.raises(ValueError, match="dataloader is empty"): + get_eva_state_dict(model, dataloader, peft_config) + + def test_eva_config_rho(self): + """ + Tests that EvaConfig.__init__ raises a ValueError when rho is negative. + """ + with pytest.raises(ValueError, match="`rho` must be >= 1.0"): + EvaConfig(rho=-1) + + def test_eva_config_tau(self): + """ + Tests that EvaConfig.__init__ raises a ValueError when tau is not between 0.0 and 1.0. + """ + with pytest.raises(ValueError, match="`tau` must be between 0.0 and 1.0."): + EvaConfig(tau=-0.1) + with pytest.raises(ValueError, match="`tau` must be between 0.0 and 1.0."): + EvaConfig(tau=1.1) + + def test_lora_config_raises_warning_with_eva_init_but_not_eva_config(self): + """ + Tests that LoraConfig.__init__ raises a warning when init_lora_weights='eva' but eva_config is not set. + """ + with pytest.warns( + UserWarning, + match="`init_lora_weights` is 'eva' but `eva_config` is not specified. Using default EVA config.", + ): + LoraConfig(init_lora_weights="eva") + + def test_lora_config_raises_warning_with_eva_config_but_not_eva_init(self): + """ + Tests that LoraConfig.__init__ raises a warning when init_lora_weights is not 'eva' but eva_config is set. 
+ """ + with pytest.warns( + UserWarning, match="`eva_config` specified but will be ignored when `init_lora_weights` is not 'eva'." + ): + LoraConfig(init_lora_weights=True, eva_config=EvaConfig()) + + +@pytest.mark.skipif( + platform.system() != "Linux", reason="Out of the box, torch.compile does not work on Windows or MacOS" +) +class TestHotSwapping: + """Tests for the hotswapping function""" + + torch_device = infer_device() + + def compile(self, model, do_compile): + if not do_compile: + return model + return torch.compile(model) + + def get_model(self): + class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.lin0 = nn.Linear(10, 20, bias=True) + self.relu = nn.ReLU() + self.lin1 = nn.Linear(20, 5, bias=False) + + def forward(self, X): + X = X.float() + X = self.lin0(X) + X = self.relu(X) + X = self.lin1(X) + return X + + torch.manual_seed(0) + return MLP().to(self.torch_device) + + def get_model_conv2d(self): + class ConvModel(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv2d(3, 10, kernel_size=3) + + def forward(self, X): + return self.conv(X) + + torch.manual_seed(0) + return ConvModel().to(self.torch_device) + + # this works with all adapters except prompt learning, but we don't test all + # as it is unnecessary and would be slow + @pytest.mark.parametrize( + "config", + [ + LoraConfig(init_lora_weights=0, target_modules=["lin0"]), + LoraConfig(init_lora_weights=0, target_modules=["lin0", "lin1"]), + ], + ) + @pytest.mark.parametrize("do_compile", [False, True]) + def test_hotswap_works(self, config, do_compile, tmp_path): + # Load 2 different adapters and check that we can hotswap between them, with the model optionally being + # compiled. 
+ atol, rtol = 1e-4, 1e-4 + inputs = torch.rand(3, 10).to(self.torch_device) + + # create adapter 0 + model = self.get_model() + torch.manual_seed(0) + model = get_peft_model(model, config) + model = self.compile(model, do_compile=do_compile) + model.eval() + with torch.inference_mode(): + output0 = model(inputs) + model.save_pretrained(tmp_path / "adapter0") + + del model + + # create adapter 1 + model = self.get_model() + torch.manual_seed(1) + model = get_peft_model(model, config) + model = self.compile(model, do_compile=do_compile) + model.eval() + with torch.inference_mode(): + output1 = model(inputs) + model.save_pretrained(tmp_path / "adapter1") + + # sanity check: they're not the same + assert not torch.allclose(output0, output1, atol=atol, rtol=rtol) + + del model + + # load adapter 0 + model = self.get_model() + model = PeftModel.from_pretrained(model, tmp_path / "adapter0") + model = self.compile(model, do_compile=do_compile) + with torch.inference_mode(): + output_loaded0 = model(inputs) + + # sanity check: same output after loading for adapter 0 + assert torch.allclose(output0, output_loaded0, atol=atol, rtol=rtol) + + # hotswap with adapter 1 + hotswap_adapter(model, tmp_path / "adapter1", adapter_name="default") + with torch.inference_mode(): + output_loaded1 = model(inputs) + + # real check: model now behaves like adapter 1 + assert torch.allclose(output1, output_loaded1, atol=atol, rtol=rtol) + + # hotswap back to adapter 0 + hotswap_adapter(model, tmp_path / "adapter0", adapter_name="default") + with torch.inference_mode(): + output_loaded_back0 = model(inputs) + + # real check: model now behaves again like adapter 0 + assert torch.allclose(output0, output_loaded_back0, atol=atol, rtol=rtol) + + def test_hotswap_different_peft_types_raises(self, tmp_path): + # When the configs of the two adapters are different PEFT methods, raise + config0 = LoraConfig(target_modules=["lin0"]) + config1 = IA3Config(target_modules=["lin0"], feedforward_modules=[]) 
+ + model = self.get_model() + model = get_peft_model(model, config0) + model.save_pretrained(tmp_path / "adapter0") + del model + + model = self.get_model() + model = get_peft_model(model, config1) + model.save_pretrained(tmp_path / "adapter1") + del model + + # load adapter 0 + model = self.get_model() + model = PeftModel.from_pretrained(model, tmp_path / "adapter0") + + msg = r"Incompatible PEFT types found: LORA and IA3" + with pytest.raises(ValueError, match=msg): + hotswap_adapter(model, tmp_path / "adapter1", adapter_name="default") + + def test_hotswap_wrong_peft_types_raises(self, tmp_path): + # Only LoRA is supported at the moment + config0 = IA3Config(target_modules=["lin0"], feedforward_modules=[]) + config1 = IA3Config(target_modules=["lin0"], feedforward_modules=[]) + + model = self.get_model() + model = get_peft_model(model, config0) + model.save_pretrained(tmp_path / "adapter0") + del model + + model = self.get_model() + model = get_peft_model(model, config1) + model.save_pretrained(tmp_path / "adapter1") + del model + + # load adapter 0 + model = self.get_model() + model = PeftModel.from_pretrained(model, tmp_path / "adapter0") + + msg = r"Hotswapping only supports LORA but IA3 was passed" + with pytest.raises(ValueError, match=msg): + hotswap_adapter(model, tmp_path / "adapter1", adapter_name="default") + + def test_hotswap_missing_key_works(self, tmp_path): + # When a key is missing, it is fine, the extra weight is zeroed out + config = LoraConfig(target_modules=["lin0", "lin1"]) + + model = self.get_model() + model = get_peft_model(model, config) + model.save_pretrained(tmp_path / "adapter0") + del model + + model = self.get_model() + model = get_peft_model(model, config) + + # remove one key from the state_dict + key = "base_model.model.lin1.lora_A.default.weight" + state_dict = model.state_dict() + del state_dict[key] + model.state_dict = lambda: state_dict + model.save_pretrained(tmp_path / "adapter1") + del model + + # load adapter 0 + model 
= self.get_model() + model = PeftModel.from_pretrained(model, tmp_path / "adapter0") + + # sanity check: the missing weight is not already all zeros + assert not (model.base_model.model.lin1.lora_A["default"].weight == 0).all() + hotswap_adapter(model, tmp_path / "adapter1", adapter_name="default") + # after hotswapping, it is zeroed out + assert (model.base_model.model.lin1.lora_A["default"].weight == 0).all() + + def test_hotswap_extra_key_raises(self, tmp_path): + # When there is an extra key, raise + config = LoraConfig(target_modules=["lin0"]) + + model = self.get_model() + model = get_peft_model(model, config) + model.save_pretrained(tmp_path / "adapter0") + del model + + model = self.get_model() + model = get_peft_model(model, config) + + # add an unexpected key + state_dict = model.state_dict() + new_key = "base_model.model.lin1.lora_A.default.weight" + state_dict[new_key] = torch.zeros(8, 20) + model.state_dict = lambda: state_dict + model.save_pretrained(tmp_path / "adapter1") + del model + + # load adapter 0 + model = self.get_model() + model = PeftModel.from_pretrained(model, tmp_path / "adapter0") + + msg = f"Hot swapping the adapter did not succeed, unexpected keys found: {new_key}" + with pytest.raises(RuntimeError, match=msg): + hotswap_adapter(model, tmp_path / "adapter1", adapter_name="default") + + @pytest.mark.parametrize("ranks", [(7, 13), (13, 7)]) + def test_hotswap_works_different_ranks_alphas(self, ranks, tmp_path): + # same as test_hotswap_works but different rank and alpha + # Load 2 different adapters and check that we can hotswap between them, with the model optionally being + # compiled. 
+ atol, rtol = 1e-4, 1e-4 + inputs = torch.rand(3, 10).to(self.torch_device) + + # create adapter 0 + config0 = LoraConfig(target_modules=["lin0", "lin1"], r=ranks[0], lora_alpha=ranks[0], init_lora_weights=False) + model = self.get_model() + torch.manual_seed(0) + model = get_peft_model(model, config0) + model.eval() + with torch.inference_mode(): + output0 = model(inputs) + model.save_pretrained(tmp_path / "adapter0") + + del model + + # create adapter 1 + config1 = LoraConfig(target_modules=["lin0"], r=ranks[1], lora_alpha=ranks[1], init_lora_weights=False) + model = self.get_model() + torch.manual_seed(1) + model = get_peft_model(model, config1) + model.eval() + with torch.inference_mode(): + output1 = model(inputs) + model.save_pretrained(tmp_path / "adapter1") + + # sanity check: they're not the same + assert not torch.allclose(output0, output1, atol=atol, rtol=rtol) + + del model + + # load adapter 0 + model = self.get_model() + model = PeftModel.from_pretrained(model, tmp_path / "adapter0") + with torch.inference_mode(): + output_loaded0 = model(inputs) + + # sanity check: same output after loading for adapter 0 + assert torch.allclose(output0, output_loaded0, atol=atol, rtol=rtol) + + # hotswap with adapter 1 + hotswap_adapter(model, tmp_path / "adapter1", adapter_name="default") + with torch.inference_mode(): + output_loaded1 = model(inputs) + + # real check: model now behaves like adapter 1 + assert torch.allclose(output1, output_loaded1, atol=atol, rtol=rtol) + + # hotswap back to adapter 0 + hotswap_adapter(model, tmp_path / "adapter0", adapter_name="default") + with torch.inference_mode(): + output_loaded_back0 = model(inputs) + + # real check: model now behaves again like adapter 0 + assert torch.allclose(output0, output_loaded_back0, atol=atol, rtol=rtol) + + @pytest.mark.parametrize("ranks", [(7, 13), (13, 7)]) + def test_hotswap_works_different_ranks_alphas_conv2d(self, ranks, tmp_path): + # same as previous test, but for a Conv2d model + atol, 
rtol = 1e-4, 1e-4 + inputs = torch.rand(3, 3, 10, 10).to(self.torch_device) + + # create adapter 0 + config0 = LoraConfig(target_modules=["conv"], r=ranks[0], init_lora_weights=False) + model = self.get_model_conv2d() + torch.manual_seed(0) + model = get_peft_model(model, config0) + model.eval() + with torch.inference_mode(): + output0 = model(inputs) + model.save_pretrained(tmp_path / "adapter0") + + del model + + # create adapter 1 + config1 = LoraConfig(target_modules=["conv"], r=ranks[1], init_lora_weights=False) + model = self.get_model_conv2d() + torch.manual_seed(1) + model = get_peft_model(model, config1) + model.eval() + with torch.inference_mode(): + output1 = model(inputs) + model.save_pretrained(tmp_path / "adapter1") + + # sanity check: they're not the same + assert not torch.allclose(output0, output1, atol=atol, rtol=rtol) + + del model + + # load adapter 0 + model = self.get_model_conv2d() + model = PeftModel.from_pretrained(model, tmp_path / "adapter0") + with torch.inference_mode(): + output_loaded0 = model(inputs) + + # sanity check: same output after loading for adapter 0 + assert torch.allclose(output0, output_loaded0, atol=atol, rtol=rtol) + + # hotswap with adapter 1 + hotswap_adapter(model, tmp_path / "adapter1", adapter_name="default") + with torch.inference_mode(): + output_loaded1 = model(inputs) + + # real check: model now behaves like adapter 1 + assert torch.allclose(output1, output_loaded1, atol=atol, rtol=rtol) + + # hotswap back to adapter 0 + hotswap_adapter(model, tmp_path / "adapter0", adapter_name="default") + with torch.inference_mode(): + output_loaded_back0 = model(inputs) + + # real check: model now behaves again like adapter 0 + assert torch.allclose(output0, output_loaded_back0, atol=atol, rtol=rtol) + + def test_prepare_model_for_compiled_hotswap_scalings_are_tensors(self): + config = LoraConfig(target_modules=["lin0", "lin1"]) + model = self.get_model() + model = get_peft_model(model, config) + + # sanity check: all 
scalings are floats + scalings_before = {} + for name, module in model.named_modules(): + if hasattr(module, "scaling"): + for key, val in module.scaling.items(): + assert isinstance(val, float) + scalings_before[f"{name}.{key}"] = val + + prepare_model_for_compiled_hotswap(model) + + scalings_after = {} + for name, module in model.named_modules(): + if hasattr(module, "scaling"): + for key, val in module.scaling.items(): + assert isinstance(val, torch.Tensor) + scalings_after[f"{name}.{key}"] = val.item() + + assert scalings_before == scalings_after + + def test_prepare_model_for_compiled_hotswap_rank_padding_works(self): + old_rank = 8 + config = LoraConfig(target_modules=["lin0", "lin1"], r=old_rank) + model = self.get_model() + model = get_peft_model(model, config) + + # sanity check + for name, param in model.named_parameters(): + if "lora_A" in name: + assert param.shape[0] == old_rank + elif "lora_B" in name: + assert param.shape[1] == old_rank + + new_rank = 13 + prepare_model_for_compiled_hotswap(model, target_rank=new_rank) + + for name, param in model.named_parameters(): + if "lora_A" in name: + assert param.shape[0] == new_rank + elif "lora_B" in name: + assert param.shape[1] == new_rank + + def test_prepare_model_for_compiled_hotswap_same_rank_padding_works(self): + # same as previous test, but ensure there is no error if the rank to pad to is the same + old_rank = 8 + config = LoraConfig(target_modules=["lin0", "lin1"], r=old_rank) + model = self.get_model() + model = get_peft_model(model, config) + prepare_model_for_compiled_hotswap(model, target_rank=old_rank) + + for name, param in model.named_parameters(): + if "lora_A" in name: + assert param.shape[0] == old_rank + elif "lora_B" in name: + assert param.shape[1] == old_rank + + def test_prepare_model_for_compiled_hotswap_conv2d_rank_padding_works(self): + # same as previous test, but for a Conv2d model + old_rank = 8 + config = LoraConfig(target_modules=["conv"], r=old_rank) + model = 
self.get_model_conv2d() + model = get_peft_model(model, config) + + # sanity check + for name, param in model.named_parameters(): + if "lora_A" in name: + assert param.shape[0] == old_rank + elif "lora_B" in name: + assert param.shape[1] == old_rank + + new_rank = 13 + prepare_model_for_compiled_hotswap(model, target_rank=new_rank) + + for name, param in model.named_parameters(): + if "lora_A" in name: + assert param.shape[0] == new_rank + elif "lora_B" in name: + assert param.shape[1] == new_rank + + def test_prepare_model_for_compiled_hotswap_lower_rank_padding_raises(self): + # when trying to pad to a lower rank, raise an error + old_rank0 = 8 + old_rank1 = 10 + new_rank = 9 + config = LoraConfig(target_modules=["lin0", "lin1"], r=old_rank0, rank_pattern={"lin1": old_rank1}) + model = self.get_model() + model = get_peft_model(model, config) + + msg = re.escape("Trying to pad the adapter to the target rank 9, but the original rank is larger (10)") + with pytest.raises(ValueError, match=msg): + prepare_model_for_compiled_hotswap(model, target_rank=new_rank) + + def test_prepare_model_for_compiled_hotswap_with_rank_pattern(self): + old_rank0 = 8 + old_rank1 = 9 + config = LoraConfig(target_modules=["lin0", "lin1"], r=old_rank0, rank_pattern={"lin1": old_rank1}) + model = self.get_model() + model = get_peft_model(model, config) + + # sanity check + for name, param in model.named_parameters(): + if "lora_A" in name: + if "lin0" in name: + assert param.shape[0] == old_rank0 + else: + assert param.shape[0] == old_rank1 + elif "lora_B" in name: + if "lin0" in name: + assert param.shape[1] == old_rank0 + else: + assert param.shape[1] == old_rank1 + + new_rank = 13 + prepare_model_for_compiled_hotswap(model, target_rank=new_rank) + + for name, param in model.named_parameters(): + if "lora_A" in name: + assert param.shape[0] == new_rank + elif "lora_B" in name: + assert param.shape[1] == new_rank + + def 
def test_prepare_model_for_compiled_hotswap_model_already_compiled_warns(self, recwarn):
    """Check that ``check_compiled="warn"`` emits a warning when the model was compiled before preparation."""
    lora_config = LoraConfig(target_modules=["lin0"])
    peft_model = get_peft_model(self.get_model(), lora_config)
    compiled_model = torch.compile(peft_model, mode="reduce-overhead")

    prepare_model_for_compiled_hotswap(compiled_model, check_compiled="warn")

    # look for the expected text among all warnings recorded during the test
    expected_text = "prepare_model_for_compiled_hotswap was called with a model that is already compiled"
    recorded_texts = [str(record.message) for record in recwarn]
    assert any(expected_text in text for text in recorded_texts)
def test_prepare_model_for_compiled_hotswap_does_not_change_output(self):
    """Check that padding the adapter for compiled hotswapping leaves the model output unchanged.

    The adapter is created with ``init_lora_weights=False`` so that LoRA actually alters the base output; that
    is verified first, otherwise the final comparison would be trivially true.
    """

    def run(net):
        # all forward passes use the same inputs and run without autograd tracking
        with torch.inference_mode():
            return net(inputs)

    inputs = torch.rand(3, 10).to(self.torch_device)
    base_model = self.get_model().eval()
    output_base = run(base_model)

    lora_config = LoraConfig(target_modules=["lin0", "lin1"], r=8, init_lora_weights=False)
    peft_model = get_peft_model(base_model, lora_config).eval()
    output_before = run(peft_model)

    # sanity check: LoRA changed output
    assert not torch.allclose(output_base, output_before)

    prepare_model_for_compiled_hotswap(peft_model, target_rank=13)
    output_after = run(peft_model)

    assert torch.allclose(output_before, output_after)
def test_prepare_model_for_compiled_hotswap_conv2d_lora_bias(self):
    """Check rank padding on a Conv2d model whose LoRA adapter is created with ``lora_bias=True``.

    Only ``lora_B`` may carry a bias term; finding a ``lora_A`` bias fails the test. The rank dims of the LoRA
    weights must go from 8 to 13, while the ``lora_B`` bias keeps its length of 10 before and after padding.
    """
    rank_before, rank_after = 8, 13
    conv_out_size = 10  # output shape of conv layer

    def assert_lora_shapes(peft_model, expected_rank):
        for param_name, tensor in peft_model.named_parameters():
            is_weight = param_name.endswith(".weight")
            is_bias = param_name.endswith(".bias")
            if "lora_A" in param_name and is_weight:
                assert tensor.shape[0] == expected_rank
            elif "lora_B" in param_name and is_weight:
                assert tensor.shape[1] == expected_rank
            elif "lora_A" in param_name and is_bias:
                assert False, "LoRA A should not have a bias term"
            elif "lora_B" in param_name and is_bias:
                # the bias is not padded, its size does not depend on the rank
                assert tensor.shape[0] == conv_out_size

    lora_config = LoraConfig(target_modules=["conv"], r=rank_before, lora_bias=True)
    peft_model = get_peft_model(self.get_model_conv2d(), lora_config)

    # sanity check
    assert_lora_shapes(peft_model, rank_before)

    prepare_model_for_compiled_hotswap(peft_model, target_rank=rank_after)
    assert_lora_shapes(peft_model, rank_after)
class TestScaling:
    """Tests for scaling and unscaling

    Those methods are currently only implemented for LoRA and were added for use in diffusers.

    The tests read the per-adapter ``scaling`` value of every ``LoraLayer`` in the model and compare it to the
    expected value after calls to ``scale_layer``, ``unscale_layer`` and ``set_scale``.
    """

    @pytest.fixture
    def model(self):
        # tiny opt with 5 attention layers
        model_id = "hf-internal-testing/tiny-random-OPTForCausalLM"
        return AutoModelForCausalLM.from_pretrained(model_id)

    def get_scalings(self, model, adapter_name="default"):
        # helper function, returns the scalings of the 5 attention layers
        return [m.scaling[adapter_name] for m in model.modules() if isinstance(m, LoraLayer)]

    def set_scale(self, model, adapter_name, scale):
        # helper function, calls set_scale for the given adapter on every LoRA layer
        for module in model.modules():
            if isinstance(module, LoraLayer):
                module.set_scale(adapter_name, scale)

    def scale_layer(self, model, scale):
        # helper function, calls scale_layer on every LoRA layer
        for module in model.modules():
            if isinstance(module, LoraLayer):
                module.scale_layer(scale)

    def unscale_layer(self, model, scale):
        # helper function, calls unscale_layer on every LoRA layer; the tests pass scale=None to get back to the
        # original scaling
        for module in model.modules():
            if isinstance(module, LoraLayer):
                module.unscale_layer(scale)

    def test_scaling_simple(self, model):
        n_layers = 5
        rank, lora_alpha = 8, 16
        config = LoraConfig(
            r=rank,
            lora_alpha=lora_alpha,
            target_modules=["k_proj"],
        )
        model = get_peft_model(model, config)
        scalings = self.get_scalings(model)
        expected = [lora_alpha / rank] * n_layers
        assert scalings == expected

        # double
        self.scale_layer(model, 2)
        scalings = self.get_scalings(model)
        expected = [4.0] * n_layers
        assert scalings == expected

        # back to original
        self.unscale_layer(model, None)
        scalings = self.get_scalings(model)
        expected = [2.0] * n_layers
        assert scalings == expected

        # triple
        self.set_scale(model, "default", 3)
        scalings = self.get_scalings(model)
        expected = [6.0] * n_layers
        assert scalings == expected

        # back to original
        self.unscale_layer(model, 3)
        scalings = self.get_scalings(model)
        expected = [2.0] * n_layers
        assert scalings == expected

    def test_scaling_with_rslora(self, model):
        n_layers = 5
        rank, lora_alpha = 8, 16
        config = LoraConfig(
            r=rank,
            lora_alpha=lora_alpha,
            use_rslora=True,
            target_modules=["k_proj"],
        )
        model = get_peft_model(model, config)
        scalings = self.get_scalings(model)
        expected = [lora_alpha / math.sqrt(rank)] * n_layers
        assert scalings == expected

        # double
        self.scale_layer(model, 2)
        scalings = self.get_scalings(model)
        expected = [2 * lora_alpha / math.sqrt(rank)] * n_layers
        assert scalings == expected

        # back to original
        self.unscale_layer(model, None)
        scalings = self.get_scalings(model)
        expected = [lora_alpha / math.sqrt(rank)] * n_layers
        assert scalings == expected

        # triple
        self.set_scale(model, "default", 3)
        scalings = self.get_scalings(model)
        expected = [3 * lora_alpha / math.sqrt(rank)] * n_layers
        assert scalings == expected

        # back to original
        self.unscale_layer(model, 3)
        scalings = self.get_scalings(model)
        expected = [lora_alpha / math.sqrt(rank)] * n_layers
        assert scalings == expected

    def test_scaling_rank_pattern_alpha_pattern(self, model):
        # layer 0: 8 / 8
        # layer 1: 8 / 16
        # layer 2: 4 / 32
        # layer 3: 16 / 8
        # layer 4: 8 / 8
        config = LoraConfig(
            r=8,
            lora_alpha=8,
            target_modules=["k_proj"],
            rank_pattern={"layers.1.self_attn.k_proj": 16, "layers.2.self_attn.k_proj": 32},
            alpha_pattern={"layers.2.self_attn.k_proj": 4, "layers.3.self_attn.k_proj": 16},
        )
        model = get_peft_model(model, config)
        scalings = self.get_scalings(model)
        expected = [1.0, 0.5, 0.125, 2.0, 1.0]
        assert scalings == expected

        # double
        self.scale_layer(model, 2)
        scalings = self.get_scalings(model)
        expected = [2.0, 1.0, 0.25, 4.0, 2.0]
        assert scalings == expected

        # back to original
        self.unscale_layer(model, None)
        scalings = self.get_scalings(model)
        expected = [1.0, 0.5, 0.125, 2.0, 1.0]
        assert scalings == expected

        # triple
        self.set_scale(model, "default", 3)
        scalings = self.get_scalings(model)
        expected = [3.0, 1.5, 0.375, 6.0, 3.0]
        assert scalings == expected

        # back to original
        self.unscale_layer(model, 3)
        scalings = self.get_scalings(model)
        expected = [1.0, 0.5, 0.125, 2.0, 1.0]
        assert scalings == expected

    def test_scaling_multiple_times(self, model):
        # same as previous test, but scale and unscale multiple times in a row
        # layer 0: 8 / 8
        # layer 1: 8 / 16
        # layer 2: 4 / 32
        # layer 3: 16 / 8
        # layer 4: 8 / 8
        config = LoraConfig(
            r=8,
            lora_alpha=8,
            target_modules=["k_proj"],
            rank_pattern={"layers.1.self_attn.k_proj": 16, "layers.2.self_attn.k_proj": 32},
            alpha_pattern={"layers.2.self_attn.k_proj": 4, "layers.3.self_attn.k_proj": 16},
        )
        model = get_peft_model(model, config)
        scalings = self.get_scalings(model)
        expected = [1.0, 0.5, 0.125, 2.0, 1.0]
        assert scalings == expected

        # scale of 1 makes no difference
        self.scale_layer(model, 1)
        scalings = self.get_scalings(model)
        expected = [1.0, 0.5, 0.125, 2.0, 1.0]
        # this step must be asserted like all the others, otherwise it verifies nothing
        assert scalings == expected

        # double
        self.scale_layer(model, 2)
        scalings = self.get_scalings(model)
        expected = [2.0, 1.0, 0.25, 4.0, 2.0]
        assert scalings == expected

        # triple, on top of previous double
        self.scale_layer(model, 3)
        scalings = self.get_scalings(model)
        expected = [6.0, 3.0, 0.75, 12.0, 6.0]
        assert scalings == expected

        # half
        self.unscale_layer(model, 2)
        scalings = self.get_scalings(model)
        expected = [3.0, 1.5, 0.375, 6.0, 3.0]
        assert scalings == expected

        # divide by 3, on top of previous half
        self.unscale_layer(model, 3)
        scalings = self.get_scalings(model)
        expected = [1.0, 0.5, 0.125, 2.0, 1.0]
        assert scalings == expected

        # set scale to 2
        self.set_scale(model, "default", 2)
        scalings = self.get_scalings(model)
        expected = [2.0, 1.0, 0.25, 4.0, 2.0]
        assert scalings == expected

        # set scale to 3, it is cumulative but based on the initial scaling, so factor 3, not 6
        self.set_scale(model, "default", 3)
        scalings = self.get_scalings(model)
        expected = [3.0, 1.5, 0.375, 6.0, 3.0]
        assert scalings == expected

        # back to original
        self.unscale_layer(model, None)
        scalings = self.get_scalings(model)
        expected = [1.0, 0.5, 0.125, 2.0, 1.0]
        assert scalings == expected

        # back to original again
        self.unscale_layer(model, None)
        scalings = self.get_scalings(model)
        expected = [1.0, 0.5, 0.125, 2.0, 1.0]
        assert scalings == expected

    def test_scaling_multiple_adapters(self, model):
        # ensure that scaling works with multiple adapters
        n_layers = 5
        rank0, lora_alpha0 = 8, 16
        config0 = LoraConfig(
            r=rank0,
            lora_alpha=lora_alpha0,
            target_modules=["k_proj"],
        )
        rank1, lora_alpha1 = 16, 8
        config1 = LoraConfig(
            r=rank1,
            lora_alpha=lora_alpha1,
            target_modules=["k_proj"],
        )
        model = get_peft_model(model, config0)
        model.add_adapter("other", config1)

        scalings_default = self.get_scalings(model, "default")
        scalings_other = self.get_scalings(model, "other")
        expected_default = [lora_alpha0 / rank0] * n_layers
        expected_other = [lora_alpha1 / rank1] * n_layers
        assert scalings_default == expected_default
        assert scalings_other == expected_other

        # double the scale for other
        self.set_scale(model, "other", 2)
        scalings_default = self.get_scalings(model, "default")
        scalings_other = self.get_scalings(model, "other")
        expected_default = [lora_alpha0 / rank0] * n_layers
        expected_other = [2 * lora_alpha1 / rank1] * n_layers
        assert scalings_default == expected_default
        assert scalings_other == expected_other

        # quarter the scale for default
        self.set_scale(model, "default", 0.25)
        scalings_default = self.get_scalings(model, "default")
        scalings_other = self.get_scalings(model, "other")
        expected_default = [lora_alpha0 / rank0 / 4] * n_layers
        expected_other = [2 * lora_alpha1 / rank1] * n_layers
        assert scalings_default == expected_default
        assert scalings_other == expected_other

        # unscale resets for all *active* adapters
        self.unscale_layer(model, None)
        scalings_default = self.get_scalings(model, "default")
        scalings_other = self.get_scalings(model, "other")
        expected_default = [lora_alpha0 / rank0] * n_layers
        expected_other = [2 * lora_alpha1 / rank1] * n_layers  # stays the same as 'other' is not active
        assert scalings_default == expected_default
        assert scalings_other == expected_other

        # scale all *active* adapters by 2
        self.scale_layer(model, 2)
        scalings_default = self.get_scalings(model, "default")
        scalings_other = self.get_scalings(model, "other")
        expected_default = [2 * lora_alpha0 / rank0] * n_layers
        expected_other = [2 * lora_alpha1 / rank1] * n_layers  # stays the same as 'other' is not active
        assert scalings_default == expected_default
        assert scalings_other == expected_other

        # switch to 'other'
        model.set_adapter("other")

        # unscale, this time 'other'
        self.unscale_layer(model, None)
        scalings_default = self.get_scalings(model, "default")
        scalings_other = self.get_scalings(model, "other")
        expected_default = [2 * lora_alpha0 / rank0] * n_layers  # stays the same as 'default' is not active
        expected_other = [lora_alpha1 / rank1] * n_layers
        assert scalings_default == expected_default
        assert scalings_other == expected_other

        # scale all *active* adapters by 3
        self.scale_layer(model, 3)
        scalings_default = self.get_scalings(model, "default")
        scalings_other = self.get_scalings(model, "other")
        expected_default = [2 * lora_alpha0 / rank0] * n_layers  # stays the same as 'default' is not active
        expected_other = [3 * lora_alpha1 / rank1] * n_layers
        assert scalings_default == expected_default
        assert scalings_other == expected_other
@pytest.fixture
def fake_model_config(self):
    """Return a minimal stand-in for a transformers model config.

    The object is an (empty) ``dict`` subclass that carries ``vocab_size = 10`` as a regular attribute. Attribute
    lookups that fail the normal way fall back to the dict entries and raise ``AttributeError`` if there is no
    such key.
    """

    class FakeConfig(dict):
        def __init__(self):
            self.vocab_size = 10

        def __getattr__(self, item):
            # only reached when regular attribute lookup has already failed
            if item not in self:
                raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{item}'")
            return self[item]

    return FakeConfig()
@pytest.fixture
def new_model(self, fake_model_config):
    """Return a small model that mimics the new architecture of, for instance, Qwen/Qwen2-VL-2B-Instruct.

    Condensed layout of the real model that is being imitated:

        Qwen2VLForConditionalGeneration(
          (model): Qwen2VLModel(
            (visual): Qwen2VisionTransformerPretrainedModel(
              (patch_embed), (rotary_pos_emb),
              (blocks): ModuleList(32 x Qwen2VLVisionBlock(norm1, norm2, attn(qkv, proj), mlp(fc1, act, fc2))),
              (merger): PatchMerger(ln_q, mlp)
            )
            (language_model): Qwen2VLTextModel(
              (embed_tokens),
              (layers): ModuleList(28 x Qwen2VLDecoderLayer(self_attn(q/k/v/o_proj, rotary_emb), mlp, norms)),
              (norm), (rotary_emb)
            )
          )
          (lm_head): Linear(in_features=1536, out_features=151936, bias=False)
        )

    The imitation keeps only what matters for the state_dict keys: ``model.visual.blocks.{0,1}.attn`` and
    ``model.language_model.layers.{0,1}.attn`` nested below ``model``, plus a top-level ``lm_head``.
    """

    class Block(nn.Module):
        def __init__(self):
            super().__init__()
            self.attn = nn.Linear(10, 10)

    def two_blocks():
        # each sub-model consists of two blocks, every block holding one targetable "attn" layer
        return nn.ModuleList([Block() for _ in range(2)])

    class InnerModel(nn.Module):
        def __init__(self):
            super().__init__()
            # the vision tower is created before the language model
            self.visual = nn.ModuleDict({"blocks": two_blocks()})
            self.language_model = nn.ModuleDict({"layers": two_blocks()})

    class NewModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.config = fake_model_config
            self.device = "cpu"
            self.model = InnerModel()
            self.lm_head = nn.Linear(10, 10)
            # new transformers models have this attribute to map old checkpoints to new ones:
            self._checkpoint_conversion_mapping = {
                "^visual": "model.visual",
                "^model(?!\\.(language_model|visual))": "model.language_model",
            }

        def prepare_inputs_for_generation(self):
            # placeholder without any logic
            return None

    return NewModel()
def check_prefix_tuning_load_no_warning(self, model1, model2, path):
    """Save a prefix tuning adapter created on ``model1`` and load it onto ``model2``.

    Helper method: asserts that loading emits no warning about missing adapter keys and that the loaded prompt
    encoder embedding holds the values that were saved. Both models are deep-copied, the passed objects are not
    modified.

    Args:
        model1: base model used for creating and saving the adapter.
        model2: base model the saved adapter is loaded onto.
        path: directory the adapter is saved to and loaded from.
    """
    source_model = copy.deepcopy(model1)
    target_model = copy.deepcopy(model2)
    prefix_config = PrefixTuningConfig(
        task_type="CAUSAL_LM", num_virtual_tokens=5, num_layers=2, token_dim=10, num_attention_heads=2
    )
    saved_model = get_peft_model(copy.deepcopy(source_model), prefix_config)

    # set all values to 1.0 so we can check that they are loaded correctly
    saved_model.prompt_encoder.default.embedding.weight.data.fill_(1.0)

    saved_model.save_pretrained(path)
    del saved_model

    # ensure that there is no warning: UserWarning: Found missing adapter keys while loading the checkpoint
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        loaded = PeftModel.from_pretrained(copy.deepcopy(target_model), path)
        messages = [str(record.message) for record in caught]
        assert all("Found missing adapter keys" not in text for text in messages)

    # sanity check on parameter values to not only rely on the absence of warnings
    embedding = loaded.prompt_encoder.default.embedding.weight
    all_ones = torch.full_like(embedding, 1.0)
    assert torch.allclose(embedding, all_ones)
@pytest.mark.xfail(reason="Loading new checkpoints with old transformers is not supported.", strict=True)
def test_key_mapping_save_new_load_old_lora(self, old_model, new_model, tmp_path):
    """Save LoRA with the new architecture and load it into the old one (forwards compatibility).

    This direction is marked as a strict xfail: the check is expected to fail, and an unexpected pass is reported
    as a test failure.
    """
    saved_with, loaded_into = new_model, old_model
    self.check_lora_load_no_warning(model1=saved_with, model2=loaded_into, path=tmp_path)
def test_key_mapping_save_new_load_old_prefix_tuning(self, old_model, new_model, tmp_path):
    """Save prefix tuning with the new architecture and load it into the old one (forwards compatibility).

    This test carries no xfail marker, so this direction is required to load without a warning about missing
    adapter keys.
    """
    saved_with, loaded_into = new_model, old_model
    self.check_prefix_tuning_load_no_warning(model1=saved_with, model2=loaded_into, path=tmp_path)
+# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from torch import nn + +from peft.utils.integrations import init_empty_weights, skip_init_on_device + + +class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.relu = nn.ReLU() + self.drop = nn.Dropout(0.5) + self.lin1 = nn.Linear(20, 2, bias=bias) + + +def get_mlp(): + return MLP() + + +class TestInitEmptyWeights: + def test_init_empty_weights_works(self): + # this is a very rudimentary test, as init_empty_weights is copied almost 1:1 from accelerate and is tested + # there + with init_empty_weights(): + mlp = get_mlp() + + expected = torch.device("meta") + assert all(p.device == expected for p in mlp.parameters()) + + def test_skip_init_on_device_works(self): + # when a function is decorated with skip_init_on_device, the parameters are not moved to meta device, even when + # inside the context + decorated_fn = skip_init_on_device(get_mlp) + with init_empty_weights(): + mlp = decorated_fn() + + expected = torch.device("cpu") + assert all(p.device == expected for p in mlp.parameters()) + + def test_skip_init_on_device_works_outside_context(self): + # same as before, but ensure that skip_init_on_device does not break when no init_empty_weights context is used + decorated_fn = skip_init_on_device(get_mlp) + mlp = decorated_fn() + expected = torch.device("cpu") + assert all(p.device == 
expected for p in mlp.parameters()) + + def test_skip_init_on_device_not_permanent(self): + # ensure that after skip_init_on_device has been used, init_empty_weights reverts to its original functionality + + # with decorator => cpu + decorated_fn = skip_init_on_device(get_mlp) + with init_empty_weights(): + mlp = decorated_fn() + + expected = torch.device("cpu") + assert all(p.device == expected for p in mlp.parameters()) + + # without decorator => meta + with init_empty_weights(): + mlp = get_mlp() + + expected = torch.device("meta") + assert all(p.device == expected for p in mlp.parameters()) + + def test_skip_init_on_device_nested(self): + # ensure that skip_init_on_device works even if the decorated function is nested inside another decorated + # function + @skip_init_on_device + def outer_fn(): + @skip_init_on_device + def inner_fn(): + return get_mlp() + + mlp0 = inner_fn() + mlp1 = get_mlp() + return mlp0, mlp1 + + with init_empty_weights(): + mlp0, mlp1 = outer_fn() + + expected = torch.device("cpu") + assert all(p.device == expected for p in mlp0.parameters()) + assert all(p.device == expected for p in mlp1.parameters()) diff --git a/peft/tests/test_lora_megatron.py b/peft/tests/test_lora_megatron.py new file mode 100644 index 0000000000000000000000000000000000000000..ff91a41387d768d8741e2568b98ae405aff47778 --- /dev/null +++ b/peft/tests/test_lora_megatron.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 + +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import importlib +import os +import unittest + +import torch +import torch.nn.init as init + +from peft import LoraConfig, PeftModel, get_peft_model, get_peft_model_state_dict + +from .testing_utils import require_torch_gpu + + +def is_megatron_available() -> bool: + return importlib.util.find_spec("megatron") is not None + + +if is_megatron_available(): + from megatron.core import parallel_state, tensor_parallel + from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed + from megatron.core.transformer.module import MegatronModule + from megatron.core.transformer.transformer_config import TransformerConfig + + world_size = 1 + rank = 0 + + def initialize_distributed(): + print(f"Initializing torch.distributed with rank: {rank}, world_size: {world_size}") + torch.cuda.set_device(0) + init_method = "tcp://" + master_ip = os.getenv("MASTER_ADDR", "localhost") + master_port = os.getenv("MASTER_PORT", "6001") + init_method += master_ip + ":" + master_port + torch.distributed.init_process_group(backend="nccl", world_size=world_size, rank=rank, init_method=init_method) + + def destroy_model_parallel(): + parallel_state.destroy_model_parallel() + torch.distributed.barrier() + + def initialize_model_parallel( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + ): + parallel_state.destroy_model_parallel() + if not torch.distributed.is_initialized(): + initialize_distributed() + parallel_state.initialize_model_parallel( + tensor_model_parallel_size, + pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank, + ) + + class DummyModule(MegatronModule): + def __init__(self, config: TransformerConfig): + super().__init__(config) + self.linear = tensor_parallel.ColumnParallelLinear( + 
input_size=10, + output_size=10, + config=config, + init_method=init.xavier_normal_, + bias=False, + gather_output=False, + ) + self.lm_head = tensor_parallel.RowParallelLinear( + input_size=10, + output_size=10, + config=config, + init_method=init.xavier_normal_, + bias=False, + input_is_parallel=True, + skip_bias_add=True, + ) + + def forward(self, input): + x = self.linear(input)[0] + x = self.lm_head(x)[0] + return x + + @require_torch_gpu + class TestMegatronLora(unittest.TestCase): + def setUp(self): + initialize_model_parallel(1, 1) + model_parallel_cuda_manual_seed(123) + transformer_config = { + "num_layers": 2, + "hidden_size": 12, + "num_attention_heads": 4, + "use_cpu_initialization": True, + } + config = TransformerConfig(**transformer_config) + self.megatron_module = DummyModule(config=config).cuda() + self.dummy_module = copy.deepcopy(self.megatron_module).cuda() + + lora_config = LoraConfig( + lora_alpha=16, + lora_dropout=0.1, + r=64, + bias="none", + target_modules=["linear", "lm_head"], + megatron_config=config, + megatron_core="megatron.core", + ) + self.megatron_module = get_peft_model(self.megatron_module, lora_config) + + def tearDown(self): + destroy_model_parallel() + + def test_megatron_lora_module(self): + megatron_module = self.megatron_module + assert isinstance(megatron_module, PeftModel) + + for name, module in megatron_module.named_modules(): + if name.endswith("linear"): + assert hasattr(module, "lora_A") + assert hasattr(module, "lora_B") + if name.endswith("linear.lora_A.default"): + assert isinstance(module, torch.nn.Linear) + if name.endswith("linear.lora_B.default"): + assert isinstance(module, tensor_parallel.ColumnParallelLinear) + + if name.endswith("lm_head.lora_A.default"): + assert isinstance(module, tensor_parallel.RowParallelLinear) + if name.endswith("lm_head.lora_B.default"): + assert isinstance(module, torch.nn.Linear) + + def test_forward(self): + x = torch.ones((2, 4, 10)).cuda() + megatron_module_result = 
self.megatron_module(x) + dummt_module_result = self.dummy_module(x) + + # Because lora_B is initialized with 0, the forward results of two models should be equal before backward. + assert megatron_module_result.equal(dummt_module_result) + + def test_backward(self): + optimizer = torch.optim.AdamW(self.megatron_module.parameters()) + loss_fn = torch.nn.CrossEntropyLoss() + + x = torch.randn(2, 4, 10, requires_grad=True).cuda() + label = torch.randint(10, (2 * 4,)).cuda() + + output = self.megatron_module(x) + output = output.reshape(2 * 4, 10) + loss = loss_fn(output, label) + + loss.backward() + optimizer.step() + + def test_get_peft_model_state_dict(self): + peft_state_dict = get_peft_model_state_dict(self.megatron_module) + + for key in peft_state_dict.keys(): + assert "lora" in key diff --git a/peft/tests/test_lora_variants.py b/peft/tests/test_lora_variants.py new file mode 100644 index 0000000000000000000000000000000000000000..1c2a3c20a29264b83cea3fe22d9f630b6b190f2a --- /dev/null +++ b/peft/tests/test_lora_variants.py @@ -0,0 +1,267 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest +import torch +from torch import nn + +from peft import LoraConfig, get_peft_model +from peft.tuners.lora.layer import Conv1d as LoraConv1d +from peft.tuners.lora.layer import Conv2d as LoraConv2d +from peft.tuners.lora.layer import Embedding as LoraEmbedding +from peft.tuners.lora.layer import Linear as LoraLinear +from peft.tuners.lora.variants import ( + ALoraLinearVariant, + DoraConv1dVariant, + DoraConv2dVariant, + DoraEmbeddingVariant, + DoraLinearVariant, + calculate_alora_offsets, + get_alora_offsets_for_forward, + get_alora_offsets_for_generate, +) + + +# Custom model featuring embeddings and a 'visual stack' +class CustomModel(nn.Module): + """pytorch module that contains common targetable layers (linear, embedding, conv, ...)""" + + def __init__(self, num_embeddings=100, embedding_dim=16, num_classes=10): + super().__init__() + self.embedding = nn.Embedding(num_embeddings, embedding_dim) + self.conv1d = nn.Conv1d(in_channels=embedding_dim, out_channels=32, kernel_size=3, padding=1) + self.conv2d = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1) + self.flatten = nn.Flatten() + self.dummy_conv1d_output_dim = 32 * 10 + self.dummy_conv2d_output_dim = 16 * 10 * 10 + self.linear1 = nn.Linear(self.dummy_conv1d_output_dim + self.dummy_conv2d_output_dim, 64) + self.linear2 = nn.Linear(64, num_classes) + self.relu = nn.ReLU() + + def forward(self, input_ids, dummy_image_input): + # Path 1: Embedding -> Conv1d + x1 = self.embedding(input_ids) # (batch_size, seq_len, embedding_dim) + x1 = x1.transpose(1, 2) # (batch_size, embedding_dim, seq_len) + x1 = self.relu(self.conv1d(x1)) # (batch_size, 32, seq_len) + x1_flat = self.flatten(x1) + # Path 2: Conv2d -> Linear + x2 = self.relu(self.conv2d(dummy_image_input)) # (batch_size, 16, H, W) + x2_flat = self.flatten(x2) # (batch_size, 16*H*W) + # Combine or select paths if making a functional model. 
+ # For this test, we mainly care about layer types, so forward might not be fully executed. + # Let's use x2_flat for subsequent linear layers. + output = self.relu(self.linear1(torch.concat([x1_flat, x2_flat], dim=1))) + output = self.linear2(output) + return output + + +# Used for testing alora_offsets for aLoRA +class DummyLM(nn.Module): + def __init__(self, vocab_size: int = 10, hidden_dim: int = 8): + super().__init__() + self.embed = nn.Embedding(vocab_size, hidden_dim) + self.linear = nn.Linear(hidden_dim, vocab_size) + + def forward(self, X=None, embeds=None, num_beams=None, alora_offsets=None): + if X is not None: + embeds = self.embed(X) + return self.linear(embeds) + + +class MockTransformerWrapper: + """Mock class to behave like a transformers model. + + This is needed because the tests initialize the model by calling transformers_class.from_pretrained. + + """ + + @classmethod + def from_pretrained(cls): + # set the seed so that from_pretrained always returns the same model + torch.manual_seed(0) + + torch_dtype = torch.float32 + + return DummyLM().to(torch_dtype) + + +VARIANT_MAP = { + "dora": { + LoraLinear: DoraLinearVariant, + LoraEmbedding: DoraEmbeddingVariant, + LoraConv1d: DoraConv1dVariant, + LoraConv2d: DoraConv2dVariant, + }, + "alora": { + LoraLinear: ALoraLinearVariant, + }, +} + + +TEST_CASES = [ + ( + "dora", + LoraConfig, + {"target_modules": ["linear1", "linear2", "conv1d", "conv2d", "embedding"], "use_dora": True}, + ), + ( + "alora", + LoraConfig, + {"target_modules": ["linear1", "linear2"], "alora_invocation_tokens": [1]}, + ), +] + + +class TestLoraVariants: + @pytest.mark.parametrize("variant_name, config_cls, config_kwargs", TEST_CASES) + def test_variant_is_applied_to_layers(self, variant_name, config_cls, config_kwargs): + # This test assumes that targeting and replacing layers works and that after `get_peft_model` we + # have a model with LoRA layers. 
We just make sure that each LoRA layer has its variant set and + # it is also the correct variant for that layer. + base_model = CustomModel() + peft_config = config_cls(**config_kwargs) + peft_model = get_peft_model(base_model, peft_config) + + layer_type_map = VARIANT_MAP[variant_name] + + for _, module in peft_model.named_modules(): + if not hasattr(module, "lora_variant"): + continue + + # Note that not every variant supports every layer. If it is not mapped it is deemed unsupported and + # will not be tested. + expected_variant_type = layer_type_map.get(type(module), None) + if not expected_variant_type: + continue + + assert isinstance(module.lora_variant["default"], expected_variant_type) + + def custom_model_with_loss_backpropagated(self, peft_config): + """Returns the CustomModel + PEFT model instance with a dummy loss that was backpropagated once.""" + base_model = CustomModel() + peft_model = get_peft_model(base_model, peft_config) + + x, y = torch.ones(10, 10).long(), torch.ones(10, 1, 10, 10) + out = peft_model(x, y) + loss = out.sum() + loss.backward() + + return base_model, peft_model + + def test_dora_params_have_gradients(self): + """Ensure that the parameters added by the DoRA variant are participating in the output computation.""" + layer_names = ["linear1", "linear2", "conv1d", "conv2d", "embedding"] + peft_config = LoraConfig(target_modules=layer_names, use_dora=True) + base_model, peft_model = self.custom_model_with_loss_backpropagated(peft_config) + + for layer in layer_names: + assert getattr(peft_model.base_model.model, layer).lora_magnitude_vector["default"].weight.grad is not None + + +class TestActivatedLora: + @pytest.mark.parametrize( + "input_ids, alora_invocation_tokens, expected_offsets", + [ + ([[0, 1, 2, 3], [0, 4, 5, 6]], [1, 2], [3, None]), + ([[1, 2, 1, 2], [0, 4, 1, 2]], [1, 2], [2, 2]), + ([[1, 2, 3, 4], [0, 4, 1, 4]], [1, 2], [4, None]), + ([[1, 2, 3, 4]], None, [None]), + ], + ) + # Verify alora_offsets are calculated 
correctly + def test_calculate_alora_offsets(self, input_ids, alora_invocation_tokens, expected_offsets): + config = LoraConfig(alora_invocation_tokens=alora_invocation_tokens) + peft_config = {"default": config} + + # compute offsets + offsets = calculate_alora_offsets(peft_config, "default", torch.tensor(input_ids)) + + assert offsets == expected_offsets + + @pytest.mark.parametrize( + "input_ids, alora_invocations, expected_offsets", + [ + ([[0, 1, 1], [0, 2, 2]], {"a1": [1], "a2": [2]}, [1, 1]), + ([[0, 1, 1], [0, 2, 2]], {"a1": [1], "a2": None}, [1, None]), + ], + ) + # Verify alora_offsets are correct with adapter names + def test_calculate_alora_offsets_with_adapter_names(self, input_ids, alora_invocations, expected_offsets): + peft_config = {} + for alora_name in alora_invocations.keys(): + peft_config[alora_name] = LoraConfig(alora_invocation_tokens=alora_invocations[alora_name]) + + adapter_names = list(alora_invocations.keys()) + offsets = calculate_alora_offsets( + peft_config, adapter_names[0], torch.tensor(input_ids), adapter_names=adapter_names + ) + + assert offsets == expected_offsets + + # Verify that the adapter does not modify outputs prior to invocation point + def test_alora_activation_matches_base_until_invocation(self): + transformers_class = MockTransformerWrapper + base_model = transformers_class.from_pretrained() + cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2], init_lora_weights=False) + lora_model = get_peft_model(base_model, cfg) + lora_model.eval() + + input_ids = torch.tensor([[0, 1, 2, 3]]) + start = 2 + with lora_model.disable_adapter(): + with torch.no_grad(): + base_out = lora_model(X=input_ids) + + kwargs = get_alora_offsets_for_forward(lora_model, input_ids) + with torch.no_grad(): + lora_out = lora_model(X=input_ids, **kwargs) + assert torch.allclose(lora_out[:, :start], base_out[:, :start]) + assert not torch.allclose(lora_out[:, start:], base_out[:, start:]) + + # Verify that warning is given for 
alora when providing embeddings only + def test_input_embeds_warning(self): + transformers_class = MockTransformerWrapper + base_model = transformers_class.from_pretrained() + cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2], init_lora_weights=False) + lora_model = get_peft_model(base_model, cfg) + lora_model.eval() + + input_ids = torch.tensor([[0, 1, 2, 3]]) + input_embeds = base_model.embed(input_ids) + with pytest.warns( + UserWarning, + match="Cannot calculate aLoRA offsets when only inputs_embeds are provided. Disabling aLoRA for this forward pass.", + ): + kwargs = get_alora_offsets_for_forward(lora_model, inputs_embeds=input_embeds) + assert kwargs.get("alora_offsets") is None + with pytest.warns( + UserWarning, + match="Cannot calculate aLoRA offsets during generate as input_ids are not available. Disabling aLoRA.", + ): + kwargs = get_alora_offsets_for_generate(lora_model, inputs_embeds=input_embeds) + assert kwargs.get("alora_offsets") is None + + # Verify that error is raised when requesting num_beams > 1 for alora + def test_num_beams_error(self): + transformers_class = MockTransformerWrapper + base_model = transformers_class.from_pretrained() + cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2], init_lora_weights=False) + lora_model = get_peft_model(base_model, cfg) + lora_model.eval() + + input_ids = torch.tensor([[0, 1, 2, 3]]) + with pytest.raises(ValueError) as e: + with torch.no_grad(): + lora_out = lora_model(X=input_ids, num_beams=2, alora_offsets=[3]) + assert "Beam search not yet supported for aLoRA." in str(e.value) diff --git a/peft/tests/test_lorafa.py b/peft/tests/test_lorafa.py new file mode 100644 index 0000000000000000000000000000000000000000..3f480049920d6982d48091c342cae98688ba8b16 --- /dev/null +++ b/peft/tests/test_lorafa.py @@ -0,0 +1,152 @@ +# Copyright 2025-present the HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import math + +import torch +from torch import nn + +from peft import LoraConfig, get_peft_model +from peft.optimizers import create_lorafa_optimizer + +from .testing_utils import torch_device + + +class SimpleNet(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.embedding = nn.Embedding(100, 20) + self.layer_norm = nn.LayerNorm(20) + self.lin0 = nn.Linear(20, 20, bias=bias) + self.relu = nn.ReLU() + self.lin1 = nn.Linear(20, 16, bias=bias) + + def forward(self, X): + X = self.lin0(self.layer_norm(self.embedding(X))) + X = self.relu(X) + X = self.lin1(X) + return X + + +def test_lorafa_init_default(): + """ + Test if the optimizer is correctly created + """ + lora_rank = 16 + lora_alpha = 32 + lr = 7e-5 + + model = SimpleNet() + config = LoraConfig( + r=lora_rank, + lora_alpha=lora_alpha, + target_modules=["lin0", "lin1"], + bias="none", + ) + model = get_peft_model(model, config) + optimizer = create_lorafa_optimizer(model=model, r=lora_rank, lora_alpha=lora_alpha, lr=lr) + + assert math.isclose(optimizer.param_groups[0]["scaling_factor"], lora_alpha / lora_rank, rel_tol=1e-9, abs_tol=0.0) + + all_A_fixed = True + all_B_trainable = True + + assert optimizer is not None + + for name, param in model.named_parameters(): + if "lora_A" in name: + all_A_fixed &= not param.requires_grad + elif "lora_B" in name: + all_B_trainable &= param.requires_grad + + assert 
all_A_fixed and all_B_trainable + + +def test_lorafa_init_rslora(): + """ + Test if the optimizer is correctly created when use_rslora = True + """ + lora_rank = 16 + lora_alpha = 32 + lr = 7e-5 + + model = SimpleNet() + config = LoraConfig( + r=lora_rank, + lora_alpha=lora_alpha, + target_modules=["lin0", "lin1"], + bias="none", + ) + model = get_peft_model(model, config) + optimizer = create_lorafa_optimizer(model=model, r=lora_rank, lora_alpha=lora_alpha, lr=lr, use_rslora=True) + assert math.isclose( + optimizer.param_groups[0]["scaling_factor"], lora_alpha / math.sqrt(lora_rank), rel_tol=1e-9, abs_tol=0.0 + ) + + +def test_LoraFAOptimizer_step(): + """ + Test if the optimizer's step function runs without any exception and checks specific conditions on lora_A and + lora_B weights. + """ + lora_rank = 16 + lora_alpha = 32 + lr = 7e-5 + num_steps = 5 + + model = SimpleNet() + config = LoraConfig( + r=lora_rank, + lora_alpha=lora_alpha, + target_modules=["lin0", "lin1"], + bias="none", + ) + model = get_peft_model(model, config).to(torch_device) + optimizer = create_lorafa_optimizer(model=model, r=16, lora_alpha=32, lr=7e-5) + loss = torch.nn.CrossEntropyLoss() + + # Save initial weights of lora_A + initial_lora_A_weights = {name: param.clone() for name, param in model.named_parameters() if "lora_A" in name} + # Ensure lora_B is initialized to zero + for name, param in model.named_parameters(): + if "lora_B" in name: + assert torch.all(param == 0), f"lora_B weights not initialized to zero for {name}" + + for _ in range(num_steps): # Run the optimizer step multiple times + # Generate random input and label for each step + x = torch.randint(100, (2, 4, 10)).to(torch_device) + output = model(x).permute(0, 3, 1, 2) + label = torch.randint(16, (2, 4, 10)).to(torch_device) + + # Calculate loss and perform backward pass + loss_value = loss(output, label) + loss_value.backward() + + # Perform optimizer step + optimizer.step() + + # Zero the gradients after each step to 
prevent accumulation + optimizer.zero_grad() + + # Check if lora_A weights have not changed + for name, param in model.named_parameters(): + if "lora_A" in name: + assert torch.equal(param, initial_lora_A_weights[name]), f"lora_A weights changed for {name}" + + # Check if lora_B weights are non-zero + for name, param in model.named_parameters(): + if "lora_B" in name: + assert torch.any(param != 0), f"lora_B weights are still zero for {name}" diff --git a/peft/tests/test_loraplus.py b/peft/tests/test_loraplus.py new file mode 100644 index 0000000000000000000000000000000000000000..64bb8bc307e58482366b63063ae1ce9cd921fe1b --- /dev/null +++ b/peft/tests/test_loraplus.py @@ -0,0 +1,99 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import torch +from torch import nn + +from peft.import_utils import is_bnb_available +from peft.optimizers import create_loraplus_optimizer + +from .testing_utils import require_bitsandbytes, torch_device + + +if is_bnb_available(): + import bitsandbytes as bnb + + +class SimpleNet(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.embedding = nn.Embedding(100, 20) + self.layer_norm = nn.LayerNorm(20) + self.lin0 = nn.Linear(20, 20, bias=bias) + self.relu = nn.ReLU() + self.lin1 = nn.Linear(20, 16, bias=bias) + + def forward(self, X): + X = self.lin0(self.layer_norm(self.embedding(X))) + X = self.relu(X) + X = self.lin1(X) + return X + + +@require_bitsandbytes +def test_lora_plus_helper_sucess(): + model = SimpleNet() + optimizer_cls = bnb.optim.Adam8bit + lr = 5e-5 + optim_config = { + "eps": 1e-6, + "betas": (0.9, 0.999), + "loraplus_weight_decay": 0.0, + } + loraplus_lr_ratio = 1.2 + loraplus_lr_embedding = 1e-6 + optim = create_loraplus_optimizer( + model=model, + optimizer_cls=optimizer_cls, + lr=lr, + loraplus_lr_ratio=loraplus_lr_ratio, + loraplus_lr_embedding=loraplus_lr_embedding, + **optim_config, + ) + assert optim is not None + assert len(optim.param_groups) == 4 + assert optim.param_groups[0]["lr"] == lr + assert optim.param_groups[1]["lr"] == loraplus_lr_embedding + assert optim.param_groups[2]["lr"] == optim.param_groups[3]["lr"] == (lr * loraplus_lr_ratio) + + +@require_bitsandbytes +def test_lora_plus_optimizer_sucess(): + """ + Test if the optimizer is correctly created and step function runs without any exception + """ + optimizer_cls = bnb.optim.Adam8bit + optim_config = { + "eps": 1e-6, + "betas": (0.9, 0.999), + "loraplus_weight_decay": 0.0, + } + model: SimpleNet = SimpleNet().to(torch_device) + optim = create_loraplus_optimizer( + model=model, + optimizer_cls=optimizer_cls, + lr=5e-5, + loraplus_lr_ratio=1.2, + loraplus_lr_embedding=1e-6, + **optim_config, + ) + loss = 
torch.nn.CrossEntropyLoss() + bnb.optim.GlobalOptimManager.get_instance().register_parameters(model.parameters()) + x = torch.randint(100, (2, 4, 10)).to(torch_device) + output = model(x).permute(0, 3, 1, 2) + label = torch.randint(16, (2, 4, 10)).to(torch_device) + loss_value = loss(output, label) + loss_value.backward() + optim.step() diff --git a/peft/tests/test_low_level_api.py b/peft/tests/test_low_level_api.py new file mode 100644 index 0000000000000000000000000000000000000000..0a097e2dba4928fee6f047afd4c3992832c8abf5 --- /dev/null +++ b/peft/tests/test_low_level_api.py @@ -0,0 +1,623 @@ +#!/usr/bin/env python3 + +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy +import re + +import pytest +import torch +from diffusers import StableDiffusionPipeline +from torch import nn +from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification + +from peft import ( + AdaLoraConfig, + IA3Config, + LoKrConfig, + LoraConfig, + RandLoraConfig, + get_peft_model, + get_peft_model_state_dict, + inject_adapter_in_model, + set_peft_model_state_dict, +) +from peft.tuners import lora +from peft.utils import ModulesToSaveWrapper + +from .testing_utils import hub_online_once + + +class DummyModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.embedding = torch.nn.Embedding(10, 10) + self.linear = torch.nn.Linear(10, 10) + self.linear2 = torch.nn.Linear(10, 10, bias=True) + self.lm_head = torch.nn.Linear(10, 10) + + def forward(self, input_ids): + x = self.embedding(input_ids) + x = self.linear(x) + x = self.lm_head(x) + return x + + +class TestLowLevelFunctional: + # Some simple tests for the low level API + @pytest.fixture + def model(self): + model = DummyModel() + + lora_config = LoraConfig( + lora_alpha=16, + lora_dropout=0.1, + r=64, + bias="none", + target_modules=["linear"], + ) + + return inject_adapter_in_model(lora_config, model) + + def test_inject_adapter_in_model(self, model): + dummy_inputs = torch.LongTensor([[0, 1, 2, 3, 4, 5, 6, 7]]) + _ = model(dummy_inputs) + + for name, module in model.named_modules(): + if name == "linear": + assert hasattr(module, "lora_A") + assert hasattr(module, "lora_B") + + def test_get_peft_model_state_dict(self, model): + peft_state_dict = get_peft_model_state_dict(model) + + for key in peft_state_dict.keys(): + assert "lora" in key + + def test_modules_to_save(self): + model = DummyModel() + + lora_config = LoraConfig( + lora_alpha=16, + lora_dropout=0.1, + r=64, + bias="none", + target_modules=["linear"], + modules_to_save=["embedding", "linear2"], + ) + + model = inject_adapter_in_model(lora_config, model) + + 
for name, module in model.named_modules(): + if name == "linear": + assert hasattr(module, "lora_A") + assert hasattr(module, "lora_B") + elif name in ["embedding", "linear2"]: + assert isinstance(module, ModulesToSaveWrapper) + + state_dict = get_peft_model_state_dict(model) + + assert "embedding.weight" in state_dict.keys() + + assert hasattr(model.embedding, "weight") + + assert hasattr(model.linear2, "weight") + assert hasattr(model.linear2, "bias") + + +class TestInjectAdapterFromStateDict: + # The inject_adapter_in_model function can determine the target modules based on the LoraConfig (default) or based + # on a state_dict (or rather, the state_dict keys). Here we test that the latter works as expected. + + # We test a subset of model classes and PEFT configs, testing everything would be excessive + @pytest.mark.parametrize( + "model_cls_and_id", + [ + (AutoModelForCausalLM, "trl-internal-testing/tiny-random-LlamaForCausalLM"), + (AutoModel, "hf-internal-testing/tiny-random-BertModel"), + (AutoModelForSeq2SeqLM, "hf-internal-testing/tiny-random-BartForConditionalGeneration"), + (AutoModelForSequenceClassification, "hf-internal-testing/tiny-random-RobertaForSequenceClassification"), + ], + ids=["Llama", "Bert", "Bart", "Roberta"], + ) + @pytest.mark.parametrize( + "config", + [ + AdaLoraConfig(total_step=5), + IA3Config(), + LoKrConfig(), + LoraConfig(), + RandLoraConfig(), + ], + ids=["AdaLoRA", "IA3", "LoKr", "LoRA", "RandLoRA"], + ) + def test_inject_from_state_dict_and_from_config_target_same_layers(self, model_cls_and_id, config, recwarn): + model_cls, model_id = model_cls_and_id + config = copy.deepcopy(config) # since PEFT may mutate it + + with hub_online_once(model_id): + # use config for injection + model = model_cls.from_pretrained(model_id) + model = inject_adapter_in_model(config, model) + sd_before = get_peft_model_state_dict(model) + del model + + model = model_cls.from_pretrained(model_id) + # get other warnings, if any, out of the way + 
recwarn.clear() + # assure that this doesn't cause any warnings + model = inject_adapter_in_model(config, model, state_dict=sd_before) + assert not recwarn.list + + sd_after = get_peft_model_state_dict(model) + + # We exepct the same keys and the same shapes of the weights. Don't check the values: injection is only + # about creating the PEFT adapter, not about loading the actual weights + assert len(sd_before) > 0 + assert sd_before.keys() == sd_after.keys() + for key in sd_before.keys(): + assert sd_before[key].shape == sd_after[key].shape + + def test_inject_from_state_dict_transformers(self): + model_id = "facebook/opt-125m" + config = LoraConfig() + + with hub_online_once(model_id): + model = AutoModelForCausalLM.from_pretrained(model_id) + model.add_adapter(config) + sd_before = get_peft_model_state_dict(model) + del model + + model = AutoModelForCausalLM.from_pretrained(model_id) + model = inject_adapter_in_model(config, model, state_dict=sd_before) + sd_after = get_peft_model_state_dict(model) + + # We exepct the same keys and the same shapes of the weights. Don't check the values: injection is only + # about creating the PEFT adapter, not about loading the actual weights + assert len(sd_before) > 0 + assert sd_before.keys() == sd_after.keys() + for key in sd_before.keys(): + assert sd_before[key].shape == sd_after[key].shape + + def test_inject_from_state_dict_transformers_irregular_targets(self): + # ensure that this works even if an "irregular" pattern is used, i.e. 
only targeting some modules on some layers + model_id = "facebook/opt-125m" + config = LoraConfig( + target_modules=r".*\.[0-5]\.self_attn\.v_proj|.*\.[4-7]\.self_attn\.k_proj", + ) + + with hub_online_once(model_id): + model = AutoModelForCausalLM.from_pretrained(model_id) + model.add_adapter(config) + sd_before = get_peft_model_state_dict(model) + del model + + model = AutoModelForCausalLM.from_pretrained(model_id) + model = inject_adapter_in_model(config, model, state_dict=sd_before) + sd_after = get_peft_model_state_dict(model) + + # We exepct the same keys and the same shapes of the weights. Don't check the values: injection is only + # about creating the PEFT adapter, not about loading the actual weights + assert len(sd_before) > 0 + assert sd_before.keys() == sd_after.keys() + for key in sd_before.keys(): + assert sd_before[key].shape == sd_after[key].shape + + def test_inject_from_state_dict_transformers_target_parameters_raises(self): + # Injecting from state_dict does not correctly identify target_parameters. This is because, just from looking at + # the state_dict, we cannot tell if the user intends to use target_modules or target_parameters. Currently, we + # just assume the former, thus applying normal lora.Linear etc. layers instead of lora.ParamWrapper. When we + # detect that the user tries to do this, we raise an error. 
+ model_id = "facebook/opt-125m" + config = LoraConfig(target_modules=[], target_parameters=["q_proj.weight", "v_proj.weight"]) + + with hub_online_once(model_id): + model = AutoModelForCausalLM.from_pretrained(model_id) + model.add_adapter(config) + sd = get_peft_model_state_dict(model) + del model + + model = AutoModelForCausalLM.from_pretrained(model_id) + msg = "Trying to inject a PEFT adapter from a state_dict but the PEFT config uses `target_parameters`" + with pytest.raises(ValueError, match=msg): + inject_adapter_in_model(config, model, state_dict=sd) + + @pytest.mark.xfail( + reason="Loading from state_dict with target_parameters fails", raises=AssertionError, strict=True + ) + def test_inject_from_state_dict_transformers_target_parameters_fails(self): + # Injecting from state_dict does not correctly identify target_parameters. This is because, just from looking at + # the state_dict, we cannot tell if the user intends to use target_modules or target_parameters. Currently, we + # just assume the former, thus applying normal lora.Linear etc. layers instead of lora.ParamWrapper. When we + # don't detect that the user tries to do this, there is nothing that can be done. 
+ model_id = "facebook/opt-125m" + config = LoraConfig(target_modules=[], target_parameters=["q_proj.weight", "v_proj.weight"]) + + with hub_online_once(model_id): + model = AutoModelForCausalLM.from_pretrained(model_id) + model.add_adapter(config) + # sanity check: + for name, module in model.named_modules(): + if name.endswith((".q_proj", ".v_proj")): + assert isinstance(module, lora.ParamWrapper) + + sd_before = get_peft_model_state_dict(model) + del model + + model = AutoModelForCausalLM.from_pretrained(model_id) + config = LoraConfig() # no target_parameters defined, we cannot know the original intent + model = inject_adapter_in_model(config, model, state_dict=sd_before) + sd_after = get_peft_model_state_dict(model) + + # this fails, we get lora.Linear instances + for name, module in model.named_modules(): + if name.endswith((".q_proj", ".v_proj")): + assert isinstance(module, lora.ParamWrapper) + + def test_inject_from_state_dict_stable_diffusion(self): + # same test as above, but with stable diffusion model and only testing LoRA + model_id = "hf-internal-testing/tiny-sd-pipe" + config_text_encoder = LoraConfig(target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"]) + config_unet = LoraConfig( + target_modules=[ + "proj_in", + "proj_out", + "to_k", + "to_q", + "to_v", + "to_out.0", + "ff.net.0.proj", + "ff.net.2", + ] + ) + with hub_online_once(model_id): + pipe = StableDiffusionPipeline.from_pretrained(model_id) + pipe.text_encoder.add_adapter(config_text_encoder) + pipe.unet.add_adapter(config_unet) + + sd_te_before = get_peft_model_state_dict(pipe.text_encoder) + sd_unet_before = get_peft_model_state_dict(pipe.unet) + del pipe + + pipe = StableDiffusionPipeline.from_pretrained(model_id) + inject_adapter_in_model(config_text_encoder, pipe.text_encoder, state_dict=sd_te_before) + inject_adapter_in_model(config_unet, pipe.unet, state_dict=sd_unet_before) + + sd_te_after = get_peft_model_state_dict(pipe.text_encoder) + sd_unet_after = 
get_peft_model_state_dict(pipe.unet) + + # We exepct the same keys and the same shapes of the weights. Don't check the values: injection is only + # about creating the PEFT adapter, not about loading the actual weights + assert len(sd_te_before) > 0 + assert sd_te_before.keys() == sd_te_after.keys() + for key in sd_te_before.keys(): + assert sd_te_before[key].shape == sd_te_after[key].shape + + assert len(sd_unet_before) > 0 + assert sd_unet_before.keys() == sd_unet_after.keys() + for key in sd_unet_before.keys(): + assert sd_unet_before[key].shape == sd_unet_after[key].shape + + def test_inject_from_state_dict_low_cpu_mem_usage(self): + model_id = "facebook/opt-125m" + config = LoraConfig() + + with hub_online_once(model_id): + # use config for injection + model = AutoModelForCausalLM.from_pretrained(model_id) + model = inject_adapter_in_model(config, model) + sd_before = get_peft_model_state_dict(model) + del model + + model = AutoModelForCausalLM.from_pretrained(model_id) + model = inject_adapter_in_model(config, model, state_dict=sd_before, low_cpu_mem_usage=True) + # all PEFT parameters should be on meta device + assert {p.device.type for p in get_peft_model_state_dict(model).values()} == {"meta"} + + def test_inject_from_state_dict_missing_keys_warning(self): + # check that if the PEFT config specifies **more** taget modules than the state_dict, we get a warning for that + model_id = "facebook/opt-125m" + config = LoraConfig() + + with hub_online_once(model_id): + # use config for injection + model = AutoModelForCausalLM.from_pretrained(model_id) + model = inject_adapter_in_model(config, model) + sd_before = get_peft_model_state_dict(model) + del model + + # delete a keys for one module from state_dict + del sd_before["model.decoder.layers.5.self_attn.q_proj.lora_A.weight"] + del sd_before["model.decoder.layers.5.self_attn.q_proj.lora_B.weight"] + + model = AutoModelForCausalLM.from_pretrained(model_id) + msg = re.escape( + "While injecting the PEFT adapters, 
an inconsistency was discovered between the PEFT config and " + "the provided state_dict. This is not necessarily an issue and can be ignored if this was the " + "intent. The PEFT config contained these additional target modules: " + "['model.decoder.layers.5.self_attn.q_proj']. " + ) + + with pytest.warns(RuntimeWarning, match=msg): + model = inject_adapter_in_model(config, model, state_dict=sd_before, low_cpu_mem_usage=True) + + # besides the warning, the rest of the injection should work + sd_after = get_peft_model_state_dict(model) + assert len(sd_before) > 0 + assert sd_before.keys() == sd_after.keys() + for key in sd_before.keys(): + assert sd_before[key].shape == sd_after[key].shape + + def test_inject_from_state_dict_extra_keys_warning(self): + # check that if the PEFT config specifies **fewer** target modules than the state_dict, we get a warning for that + model_id = "facebook/opt-125m" + config = LoraConfig() + + with hub_online_once(model_id): + # use config for injection + model = AutoModelForCausalLM.from_pretrained(model_id) + model = inject_adapter_in_model(config, model) + sd_before = get_peft_model_state_dict(model) + del model + + # remove q_proj of layer 5 from the PEFT config + config.exclude_modules = ["model.decoder.layers.5.self_attn.q_proj"] + + model = AutoModelForCausalLM.from_pretrained(model_id) + msg = re.escape( + "While injecting the PEFT adapters, an inconsistency was discovered between the PEFT config and " + "the provided state_dict. This is not necessarily an issue and can be ignored if this was the " + "intent. The state_dict contained these additional target modules: " + "['model.decoder.layers.5.self_attn.q_proj']. 
" + ) + + with pytest.warns(RuntimeWarning, match=msg): + model = inject_adapter_in_model(config, model, state_dict=sd_before, low_cpu_mem_usage=True) + + # besides the warning, the rest of the injection should work + sd_after = get_peft_model_state_dict(model) + assert len(sd_before) > 0 + assert sd_before.keys() == sd_after.keys() + for key in sd_before.keys(): + assert sd_before[key].shape == sd_after[key].shape + + +class TestPeftStateDict: + # Test some edge cases around getting and setting the PEFT state_dict. There are potential sources of errors there + # because the adapter_name is removed from/added to the state_dict keys. + def test_get_peft_model_state_dict_removes_adapter_name(self): + # ensure that the adapter name, "default", is removed from the state_dict + model_id = "hf-internal-testing/tiny-random-OPTForCausalLM" + with hub_online_once(model_id): + model = AutoModelForCausalLM.from_pretrained(model_id) + + # note: lora targets q_proj and v_proj; add in an auxiliary module for good measure + model = get_peft_model(model, LoraConfig(modules_to_save=["lm_head"])) + sd = get_peft_model_state_dict(model) + assert len(sd) > 1 # sanity check + assert not any("default" in key for key in sd) + + def test_get_peft_model_state_dict_removes_non_defaul_adapter_name(self): + # ensure that the adapter name is removed from the state_dict, even if it's not "default" + model_id = "hf-internal-testing/tiny-random-OPTForCausalLM" + with hub_online_once(model_id): + model = AutoModelForCausalLM.from_pretrained(model_id) + + model = get_peft_model(model, LoraConfig(modules_to_save=["lm_head"]), adapter_name="other") + sd = get_peft_model_state_dict(model, adapter_name="other") + assert len(sd) > 1 # sanity check + assert not any("other" in key for key in sd) + + def test_get_peft_model_state_dict_removes_adapter_name_when_same_as_module_name(self): + # here the adapter is named "v_proj", which is the same name as some modules targeted with lora in the model, + # which 
is nefarious + model_id = "hf-internal-testing/tiny-random-OPTForCausalLM" + with hub_online_once(model_id): + model = AutoModelForCausalLM.from_pretrained(model_id) + + config = LoraConfig(modules_to_save=["lm_head"], target_modules=["v_proj"]) + model = get_peft_model(model, config, adapter_name="v_proj") + sd = get_peft_model_state_dict(model, adapter_name="v_proj") + assert len(sd) > 1 # sanity check + for key in sd: + # assert that the adapter_name was indeed removed + assert not key.endswith("lora_A.v_proj.weight") + assert not key.endswith("lora_B.v_proj.weight") + assert not key.endswith("modules_to_save.v_proj.weight") + # assert that the module name was not stripped completely from the key + assert ("v_proj" in key) or ("q_proj" in key) or ("lm_head") in key + + def check_peft_model_weights_loaded_correctly(self, inner_model_cls, config, nested, adapter_name="default"): + # Runs checks that a roundtrip of get_peft_model_state_dict and set_peft_model_state_dict results in the same + # model (same outputs and same weights). 
+ class Outer(nn.Module): + def __init__(self): + super().__init__() + self.inner = inner_model_cls() + + def forward(self, x): + return self.inner(x) + + if nested: + # add another layer of nesting + model_cls = Outer + else: + model_cls = inner_model_cls + + x = torch.randn(1, 5) + + torch.manual_seed(0) + base_model = model_cls() + with torch.inference_mode(): + base_out = base_model(x) + + torch.manual_seed(42) + model = get_peft_model(base_model, config, adapter_name=adapter_name) + with torch.inference_mode(): + peft_out = model(x) + # sanity check: peft adapter has an effect + assert not torch.allclose(base_out, peft_out, atol=1e-6) + + sd = get_peft_model_state_dict(model, adapter_name=adapter_name) + + torch.manual_seed(0) + base_model = model_cls() + torch.manual_seed(42 + 1) # ensure we start with a different, randomly initialized PEFT model + model_new = get_peft_model(base_model, config, adapter_name=adapter_name) + with torch.inference_mode(): + peft_new = model_new(x) + assert not torch.allclose(peft_out, peft_new, atol=1e-6) + + set_peft_model_state_dict(model_new, sd, adapter_name=adapter_name) + with torch.inference_mode(): + peft_out_loaded = model_new(x) + assert torch.allclose(peft_out, peft_out_loaded, atol=1e-6) + + sd_new = get_peft_model_state_dict(model_new, adapter_name=adapter_name) # state_dict of the reloaded model + assert sd.keys() == sd_new.keys() + for key, val in sd.items(): + val_new = sd_new[key] + assert torch.allclose(val, val_new) + + @pytest.mark.parametrize("nested", [False, True]) + def test_get_and_set_peft_model_state_dict_normal_names(self, nested): + # In this test, there is no edge case. Therefore, this test is basically the "control group" for the subsequent + # tests (if this test were to fail, it means the testing code itself is wrong). 
+ class MyModel(nn.Module): + def __init__(self): + super().__init__() + self.foo_linear = nn.Linear(5, 5) + self.foo_baz = nn.Linear(5, 5) + self.baz_foo = nn.Linear(5, 5) + self.foo_baz_foo = nn.Linear(5, 5) + self.baz_foo_baz = nn.Linear(5, 5) + + def forward(self, x): + x = self.foo_linear(x) + x = self.foo_baz(x) + x = self.baz_foo(x) + x = self.foo_baz_foo(x) + x = self.baz_foo_baz(x) + return x + + config = LoraConfig( + target_modules=["foo_linear", "foo_baz", "baz_foo", "foo_baz_foo", "baz_foo_baz"], init_lora_weights=False + ) + self.check_peft_model_weights_loaded_correctly(MyModel, config, nested=nested) + + @pytest.mark.parametrize("nested", [False, True]) + def test_get_and_set_peft_model_state_dict_peft_prefix_in_module_name(self, nested): + # Here we have a model with some modules containing "lora" in their name. + class MyModel(nn.Module): + def __init__(self): + super().__init__() + self.foo_linear = nn.Linear(5, 5) + self.foo_lora = nn.Linear(5, 5) + self.lora_foo = nn.Linear(5, 5) + self.foo_lora_foo = nn.Linear(5, 5) + self.lora_foo_lora = nn.Linear(5, 5) + + def forward(self, x): + x = self.foo_linear(x) + x = self.foo_lora(x) + x = self.lora_foo(x) + x = self.foo_lora_foo(x) + x = self.lora_foo_lora(x) + return x + + config = LoraConfig( + target_modules=["foo_linear", "foo_lora", "lora_foo", "foo_lora_foo", "lora_foo_lora"], + init_lora_weights=False, + ) + self.check_peft_model_weights_loaded_correctly(MyModel, config, nested=nested) + + @pytest.mark.parametrize("nested", [False, True]) + def test_get_and_set_peft_model_state_dict_weight_in_module_name(self, nested): + # Here we have a model with some modules containing "weight" in their name. 
+ # See #2772 + class MyModel(nn.Module): + def __init__(self): + super().__init__() + self.foo_linear = nn.Linear(5, 5) + self.foo_weight = nn.Linear(5, 5) + self.weight_foo = nn.Linear(5, 5) + self.foo_weight_foo = nn.Linear(5, 5) + self.weight_foo_weight = nn.Linear(5, 5) + + def forward(self, x): + x = self.foo_linear(x) + x = self.foo_weight(x) + x = self.weight_foo(x) + x = self.foo_weight_foo(x) + x = self.weight_foo_weight(x) + return x + + config = LoraConfig( + target_modules=["foo_linear", "foo_weight", "weight_foo", "foo_weight_foo", "weight_foo_weight"], + init_lora_weights=False, + ) + self.check_peft_model_weights_loaded_correctly(MyModel, config, nested=nested) + + @pytest.mark.parametrize("nested", [False, True]) + def test_get_and_set_peft_model_state_dict_bias_in_module_name(self, nested): + # Here we have a model with some modules containing "bias" in their name. + class MyModel(nn.Module): + def __init__(self): + super().__init__() + self.foo_linear = nn.Linear(5, 5) + self.foo_bias = nn.Linear(5, 5) + self.bias_foo = nn.Linear(5, 5) + self.foo_bias_foo = nn.Linear(5, 5) + self.bias_foo_bias = nn.Linear(5, 5) + + def forward(self, x): + x = self.foo_linear(x) + x = self.foo_bias(x) + x = self.bias_foo(x) + x = self.foo_bias_foo(x) + x = self.bias_foo_bias(x) + return x + + config = LoraConfig( + target_modules=["foo_linear", "foo_bias", "bias_foo", "foo_bias_foo", "bias_foo_bias"], + init_lora_weights=False, + bias="lora_only", + ) + self.check_peft_model_weights_loaded_correctly(MyModel, config, nested=nested) + + @pytest.mark.parametrize("nested", [False, True]) + def test_get_and_set_peft_model_state_dict_adapter_name_same_as_module_name(self, nested): + # Here we choose a module name that is identical to the name of one of the adapters. 
+ class MyModel(nn.Module): + def __init__(self): + super().__init__() + self.foo = nn.Linear(5, 5) + self.foo_baz = nn.Linear(5, 5) + self.baz_foo = nn.Linear(5, 5) + self.foo_baz_foo = nn.Linear(5, 5) + self.baz_foo_baz = nn.Linear(5, 5) + + def forward(self, x): + x = self.foo(x) + x = self.foo_baz(x) + x = self.baz_foo(x) + x = self.foo_baz_foo(x) + x = self.baz_foo_baz(x) + return x + + config = LoraConfig( + target_modules=["foo", "foo_baz", "baz_foo", "foo_baz_foo", "baz_foo_baz"], init_lora_weights=False + ) + self.check_peft_model_weights_loaded_correctly(MyModel, config, nested=nested, adapter_name="foo") diff --git a/peft/tests/test_mapping.py b/peft/tests/test_mapping.py new file mode 100644 index 0000000000000000000000000000000000000000..6e204b951058ca8df79722211db19d18b9d8a233 --- /dev/null +++ b/peft/tests/test_mapping.py @@ -0,0 +1,55 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import pytest +import torch + +from peft import LoraConfig, get_peft_model + + +class TestGetPeftModel: + RELOAD_WARNING_EXPECTED_MATCH = r"You are trying to modify a model .*" + + @pytest.fixture + def lora_config_0(self): + return LoraConfig(target_modules="0") + + @pytest.fixture + def base_model(self): + return torch.nn.Sequential(torch.nn.Linear(10, 2), torch.nn.Linear(2, 10)) + + def test_get_peft_model_warns_when_reloading_model(self, lora_config_0, base_model): + get_peft_model(base_model, lora_config_0) + + with pytest.warns(UserWarning, match=self.RELOAD_WARNING_EXPECTED_MATCH): + get_peft_model(base_model, lora_config_0) + + def test_get_peft_model_proposed_fix_in_warning_helps(self, lora_config_0, base_model, recwarn): + peft_model = get_peft_model(base_model, lora_config_0) + peft_model.unload() + get_peft_model(base_model, lora_config_0) + + warning_checker = pytest.warns(UserWarning, match=self.RELOAD_WARNING_EXPECTED_MATCH) + + for warning in recwarn: + if warning_checker.matches(warning): + pytest.fail("Warning raised even though model was unloaded.") + + def test_get_peft_model_repeated_invocation(self, lora_config_0, base_model): + peft_model = get_peft_model(base_model, lora_config_0) + + # use direct-addressing of the other layer to accommodate the nested model + lora_config_1 = LoraConfig(target_modules="base_model.model.1") + + with pytest.warns(UserWarning, match=self.RELOAD_WARNING_EXPECTED_MATCH): + get_peft_model(peft_model, lora_config_1) diff --git a/peft/tests/test_mixed.py b/peft/tests/test_mixed.py new file mode 100644 index 0000000000000000000000000000000000000000..7ec18387c817e148f6346cf5f57a9a993b2d96cf --- /dev/null +++ b/peft/tests/test_mixed.py @@ -0,0 +1,791 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import itertools +import os +import platform +import re +import tempfile +import unittest + +import pytest +import torch +from parameterized import parameterized +from torch import nn +from transformers import AutoModelForCausalLM + +from peft import ( + AdaLoraConfig, + LoHaConfig, + LoKrConfig, + LoraConfig, + PeftMixedModel, + PrefixTuningConfig, + get_peft_model, +) +from peft.tuners.tuners_utils import BaseTunerLayer +from peft.utils import infer_device + + +class SimpleNet(nn.Module): + def __init__(self, bias=True): + super().__init__() + # note: out_features must be > rank or else OFT will be an identity transform + self.lin0 = nn.Linear(10, 20, bias=bias) + self.relu = nn.ReLU() + self.lin1 = nn.Linear(20, 16, bias=bias) + + def forward(self, X): + X = X.float() + X = self.lin0(X) + X = self.relu(X) + X = self.lin1(X) + return X + + +def _param_name_func(testcase_func, param_num, params): + # for parameterized tests in TestMixedAdapterTypes + config0, config1 = params[0] + name0 = config0.__class__.__name__[: -len("Config")] + name1 = config1.__class__.__name__[: -len("Config")] + if name0 != name1: + return f"{testcase_func.__name__}_{param_num}_{name0}_{name1}" + return f"{testcase_func.__name__}_{param_num}_{name0}_x2" + + +class TestMixedAdapterTypes(unittest.TestCase): + torch_device = infer_device() + + def _get_model(self, model_cls, peft_config=None, adapter_name=None, seed=0, mixed=True): + torch.manual_seed(0) # always use seed 0 for base model, seed for adapters may differ + base_model = 
model_cls().eval().to(self.torch_device) + if peft_config is None: + return base_model + + torch.manual_seed(seed) + assert adapter_name is not None + peft_model = get_peft_model(base_model, peft_config, adapter_name=adapter_name, mixed=mixed) + return peft_model.eval().to(self.torch_device) + + def _check_mixed_outputs(self, model_cls, config0, config1, input, *, is_commutative): + # This test checks different combinations of adapter0, adapter1, or combinations of the two, and whether + # outputs are the same/different, depending on context. If we pass is_commutative=True, it means that the order + # of adapters does not matter, and we expect the same output regardless of the order in which adapters are + # applied. + # We have to very careful with resetting the random seed each time it is used, otherwise the adapters may be + # initialized with different values, and the test will fail. + + atol = 1e-5 + rtol = 1e-5 + seed0 = 0 + seed1 = 1 + + # base model + base_model = self._get_model(model_cls) + output_base = base_model(input) + assert torch.isfinite(output_base).all() + + # adapter 0 + peft_model_0 = self._get_model(model_cls, config0, "adapter0", seed=seed0) + output_config0 = peft_model_0(input) + + assert torch.isfinite(output_config0).all() + assert not torch.allclose(output_base, output_config0, atol=atol, rtol=rtol) + + # adapter 1 + peft_model_1 = self._get_model(model_cls, config1, "adapter1", seed=seed1) + output_config1 = peft_model_1(input) + + assert torch.isfinite(output_config1).all() + assert not torch.allclose(output_base, output_config1, atol=atol, rtol=rtol) + assert not torch.allclose(output_config0, output_config1, atol=atol, rtol=rtol) + + # adapter 0 + 1 + peft_model_01 = self._get_model(model_cls, config0, "adapter0", seed=seed0) + torch.manual_seed(seed1) + peft_model_01.add_adapter("adapter1", config1) + peft_model_01.set_adapter(["adapter0", "adapter1"]) + output_mixed_01 = peft_model_01(input) + + # check the number of tuner layer 
types + tuner_layers = [mod for mod in peft_model_01.modules() if isinstance(mod, BaseTunerLayer)] + tuner_types = {type(tuner_layer) for tuner_layer in tuner_layers} + if type(config0) is type(config1): + assert len(tuner_types) == 1 + else: + assert len(tuner_types) == 2 + + assert peft_model_01.active_adapters == ["adapter0", "adapter1"] + assert torch.isfinite(output_mixed_01).all() + assert not torch.allclose(output_config0, output_mixed_01, atol=atol, rtol=rtol) + assert not torch.allclose(output_config1, output_mixed_01, atol=atol, rtol=rtol) + if is_commutative: + delta0 = output_config0 - output_base + delta1 = output_config1 - output_base + delta_mixed_01 = output_mixed_01 - output_base + assert torch.allclose((delta0 + delta1), delta_mixed_01, atol=atol, rtol=rtol) + + # adapter 1 + 0 + peft_model_10 = self._get_model(model_cls, config1, "adapter1", seed=seed1) + torch.manual_seed(seed0) + peft_model_10.add_adapter("adapter0", config0) + peft_model_10.set_adapter(["adapter1", "adapter0"]) + output_mixed_10 = peft_model_10(input) + + # check the number of tuner layer types + tuner_layers = [mod for mod in peft_model_10.modules() if isinstance(mod, BaseTunerLayer)] + tuner_types = {type(tuner_layer) for tuner_layer in tuner_layers} + if type(config0) is type(config1): + assert len(tuner_types) == 1 + else: + assert len(tuner_types) == 2 + + assert peft_model_10.active_adapters == ["adapter1", "adapter0"] + assert torch.isfinite(output_mixed_10).all() + assert not torch.allclose(output_config0, output_mixed_10, atol=atol, rtol=rtol) + assert not torch.allclose(output_config1, output_mixed_10, atol=atol, rtol=rtol) + if is_commutative: + assert torch.allclose(output_mixed_01, output_mixed_10, atol=atol, rtol=rtol) + + # turn around the order of the adapters of the 1 + 0 mixed model, should behave like the 0 + 1 mixed model + peft_model_10.set_adapter(["adapter0", "adapter1"]) + output_mixed_reversed = peft_model_10(input) + + # check the number of tuner 
layer types + tuner_layers = [mod for mod in peft_model_10.modules() if isinstance(mod, BaseTunerLayer)] + tuner_types = {type(tuner_layer) for tuner_layer in tuner_layers} + if type(config0) is type(config1): + assert len(tuner_types) == 1 + else: + assert len(tuner_types) == 2 + + assert peft_model_10.active_adapters == ["adapter0", "adapter1"] + assert torch.isfinite(output_mixed_reversed).all() + assert not torch.allclose(output_mixed_reversed, output_config0, atol=atol, rtol=rtol) + assert not torch.allclose(output_mixed_reversed, output_config1, atol=atol, rtol=rtol) + if is_commutative: + assert torch.allclose(output_mixed_reversed, output_mixed_01, atol=atol, rtol=rtol) + assert torch.allclose(output_mixed_reversed, output_mixed_10, atol=atol, rtol=rtol) + + def _check_merging(self, model_cls, config0, config1, input): + # Ensure that when merging mixed adapters, the result is the same as when applying the adapters separately. + # Merging requires a bit higher tolerance for some adapters, which can also vary depending on CPU vs GPU. 
+ atol = 1e-4 + rtol = 1e-4 + seed0 = 0 + seed1 = 1 + + # adapter 0 + 1 + peft_model_01 = self._get_model(model_cls, config0, "adapter0", seed=seed0) + torch.manual_seed(seed1) + peft_model_01.add_adapter("adapter1", config1) + peft_model_01.set_adapter(["adapter0", "adapter1"]) + output_mixed_01 = peft_model_01(input) + + model_merged_01 = peft_model_01.merge_and_unload() + output_merged_01 = model_merged_01(input) + assert torch.allclose(output_mixed_01, output_merged_01, atol=atol, rtol=rtol) + + # adapter 1 + 0 + peft_model_10 = self._get_model(model_cls, config1, "adapter1", seed=seed1) + torch.manual_seed(seed0) + peft_model_10.add_adapter("adapter0", config0) + peft_model_10.set_adapter(["adapter1", "adapter0"]) + output_mixed_10 = peft_model_10(input) + + model_merged_10 = peft_model_10.merge_and_unload() + output_merged_10 = model_merged_10(input) + assert torch.allclose(output_mixed_10, output_merged_10, atol=atol, rtol=rtol) + + def _check_unload(self, model_cls, config0, config1, input): + # Ensure that we can unload the base model without merging + atol = 1e-5 + rtol = 1e-5 + seed0 = 0 + seed1 = 1 + + base_model = self._get_model(model_cls) + output_base = base_model(input) + + # adapter 0 + 1 + peft_model_01 = self._get_model(model_cls, config0, "adapter0", seed=seed0) + torch.manual_seed(seed1) + peft_model_01.add_adapter("adapter1", config1) + peft_model_01.set_adapter(["adapter0", "adapter1"]) + output_mixed = peft_model_01(input) + + # unload + model_unloaded = peft_model_01.unload() + output_unloaded = model_unloaded(input) + + assert not torch.allclose(output_mixed, output_unloaded, atol=atol, rtol=rtol) + assert torch.allclose(output_base, output_unloaded, atol=atol, rtol=rtol) + + def _check_disable(self, model_cls, config0, config1, input): + # Ensure that we can disable adapters + atol = 1e-5 + rtol = 1e-5 + seed0 = 0 + seed1 = 1 + + # base model + base_model = self._get_model(model_cls) + output_base = base_model(input) + + # adapter 0 + 
peft_model_0 = self._get_model(model_cls, config0, "adapter0", seed=seed0) + output_config0 = peft_model_0(input) + with peft_model_0.disable_adapter(): + output_disabled0 = peft_model_0(input) + + assert not torch.allclose(output_base, output_config0, atol=atol, rtol=rtol) + assert torch.allclose(output_base, output_disabled0, atol=atol, rtol=rtol) + + # adapter 1 + peft_model_1 = self._get_model(model_cls, config1, "adapter1", seed=seed1) + output_config1 = peft_model_1(input) + with peft_model_1.disable_adapter(): + output_disabled1 = peft_model_1(input) + + assert not torch.allclose(output_base, output_config1, atol=atol, rtol=rtol) + assert torch.allclose(output_base, output_disabled1, atol=atol, rtol=rtol) + + # adapter 0 + 1 + peft_model_01 = self._get_model(model_cls, config0, "adapter0", seed=seed0) + torch.manual_seed(seed1) + peft_model_01.add_adapter("adapter1", config1) + peft_model_01.set_adapter(["adapter0", "adapter1"]) + output_mixed_01 = peft_model_01(input) + with peft_model_01.disable_adapter(): + output_disabled01 = peft_model_01(input) + + assert not torch.allclose(output_base, output_mixed_01, atol=atol, rtol=rtol) + assert torch.allclose(output_base, output_disabled01, atol=atol, rtol=rtol) + + # adapter 1 + 0 + peft_model_10 = self._get_model(model_cls, config1, "adapter1", seed=seed1) + torch.manual_seed(seed0) + peft_model_10.add_adapter("adapter0", config0) + peft_model_10.set_adapter(["adapter1", "adapter0"]) + output_mixed_10 = peft_model_10(input) + with peft_model_10.disable_adapter(): + output_disabled10 = peft_model_10(input) + + assert not torch.allclose(output_base, output_mixed_10, atol=atol, rtol=rtol) + assert torch.allclose(output_base, output_disabled10, atol=atol, rtol=rtol) + + def _check_loading(self, model_cls, config0, config1, input, *, is_commutative): + # Check that we can load two adapters into the same model + # Note that we save the adapters using a normal PeftModel because PeftMixModel doesn't support saving yet 
+ atol = 1e-5 + rtol = 1e-5 + seed0 = 0 + seed1 = 1 + + with tempfile.TemporaryDirectory() as tmp_dirname: + # SAVING + # adapter 0: note that we set mixed=False because mixed models don't support saving (yet) + peft_model_0 = self._get_model(model_cls, config0, "adapter0", seed=seed0, mixed=False) + output_config0 = peft_model_0(input) + peft_model_0.save_pretrained(os.path.join(tmp_dirname, "adapter0")) + + # adapter 1: note that we set mixed=False because mixed models don't support saving (yet) + peft_model_1 = self._get_model(model_cls, config1, "adapter1", seed=seed1, mixed=False) + output_config1 = peft_model_1(input) + peft_model_1.save_pretrained(os.path.join(tmp_dirname, "adapter1")) + + # adapter 0 + 1 + peft_model_01 = self._get_model(model_cls, config0, "adapter0", seed=seed0) + torch.manual_seed(seed1) + peft_model_01.add_adapter("adapter1", config1) + peft_model_01.set_adapter(["adapter0", "adapter1"]) + output_mixed_01 = peft_model_01(input) + + # adapter 1 + 0 + peft_model_10 = self._get_model(model_cls, config1, "adapter1", seed=seed1) + torch.manual_seed(seed0) + peft_model_10.add_adapter("adapter0", config0) + peft_model_10.set_adapter(["adapter1", "adapter0"]) + output_mixed_10 = peft_model_10(input) + + # LOADING + # adapter 0 + base_model = self._get_model(model_cls) + # Notes: + # Path is tmp_dirname/adapter0/adapter0 because non-default adapters are saved in a subfolder. + # As a sanity check, we should set a completely different seed here. That way, we ensure that the the + # weights are not just randomly initialized exactly to the same values as before. 
+ torch.manual_seed(123456) + peft_model_loaded0 = PeftMixedModel.from_pretrained( + base_model, os.path.join(tmp_dirname, "adapter0", "adapter0"), "adapter0" + ) + output_loaded0 = peft_model_loaded0(input) + assert torch.allclose(output_config0, output_loaded0, atol=atol, rtol=rtol) + + # adapter 1 + base_model = self._get_model(model_cls) + torch.manual_seed(654321) # setting a completely different seed here should not affect the result + peft_model_loaded1 = PeftMixedModel.from_pretrained( + base_model, os.path.join(tmp_dirname, "adapter1", "adapter1"), "adapter1" + ) + output_loaded1 = peft_model_loaded1(input) + assert torch.allclose(output_config1, output_loaded1, atol=atol, rtol=rtol) + + # adapter 0 + 1 + base_model = self._get_model(model_cls) + torch.manual_seed(97531) # setting a completely different seed here should not affect the result + peft_model_loaded_01 = PeftMixedModel.from_pretrained( + base_model, os.path.join(tmp_dirname, "adapter0", "adapter0"), "adapter0" + ) + peft_model_loaded_01.load_adapter(os.path.join(tmp_dirname, "adapter1", "adapter1"), "adapter1") + # at this point, "adapter0" should still be active + assert peft_model_loaded_01.active_adapters == ["adapter0"] + output_loaded01_0 = peft_model_loaded_01(input) + assert torch.allclose(output_config0, output_loaded01_0, atol=atol, rtol=rtol) + # activate adapter1 + peft_model_loaded_01.set_adapter(["adapter1"]) + assert peft_model_loaded_01.active_adapters == ["adapter1"] + output_loaded01_1 = peft_model_loaded_01(input) + assert torch.allclose(output_config1, output_loaded01_1, atol=atol, rtol=rtol) + # activate both adapters + peft_model_loaded_01.set_adapter(["adapter0", "adapter1"]) + output_loaded01 = peft_model_loaded_01(input) + assert torch.allclose(output_mixed_01, output_loaded01, atol=atol, rtol=rtol) + + # adapter 1 + 0 + base_model = self._get_model(model_cls) + torch.manual_seed(445566) # setting a completely different seed here should not affect the result + 
peft_model_loaded_10 = PeftMixedModel.from_pretrained( + base_model, os.path.join(tmp_dirname, "adapter1", "adapter1"), "adapter1" + ) + peft_model_loaded_10.load_adapter(os.path.join(tmp_dirname, "adapter0", "adapter0"), "adapter0") + # at this point, "adapter1" should still be active + assert peft_model_loaded_10.active_adapters == ["adapter1"] + output_loaded10_1 = peft_model_loaded_10(input) + assert torch.allclose(output_config1, output_loaded10_1, atol=atol, rtol=rtol) + # activate adapter0 + peft_model_loaded_10.set_adapter(["adapter0"]) + assert peft_model_loaded_10.active_adapters == ["adapter0"] + output_loaded10_0 = peft_model_loaded_10(input) + assert torch.allclose(output_config0, output_loaded10_0, atol=atol, rtol=rtol) + # activate both adapters + peft_model_loaded_10.set_adapter(["adapter1", "adapter0"]) + output_loaded10 = peft_model_loaded_10(input) + assert torch.allclose(output_mixed_10, output_loaded10, atol=atol, rtol=rtol) + + if is_commutative: + assert torch.allclose(output_loaded01, output_loaded10, atol=atol, rtol=rtol) + assert torch.allclose(output_loaded10, output_mixed_01, atol=atol, rtol=rtol) + + @parameterized.expand( + itertools.combinations( + [ + LoraConfig(target_modules=["lin0"], init_lora_weights=False), + LoHaConfig(target_modules=["lin0"], init_weights=False), + LoKrConfig(target_modules=["lin0"], init_weights=False), + AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False, total_step=1), + ], + r=2, + ), + name_func=_param_name_func, + ) + def test_target_first_layer(self, config0, config1): + input = torch.arange(90).reshape(9, 10).to(self.torch_device) + self._check_mixed_outputs(SimpleNet, config0, config1, input, is_commutative=False) + self._check_merging(SimpleNet, config0, config1, input) + self._check_unload(SimpleNet, config0, config1, input) + self._check_disable(SimpleNet, config1, config0, input) + self._check_loading(SimpleNet, config0, config1, input, is_commutative=False) + + @parameterized.expand(
+ itertools.combinations( + [ + LoraConfig(target_modules=["lin1"], init_lora_weights=False), + LoHaConfig(target_modules=["lin1"], init_weights=False), + LoKrConfig(target_modules=["lin1"], init_weights=False), + AdaLoraConfig(target_modules=["lin1"], init_lora_weights=False, total_step=1), + ], + r=2, + ), + name_func=_param_name_func, + ) + def test_target_last_layer(self, config0, config1): + # We are targeting the last layer of the SimpleNet. Therefore, since the adapters only add their activations + # to the output, the results should be commutative. This would *not* work if the adapters do something more + # complex or if we target an earlier layer, because the non-linearity would destroy the commutativity. + input = torch.arange(90).reshape(9, 10).to(self.torch_device) + + self._check_mixed_outputs(SimpleNet, config0, config1, input, is_commutative=True) + self._check_merging(SimpleNet, config0, config1, input) + self._check_unload(SimpleNet, config0, config1, input) + self._check_disable(SimpleNet, config1, config0, input) + self._check_loading(SimpleNet, config0, config1, input, is_commutative=True) + + @parameterized.expand( + itertools.combinations( + [ + LoraConfig(init_lora_weights=False), + LoHaConfig(init_weights=False), + LoKrConfig(init_weights=False), + AdaLoraConfig(init_lora_weights=False, total_step=1), + ], + r=2, + ), + name_func=_param_name_func, + ) + def test_target_different_layers(self, config0, config1): + input = torch.arange(90).reshape(9, 10).to(self.torch_device) + + config0.target_modules = ["lin0"] + config1.target_modules = ["lin1"] + self._check_mixed_outputs(SimpleNet, config0, config1, input, is_commutative=False) + self._check_merging(SimpleNet, config0, config1, input) + self._check_unload(SimpleNet, config0, config1, input) + self._check_disable(SimpleNet, config0, config1, input) + self._check_loading(SimpleNet, config0, config1, input, is_commutative=False) + + # same, but switch target_modules around +
config0.target_modules = ["lin1"] + config1.target_modules = ["lin0"] + self._check_mixed_outputs(SimpleNet, config1, config0, input, is_commutative=False) + self._check_merging(SimpleNet, config1, config0, input) + self._check_unload(SimpleNet, config1, config0, input) + self._check_disable(SimpleNet, config1, config0, input) + self._check_loading(SimpleNet, config1, config0, input, is_commutative=False) + + @parameterized.expand( + [ + ( + LoraConfig(target_modules=["lin1"], init_lora_weights=False), + LoraConfig(target_modules=["lin1"], init_lora_weights=False), + ), + ( + LoHaConfig(target_modules=["lin1"], init_weights=False), + LoHaConfig(target_modules=["lin1"], init_weights=False), + ), + ( + LoKrConfig(target_modules=["lin1"], init_weights=False), + LoKrConfig(target_modules=["lin1"], init_weights=False), + ), + ( + AdaLoraConfig(target_modules=["lin1"], init_lora_weights=False, total_step=1), + AdaLoraConfig(target_modules=["lin1"], init_lora_weights=False, total_step=1), + ), + ], + name_func=_param_name_func, + ) + def test_target_last_layer_same_type(self, config0, config1): + input = torch.arange(90).reshape(9, 10).to(self.torch_device) + + self._check_mixed_outputs(SimpleNet, config0, config1, input, is_commutative=True) + self._check_merging(SimpleNet, config0, config1, input) + self._check_unload(SimpleNet, config0, config1, input) + self._check_disable(SimpleNet, config1, config0, input) + + @parameterized.expand( + [ + ( + LoraConfig(target_modules=["lin0"], init_lora_weights=False), + LoraConfig(target_modules=["lin0"], init_lora_weights=False), + ), + ( + LoHaConfig(target_modules=["lin0"], init_weights=False), + LoHaConfig(target_modules=["lin0"], init_weights=False), + ), + ( + LoKrConfig(target_modules=["lin0"], init_weights=False), + LoKrConfig(target_modules=["lin0"], init_weights=False), + ), + ( + AdaLoraConfig(target_modules=["lin0"], init_lora_weights=False, total_step=1), + AdaLoraConfig(target_modules=["lin0"], 
init_lora_weights=False, total_step=1), + ), + ], + name_func=_param_name_func, + ) + def test_target_first_layer_same_type(self, config0, config1): + input = torch.arange(90).reshape(9, 10).to(self.torch_device) + self._check_mixed_outputs(SimpleNet, config0, config1, input, is_commutative=False) + self._check_merging(SimpleNet, config0, config1, input) + self._check_unload(SimpleNet, config0, config1, input) + self._check_disable(SimpleNet, config1, config0, input) + self._check_loading(SimpleNet, config0, config1, input, is_commutative=False) + + def test_deeply_nested(self): + # a somewhat absurdly nested model using different adapter types + if platform.system() == "Linux": + self.skipTest("This test fails but only on GitHub CI with Linux systems.") + + atol = 1e-5 + rtol = 1e-5 + torch.manual_seed(0) + + model = SimpleNet().eval().to(self.torch_device) + input = torch.arange(90).reshape(9, 10).to(self.torch_device) + output_base = model(input) + + config0 = LoraConfig(r=4, lora_alpha=4, target_modules=["lin0", "lin1"], init_lora_weights=False) + peft_model = get_peft_model(model, config0, "adapter0", mixed=True) + + config1 = LoHaConfig(r=4, alpha=4, target_modules=["lin0"], init_weights=False) + peft_model.add_adapter("adapter1", config1) + + config2 = AdaLoraConfig(r=4, lora_alpha=4, target_modules=["lin1"], init_lora_weights=False, total_step=1) + peft_model.add_adapter("adapter2", config2) + + config3 = LoKrConfig(r=4, alpha=4, target_modules=["lin0", "lin1"], init_weights=False) + peft_model.add_adapter("adapter3", config3) + + peft_model.set_adapter(["adapter0", "adapter1", "adapter2", "adapter3"]) + output_mixed = peft_model(input) + assert torch.isfinite(output_base).all() + assert not torch.allclose(output_base, output_mixed, atol=atol, rtol=rtol) + + # test disabling all adapters + with peft_model.disable_adapter(): + output_disabled = peft_model(input) + assert torch.isfinite(output_disabled).all() + assert torch.allclose(output_base, 
output_disabled, atol=atol, rtol=rtol) + assert not torch.allclose(output_mixed, output_disabled, atol=atol, rtol=rtol) + + # merge and unload all adapters + model_copy = copy.deepcopy(peft_model) + model = model_copy.merge_and_unload() + output_merged = model(input) + assert torch.isfinite(output_merged).all() + assert torch.allclose(output_mixed, output_merged, atol=atol, rtol=rtol) + + # merge and unload only adapter1 and adapter3 + model_copy = copy.deepcopy(peft_model) + model_copy.set_adapter(["adapter1", "adapter3"]) + output_13 = model_copy(input) + assert torch.isfinite(output_13).all() + assert not torch.allclose(output_mixed, output_13, atol=atol, rtol=rtol) + + model_copy.set_adapter(["adapter0", "adapter1", "adapter2", "adapter3"]) + model_merged_unloaded = model_copy.merge_and_unload(adapter_names=["adapter1", "adapter3"]) + output_merged_13 = model_merged_unloaded(input) + assert torch.isfinite(output_merged_13).all() + assert torch.allclose(output_13, output_merged_13, atol=atol, rtol=rtol) + + # test unloading + model_copy = copy.deepcopy(peft_model) + model_unloaded = model_copy.unload() + output_unloaded = model_unloaded(input) + assert torch.isfinite(output_unloaded).all() + assert torch.allclose(output_base, output_unloaded, atol=atol, rtol=rtol) + + def test_delete_adapter(self): + atol = 1e-5 + rtol = 1e-5 + torch.manual_seed(0) + + model = SimpleNet().eval().to(self.torch_device) + input = torch.arange(90).reshape(9, 10).to(self.torch_device) + output_base = model(input) + + # create adapter0 + torch.manual_seed(0) + config0 = LoraConfig(r=4, lora_alpha=4, target_modules=["lin0", "lin1"], init_lora_weights=False) + peft_model = get_peft_model(model, config0, "adapter0", mixed=True) + output_0 = peft_model(input) + assert not torch.allclose(output_base, output_0, atol=atol, rtol=rtol) + + # add adapter1 + torch.manual_seed(1) + config1 = LoHaConfig(r=4, alpha=4, target_modules=["lin0"], init_weights=False) + peft_model.add_adapter("adapter1", 
config1) + peft_model.set_adapter(["adapter0", "adapter1"]) + output_01 = peft_model(input) + assert not torch.allclose(output_base, output_01, atol=atol, rtol=rtol) + assert not torch.allclose(output_0, output_01, atol=atol, rtol=rtol) + + # delete adapter1 + peft_model.delete_adapter("adapter1") + assert peft_model.active_adapters == ["adapter0"] + output_deleted_1 = peft_model(input) + assert torch.allclose(output_0, output_deleted_1, atol=atol, rtol=rtol) + + msg = re.escape("Adapter(s) ['adapter1'] not found, available adapters: ['adapter0']") + with pytest.raises(ValueError, match=msg): + peft_model.set_adapter(["adapter0", "adapter1"]) + + # re-add adapter1 + torch.manual_seed(1) + peft_model.add_adapter("adapter1", config1) + peft_model.set_adapter(["adapter0", "adapter1"]) + output_01_readded = peft_model(input) + assert not torch.allclose(output_base, output_01_readded, atol=atol, rtol=rtol) + + # same as above, but this time delete adapter0 first + torch.manual_seed(0) + model = SimpleNet().eval().to(self.torch_device) + torch.manual_seed(0) + peft_model = get_peft_model(model, config0, "adapter0", mixed=True) + torch.manual_seed(1) + peft_model.add_adapter("adapter1", config1) + peft_model.delete_adapter("adapter0") + assert peft_model.active_adapters == ["adapter1"] + output_deleted_0 = peft_model(input) + assert not torch.allclose(output_deleted_0, output_base, atol=atol, rtol=rtol) + assert not torch.allclose(output_deleted_0, output_01, atol=atol, rtol=rtol) + + msg = re.escape("Adapter(s) ['adapter0'] not found, available adapters: ['adapter1']") + with pytest.raises(ValueError, match=msg): + peft_model.set_adapter(["adapter0", "adapter1"]) + + peft_model.delete_adapter("adapter1") + assert peft_model.active_adapters == [] + output_deleted_01 = peft_model(input) + assert torch.allclose(output_deleted_01, output_base, atol=atol, rtol=rtol) + + def test_modules_to_save(self): + model = SimpleNet().eval().to(self.torch_device) + config0 = 
LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + peft_model = get_peft_model(model, config0, "adapter0", mixed=True) + + # adding a second adapter with same modules_to_save is not allowed + # TODO: theoretically, we could allow this if it's the same target layer + config1 = LoHaConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + peft_model.add_adapter("adapter1", config1) + with pytest.raises(ValueError, match="Only one adapter can be set at a time for ModulesToSaveWrapper"): + peft_model.set_adapter(["adapter0", "adapter1"]) + + def test_get_nb_trainable_parameters(self): + model = SimpleNet().eval().to(self.torch_device) + params_base = sum(p.numel() for p in model.parameters()) + + config0 = LoraConfig(target_modules=["lin0"]) + peft_model = get_peft_model(model, config0, "adapter0", mixed=True) + trainable_params0, all_param0 = peft_model.get_nb_trainable_parameters() + + params_lora = sum(p.numel() for n, p in model.named_parameters() if "adapter0" in n) + assert trainable_params0 == params_lora + assert all_param0 == (params_base + params_lora) + + config1 = LoHaConfig(target_modules=["lin1"]) + peft_model.add_adapter("adapter1", config1) + peft_model.set_adapter(["adapter0", "adapter1"]) + params_loha = sum(p.numel() for n, p in model.named_parameters() if "adapter1" in n) + trainable_params1, all_param1 = peft_model.get_nb_trainable_parameters() + assert trainable_params1 == (params_lora + params_loha) + assert all_param1 == ((params_base + params_lora) + params_loha) + + config2 = AdaLoraConfig(target_modules=["lin0", "lin1"], total_step=1) + peft_model.add_adapter("adapter2", config2) + peft_model.set_adapter(["adapter0", "adapter1", "adapter2"]) + params_adalora = sum(p.numel() for n, p in model.named_parameters() if "adapter2" in n) + trainable_params2, all_param2 = peft_model.get_nb_trainable_parameters() + # remove 2 params because we need to exclude "ranknum" for AdaLora trainable params + assert trainable_params2 == 
(((params_lora + params_loha) + params_adalora) - 2) + assert all_param2 == (((params_base + params_lora) + params_loha) + params_adalora) + + def test_incompatible_config_raises(self): + model = SimpleNet().eval().to(self.torch_device) + config0 = LoraConfig(target_modules=["lin0"]) + peft_model = get_peft_model(model, config0, "adapter0", mixed=True) + + config1 = PrefixTuningConfig() + msg = "The provided `peft_type` 'PREFIX_TUNING' is not compatible with the `PeftMixedModel`." + with pytest.raises(ValueError, match=msg): + peft_model.add_adapter("adapter1", config1) + + def test_decoder_model(self): + # test a somewhat realistic model instead of a toy model + torch.manual_seed(0) + + model_id = "hf-internal-testing/tiny-random-OPTForCausalLM" + model = AutoModelForCausalLM.from_pretrained(model_id).eval().to(self.torch_device) + input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) + attention_mask = torch.tensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) + input_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + output_base = model.generate(**input_dict) + + torch.manual_seed(0) + config0 = LoraConfig(task_type="CAUSAL_LM", init_lora_weights=False) + peft_model = get_peft_model(model, config0, "adapter0", mixed=True) + output0 = peft_model.generate(**input_dict) + assert torch.isfinite(output0).all() + assert not torch.allclose(output_base, output0) + + torch.manual_seed(1) + config1 = LoHaConfig(task_type="CAUSAL_LM", target_modules=["q_proj", "v_proj"], init_weights=False) + peft_model.add_adapter("adapter1", config1) + peft_model.set_adapter(["adapter0", "adapter1"]) + output1 = peft_model.generate(**input_dict) + assert torch.isfinite(output1).all() + assert not torch.allclose(output0, output1) + + torch.manual_seed(2) + config2 = AdaLoraConfig(task_type="CAUSAL_LM", init_lora_weights=False, total_step=1) + peft_model.add_adapter("adapter2", config2) + peft_model.set_adapter(["adapter0", "adapter1", 
"adapter2"]) + output2 = peft_model.generate(**input_dict) + assert torch.isfinite(output2).all() + assert not torch.allclose(output1, output2) + + torch.manual_seed(3) + config3 = LoKrConfig(task_type="CAUSAL_LM", target_modules=["q_proj", "v_proj"], init_weights=False) + peft_model.add_adapter("adapter3", config3) + peft_model.set_adapter(["adapter0", "adapter1", "adapter2", "adapter3"]) + output3 = peft_model.generate(**input_dict) + assert torch.isfinite(output3).all() + assert not torch.allclose(output2, output3) + + torch.manual_seed(4) + peft_model.set_adapter(["adapter0", "adapter1", "adapter2", "adapter3"]) + + with peft_model.disable_adapter(): + output_disabled = peft_model.generate(**input_dict) + assert torch.isfinite(output_disabled).all() + assert torch.allclose(output_base, output_disabled) + + model_unloaded = peft_model.merge_and_unload() + output_unloaded = model_unloaded.generate(**input_dict) + assert torch.isfinite(output_unloaded).all() + + with tempfile.TemporaryDirectory() as tmp_dir: + # save adapter0 (use normal PeftModel, because PeftMixedModel does not support saving) + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained(model_id).eval().to(self.torch_device) + torch.manual_seed(0) + peft_model = get_peft_model(model, config0, "adapter0") + output0_save = peft_model(**input_dict).logits + assert torch.isfinite(output0_save).all() + peft_model.save_pretrained(tmp_dir) + + # save adapter1 + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained(model_id).eval().to(self.torch_device) + torch.manual_seed(1) + peft_model = get_peft_model(model, config1, "adapter1") + output1_save = peft_model(**input_dict).logits + assert torch.isfinite(output1_save).all() + peft_model.save_pretrained(tmp_dir) + + # load adapter0 and adapter1 + model = AutoModelForCausalLM.from_pretrained(model_id).eval().to(self.torch_device) + peft_model = PeftMixedModel.from_pretrained(model, os.path.join(tmp_dir, "adapter0"), "adapter0") + 
peft_model.load_adapter(os.path.join(tmp_dir, "adapter1"), "adapter1") + peft_model.set_adapter(["adapter0", "adapter1"]) + output01_loaded = peft_model(**input_dict).logits + + atol, rtol = 1e-3, 1e-3 + assert torch.isfinite(output01_loaded).all() + assert not torch.allclose(output0_save, output01_loaded, atol=atol, rtol=rtol) + assert not torch.allclose(output1_save, output01_loaded, atol=atol, rtol=rtol) diff --git a/peft/tests/test_multitask_prompt_tuning.py b/peft/tests/test_multitask_prompt_tuning.py new file mode 100644 index 0000000000000000000000000000000000000000..94a9e213834407670a3916ca4710ce0cb0f15994 --- /dev/null +++ b/peft/tests/test_multitask_prompt_tuning.py @@ -0,0 +1,288 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile + +import pytest +import torch +from torch.testing import assert_close +from transformers import AutoModelForCausalLM + +from peft import get_peft_model +from peft.peft_model import PeftModel +from peft.tuners.multitask_prompt_tuning import MultitaskPromptTuningConfig, MultitaskPromptTuningInit +from peft.utils import infer_device +from peft.utils.other import WEIGHTS_NAME, prepare_model_for_kbit_training +from peft.utils.save_and_load import get_peft_model_state_dict + + +MODELS_TO_TEST = [ + "trl-internal-testing/tiny-random-LlamaForCausalLM", +] + + +class TestMultiTaskPromptTuning: + """ + Tests for the MultiTaskPromptTuning model. 
+ """ + + @pytest.fixture + def config(cls) -> MultitaskPromptTuningConfig: + return MultitaskPromptTuningConfig( + task_type="CAUSAL_LM", + num_virtual_tokens=50, + num_tasks=3, + prompt_tuning_init_text=( + "classify the following into either positive or negative, or entailment, neutral or contradiction:" + ), + ) + + transformers_class = AutoModelForCausalLM + torch_device = infer_device() + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_prepare_for_training(self, model_id, config): + model = AutoModelForCausalLM.from_pretrained(model_id) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + dummy_input = torch.LongTensor([[1, 1, 1]]).to(self.torch_device) + dummy_output = model.get_input_embeddings()(dummy_input) + + assert not dummy_output.requires_grad + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_prepare_for_int8_training(self, model_id, config): + model = AutoModelForCausalLM.from_pretrained(model_id) + model = prepare_model_for_kbit_training(model) + model = model.to(self.torch_device) + + for param in model.parameters(): + assert not param.requires_grad + + model = get_peft_model(model, config) + + # For backward compatibility + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + dummy_input = torch.LongTensor([[1, 1, 1]]).to(self.torch_device) + dummy_output = model.get_input_embeddings()(dummy_input) + + assert dummy_output.requires_grad + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_save_pretrained(self, model_id, config): + seed = 420 + torch.manual_seed(seed) + model = AutoModelForCausalLM.from_pretrained(model_id) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + with tempfile.TemporaryDirectory() as tmp_dirname: + 
model.save_pretrained(tmp_dirname) + + torch.manual_seed(seed) + model_from_pretrained = AutoModelForCausalLM.from_pretrained(model_id) + model_from_pretrained = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname) + + # check if the state dicts are equal + state_dict = get_peft_model_state_dict(model) + + state_dict_from_pretrained = get_peft_model_state_dict(model_from_pretrained) + + # check if same keys + assert state_dict.keys() == state_dict_from_pretrained.keys() + + # Check that the number of saved parameters is 3. + assert len(state_dict) == 3 + + # check if tensors equal + for key in state_dict.keys(): + assert torch.allclose( + state_dict[key].to(self.torch_device), state_dict_from_pretrained[key].to(self.torch_device) + ) + + # check if `adapter_model.safetensors` is present + assert os.path.exists(os.path.join(tmp_dirname, "adapter_model.safetensors")) + + # check if `adapter_config.json` is present + assert os.path.exists(os.path.join(tmp_dirname, "adapter_config.json")) + + # check if `pytorch_model.bin` is not present + assert not os.path.exists(os.path.join(tmp_dirname, "pytorch_model.bin")) + + # check if `config.json` is not present + assert not os.path.exists(os.path.join(tmp_dirname, "config.json")) + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_save_pretrained_regression(self, model_id, config): + seed = 420 + torch.manual_seed(seed) + model = AutoModelForCausalLM.from_pretrained(model_id) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + with tempfile.TemporaryDirectory() as tmp_dirname: + model.save_pretrained(tmp_dirname, safe_serialization=False) + + torch.manual_seed(seed) + model_from_pretrained = AutoModelForCausalLM.from_pretrained(model_id) + model_from_pretrained = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname) + + # check if the state dicts are equal + state_dict = get_peft_model_state_dict(model) + +
state_dict_from_pretrained = get_peft_model_state_dict(model_from_pretrained) + + # check if same keys + assert state_dict.keys() == state_dict_from_pretrained.keys() + + # Check that the number of saved parameters is 3. + assert len(state_dict) == 3 + + # check if tensors equal + for key in state_dict.keys(): + assert torch.allclose( + state_dict[key].to(self.torch_device), state_dict_from_pretrained[key].to(self.torch_device) + ) + + # check if `adapter_model.bin` is present for regression + assert os.path.exists(os.path.join(tmp_dirname, "adapter_model.bin")) + + # check if `adapter_config.json` is present + assert os.path.exists(os.path.join(tmp_dirname, "adapter_config.json")) + + # check if `pytorch_model.bin` is not present + assert not os.path.exists(os.path.join(tmp_dirname, "pytorch_model.bin")) + + # check if `config.json` is not present + assert not os.path.exists(os.path.join(tmp_dirname, "config.json")) + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_generate(self, model_id, config): + model = AutoModelForCausalLM.from_pretrained(model_id) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + input_ids = torch.LongTensor([[1, 1, 1], [2, 1, 2]]).to(self.torch_device) + attention_mask = torch.LongTensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) + task_ids = torch.LongTensor([1, 2]).to(self.torch_device) + + # check if `generate` works + _ = model.generate(input_ids=input_ids, attention_mask=attention_mask, task_ids=task_ids) + + # check if `generate` works if positional arguments are passed + _ = model.generate(input_ids, attention_mask=attention_mask, task_ids=task_ids) + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_use_cache(self, model_id, config): + """Test that MultiTaskPromptTuning works when Llama config use_cache=True.""" + torch.manual_seed(0) + input_ids = torch.LongTensor([[1, 1, 1], [2, 1, 2]]).to(self.torch_device) + task_ids =
torch.LongTensor([1, 2]).to(self.torch_device) + + original = AutoModelForCausalLM.from_pretrained(model_id) + mpt = get_peft_model(original, config) + mpt = mpt.to(self.torch_device) + + expected = mpt.generate(input_ids=input_ids, max_length=8, task_ids=task_ids) + + # Set use_cache = True and generate output again. + mpt.base_model.config.use_cache = True + actual = mpt.generate(input_ids=input_ids, max_length=8, task_ids=task_ids) + assert_close(expected, actual, rtol=0, atol=0) + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_bf16_inference(self, model_id, config): + """Test that MultiTaskPromptTuning works when Llama using a half-precision model.""" + input_ids = torch.LongTensor([[1, 1, 1], [2, 1, 2]]).to(self.torch_device) + task_ids = torch.tensor([1, 2]).to(self.torch_device) + + original = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16) + mpt = get_peft_model(original, config) + mpt = mpt.to(self.torch_device) + _ = mpt.generate(input_ids=input_ids, task_ids=task_ids) + + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_generate_text_with_random_init(self, model_id, config) -> None: + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained(model_id) + config.prompt_tuning_init = MultitaskPromptTuningInit.RANDOM + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + input_ids = torch.LongTensor([[1, 1, 1], [2, 1, 2]]).to(self.torch_device) + attention_mask = torch.LongTensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) + task_ids = torch.LongTensor([0]).to(self.torch_device) + + # check if `generate` works + _ = model.generate(input_ids=input_ids, attention_mask=attention_mask, task_ids=task_ids) + + with pytest.raises(ValueError): + # check if `generate` raises an error if task_ids are not passed + _ = model.generate(input_ids, attention_mask=attention_mask) + + @pytest.mark.parametrize( + "prompt_tuning_init", + [ + 
MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS, + MultitaskPromptTuningInit.EXACT_SOURCE_TASK, + MultitaskPromptTuningInit.ONLY_SOURCE_SHARED, + ], + ) + @pytest.mark.parametrize("model_id", MODELS_TO_TEST) + def test_generate_text_with_other_init(self, prompt_tuning_init, model_id, config) -> None: + # This test is flaky, hence fixing the seed. The reason is somehow related to: + # https://github.com/huggingface/transformers/blob/e786844425b6b1112c76513d66217ce2fe6aea41/src/transformers/generation/utils.py#L2691 + # When an EOS token is generated, the loop is exited and the pytest.raises at the bottom is not triggered + # because `forward` of the PEFT model, which should raise the error, is never called. + torch.manual_seed(42) # seed 43 fails with transformers v4.42.3 and torch v2.3.1 + + with tempfile.TemporaryDirectory() as tmp_dirname: + model = AutoModelForCausalLM.from_pretrained(model_id) + model = get_peft_model(model, config) + model.save_pretrained(tmp_dirname, safe_serialization=False) # bc torch.load is used + + config = MultitaskPromptTuningConfig( + task_type="CAUSAL_LM", + num_virtual_tokens=50, + num_tasks=1, + prompt_tuning_init_text=( + "classify the following into either positive or negative, or entailment, neutral or contradiction:" + ), + prompt_tuning_init=prompt_tuning_init, + prompt_tuning_init_state_dict_path=os.path.join(tmp_dirname, WEIGHTS_NAME), + ) + model = AutoModelForCausalLM.from_pretrained(model_id) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + input_ids = torch.LongTensor([[1, 1, 1], [2, 1, 2]]).to(self.torch_device) + attention_mask = torch.LongTensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) + task_ids = torch.LongTensor([0]).to(self.torch_device) + + # check if `generate` works + _ = model.generate(input_ids=input_ids, attention_mask=attention_mask, task_ids=task_ids) + + with pytest.raises(ValueError, match="task_ids cannot be None"): + # check if `generate` raises an error if task_ids 
are not passed + _ = model.generate(input_ids, attention_mask=attention_mask) diff --git a/peft/tests/test_other.py b/peft/tests/test_other.py new file mode 100644 index 0000000000000000000000000000000000000000..7a0ca34589bc44ff65c296e48a75ca8590355b9c --- /dev/null +++ b/peft/tests/test_other.py @@ -0,0 +1,532 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +import pytest +import torch +from torch import nn +from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, LlavaForConditionalGeneration + +from peft import LoraConfig, PeftModel, VeraConfig, get_peft_model +from peft.utils.other import ModulesToSaveWrapper, _get_no_split_modules + + +class ModelWithModuleDict(nn.Module): + def __init__(self): + super().__init__() + self.other_layer = nn.Linear(10, 10) + self.module = nn.ModuleDict({"foo": nn.Linear(10, 10)}) + + def forward(self): + return self.module["foo"](torch.rand(1, 10)) + + +class ModelWithModuleList(nn.Module): + def __init__(self): + super().__init__() + self.other_layer = nn.Linear(10, 10) + self.module = nn.ModuleList([nn.Linear(10, 10)]) + + def forward(self): + return self.module[0](torch.rand(1, 10)) + + +class ModelWithParameterDict(nn.Module): + def __init__(self): + super().__init__() + self.other_layer = nn.Linear(10, 10) + self.module = nn.ParameterDict({"foo": nn.Parameter(torch.rand(10, 10))}) + + def forward(self): + return self.module["foo"] 
+ + +class ModelWithParameterList(nn.Module): + def __init__(self): + super().__init__() + self.other_layer = nn.Linear(10, 10) + self.module = nn.ParameterList([nn.Parameter(torch.rand(10, 10))]) + + def forward(self): + return self.module[0] + + +@pytest.mark.parametrize( + "cls", [ModelWithModuleDict, ModelWithModuleList, ModelWithParameterDict, ModelWithParameterList] +) +def test_modules_to_save_targets_module_dict_raises(cls): + model = cls() + peft_config = LoraConfig( + target_modules=["other_layer"], + modules_to_save=["module"], + ) + model() # sanity check that the model would normally work + + msg = "modules_to_save cannot be applied to modules of type" + with pytest.raises(TypeError, match=msg): + get_peft_model(model=model, peft_config=peft_config) + + +def test_get_peft_model_revision_warning(tmp_path): + base_model_id = "peft-internal-testing/tiny-random-BertModel" + base_revision = "v2.0.0" + base_model = AutoModelForCausalLM.from_pretrained(base_model_id, revision=base_revision).eval() + lora_config = LoraConfig(revision=base_revision) + + overwrite_revision = "main" + overwrite_warning = f"peft config has already set base model revision to {base_revision}, overwriting with revision {overwrite_revision}" + with pytest.warns(UserWarning, match=overwrite_warning): + _ = get_peft_model(base_model, lora_config, revision=overwrite_revision) + + +def test_load_multiple_adapters_different_modules_to_save(tmp_path): + # This tests the error described in #2422 where loading multiple adapters with different modules_to_save + # attributes fails (due to a regression from #2376). 
+ + model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-random-LlamaForCausalLM") + + def peft_config(**kwargs): + return LoraConfig(target_modules="all-linear", **kwargs) + + original_model = copy.deepcopy(model) + + peft_config_0 = peft_config(modules_to_save=["0.post_attention_layernorm"]) + peft_config_1 = peft_config(modules_to_save=["0.post_attention_layernorm"]) + peft_config_2 = peft_config(modules_to_save=["1.post_attention_layernorm"]) + + # Save adapter 0, nothing fancy, should be equal to base model weights + peft_model = get_peft_model(copy.deepcopy(original_model), peft_config_0) + peft_model.save_pretrained(tmp_path / "adapter_0") + + # Save adapter 1, modules to save weights are modified randomly, should be unique to adapter 1 + peft_model = get_peft_model(copy.deepcopy(original_model), peft_config_1) + peft_model.model.model.layers[0].post_attention_layernorm.weight.data = torch.rand_like( + peft_model.model.model.layers[0].post_attention_layernorm.weight.data + ) + adapter_1_saved = peft_model.model.model.layers[0].post_attention_layernorm.weight.data.clone() + peft_model.save_pretrained(tmp_path / "adapter_1") + + # Save adapter 2, modules to save weights are modified randomly, should be unique to adapter 2 + peft_model = get_peft_model(copy.deepcopy(original_model), peft_config_2) + peft_model.model.model.layers[1].post_attention_layernorm.weight.data = torch.rand_like( + peft_model.model.model.layers[1].post_attention_layernorm.weight.data + ) + adapter_2_saved = peft_model.model.model.layers[1].post_attention_layernorm.weight.data.clone() + peft_model.save_pretrained(tmp_path / "adapter_2") + + del peft_model + + combined_model = PeftModel.from_pretrained(original_model, tmp_path / "adapter_0", adapter_name="adapter_0") + combined_model.load_adapter(tmp_path / "adapter_1", adapter_name="adapter_1") + combined_model.load_adapter(tmp_path / "adapter_2", adapter_name="adapter_2") + + # For adapter 0 we expect every mentioned 
modules to save layer of this test to be equal to the original model + # since we didn't modify it for adapter 0 and only adapter 0 is active. + combined_model.set_adapter("adapter_0") + assert torch.allclose( + combined_model.model.model.layers[0].post_attention_layernorm.weight, + original_model.model.layers[0].post_attention_layernorm.weight, + ) + assert torch.allclose( + combined_model.model.model.layers[1].post_attention_layernorm.weight, + original_model.model.layers[1].post_attention_layernorm.weight, + ) + + # For adapter 1 we expect that the modified module to save 0.post_attention_layernorm is modified, the other + # module to save layers mentioned above should be untouched. + combined_model.set_adapter("adapter_1") + assert torch.allclose( + combined_model.model.model.layers[0].post_attention_layernorm.weight, + adapter_1_saved, + ) + assert torch.allclose( + combined_model.model.model.layers[1].post_attention_layernorm.weight, + original_model.model.layers[1].post_attention_layernorm.weight, + ) + + # For adapter 2 we expect its module to save layer (1.post_attention_layernorm) to be modified but the other + # module to save weights should be kept original. + combined_model.set_adapter("adapter_2") + assert torch.allclose( + combined_model.model.model.layers[0].post_attention_layernorm.weight, + original_model.model.layers[0].post_attention_layernorm.weight, + ) + assert torch.allclose( + combined_model.model.model.layers[1].post_attention_layernorm.weight, + adapter_2_saved, + ) + + +class TestModulesToSaveAttributeAccess: + """Test attribute access on the ModulesToSaveWrapper class. + + When we have modules_to_save, the original module is wrapped. As long as only forward was called on this wrapped + module, we were good. However, if, for instance, model parameters were directly accessed by another module, this + would typically fail, as the wrapper does not have this attribute. We had special properties for weight and bias, + but this is not enough. 
Therefore, attribute access is now transiently delegated to the active adapter (or original + module, if the adapter is disabled). + + For one example, see #2099. + + """ + + @pytest.fixture + def mlp(self): + class MLP(nn.Module): + def __init__(self): + super().__init__() + self.lin0 = nn.Linear(1, 2) + self.lin1 = nn.Linear(3, 4) + + return MLP() + + def test_transient_attribute_access_default_adapter(self, mlp): + config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + model = get_peft_model(mlp, config) + assert model.lin1.weight is model.lin1.modules_to_save["default"].weight + assert model.lin1.bias is model.lin1.modules_to_save["default"].bias + + def test_transient_attribute_access_non_default_adapter(self, mlp): + config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + model = get_peft_model(mlp, config) + model.add_adapter("other", config) + + # at this point, default is still active + assert model.lin1.weight is model.lin1.modules_to_save["default"].weight + assert model.lin1.bias is model.lin1.modules_to_save["default"].bias + assert model.lin1.weight is not model.lin1.modules_to_save["other"].weight + assert model.lin1.bias is not model.lin1.modules_to_save["other"].bias + + model.set_adapter("other") + assert model.lin1.weight is not model.lin1.modules_to_save["default"].weight + assert model.lin1.bias is not model.lin1.modules_to_save["default"].bias + assert model.lin1.weight is model.lin1.modules_to_save["other"].weight + assert model.lin1.bias is model.lin1.modules_to_save["other"].bias + + def test_transient_attribute_access_disabled_adapter(self, mlp): + config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + model = get_peft_model(mlp, config) + + # at this point, default is still active + assert model.lin1.weight is model.lin1.modules_to_save["default"].weight + assert model.lin1.bias is model.lin1.modules_to_save["default"].bias + assert model.lin1.weight is not 
model.lin1.original_module.weight + assert model.lin1.bias is not model.lin1.original_module.bias + + with model.disable_adapter(): + assert model.lin1.weight is not model.lin1.modules_to_save["default"].weight + assert model.lin1.bias is not model.lin1.modules_to_save["default"].bias + assert model.lin1.weight is model.lin1.original_module.weight + assert model.lin1.bias is model.lin1.original_module.bias + + def test_transient_attribute_access_uninitialized_adapter(self, mlp): + # ensure that there is no weird infinite recursion when accessing a non-existing attribute on the class itself + with pytest.raises(AttributeError, match="has no attribute 'original_module'"): + ModulesToSaveWrapper.original_module + + def test_transient_attribute_access_attr_does_not_exist_on_modules_to_save(self, mlp): + # ensure that there is no weird infinite recursion when accessing a non-existing attribute on the + # ModulesToSaveWrapper instance + config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + model = get_peft_model(mlp, config) + + with pytest.raises(AttributeError, match="has no attribute 'foo'"): + model.lin1.foo + + def test_transient_attribute_access_attr_does_not_exist_on_original_module(self, mlp): + # ensure that there is no weird infinite recursion when accessing a non-existing attribute on the + # original module of the ModulesToSaveWrapper instance + config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + model = get_peft_model(mlp, config) + + with pytest.raises(AttributeError, match="has no attribute 'foo'"): + with model.disable_adapter(): + model.lin1.foo + + def test_transient_attribute_access_non_existing_adapter(self, mlp): + # This should normally never happen, as the active adapter should always exist, but it's a failsafe + config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + model = get_peft_model(mlp, config) + model.base_model.model.lin1._active_adapter = "does-not-exist" + with 
pytest.raises(AttributeError, match="has no attribute 'weight'"): + model.lin1.weight + + +class TestModulesToSaveNameSubstringBug: + """Test a bug that could occur with multiple modules to save where one adapter's name is a substring of another + adapter's name. + + This bug was the result of an error in the logic of modifying the state_dict for modules_to_save in + set_peft_model_state_dict. The error in the logic was that it was checked if an entry from modules_to_save (a set + of strings) is a substring of a key of the state_dict. If it was, a new name was assigned to that key in the + state_dict, which would allow to load the weight later. + + The issue that stems from the substring check occurs if there are multiple modules_to_save, and one of them has a + name that is a substring of another. So e.g. if one is named "classifier" and the other is named "classifier2", + there could be a false match. + + + This bug was reported in #2289. + + """ + + def get_model(self): + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.lin = nn.Linear(5, 4) + # important: "classifier" is a substring of "classifier2", "classifier3", "classifier4" + self.classifier = nn.Linear(4, 2) + self.classifier2 = nn.Linear(4, 2) + self.classifier3 = nn.Linear(4, 2) + self.classifier4 = nn.Linear(4, 2) + + def forward(self, x): + x = self.lin(x) + return self.classifier(x) + self.classifier2(x) + self.classifier3(x) + self.classifier4(x) + + torch.manual_seed(0) + return MyModule() + + @pytest.fixture + def path_merged_and_unmerged(self, tmp_path): + # Create 2 checkpoints: + # 1. merged: the model after calling merge_and_unload + # 2. 
unmerged: the PEFT model saved without calling merge_and_unload + path = tmp_path / "model.pt" + + lora_config = LoraConfig( + target_modules=["lin"], + # important: "classifier" is a substring of "classifier2", "classifier3", "classifier4" + modules_to_save=["classifier", "classifier2", "classifier3", "classifier4"], + ) + model = get_peft_model(self.get_model(), lora_config) + # mock training + for _ in range(5): + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + output = model(torch.randn(10, 5)) + loss = output.sum() + loss.backward() + optimizer.step() + + # save the peft model without merging + path_unmerged = tmp_path / "unmerged" + model.save_pretrained(path_unmerged) + + # merge the model and save state_dict + path_merged = tmp_path / "merged" + merged = model.merge_and_unload() + state_dict = merged.state_dict() + torch.save(state_dict, path_merged) + + return path_merged, path_unmerged + + def test_load_merged_and_unmerged_same_weights(self, path_merged_and_unmerged): + # Note that this test is quasi flaky, it has a 1 in 4 chance of passing even without the bugfix. It passes when + # "classifier" happens to be the last element of the set model.modules_to_save. The order of the set is random. + # It is not possible just run this test multiple times to minimize the probability of this happening, because + # within the same process, the hash order is consistent. With the bug fix, this doesn't matter, as the test will + # always pass, but if there is a regression, there is a 1 in 4 chance of not catching it. Since the CI runs many + # tests, it is overall very unlikely that none will catch it though. If you see this test failing in CI, thus be + # aware that some of the passing tests may just pass owing to randomness. 
+ path_merged, path_unmerged = path_merged_and_unmerged + + # load the merged model directly + state_dict = torch.load(path_merged, weights_only=True) + model = self.get_model() + model.load_state_dict(state_dict) + sd_merged = model.state_dict() + del model + + # load the unmerged model and merge it + unmerged = PeftModel.from_pretrained(self.get_model(), path_unmerged) + sd_unmerged = unmerged.merge_and_unload().state_dict() + + assert sd_merged.keys() == sd_unmerged.keys() + for key in sd_merged.keys(): + param_merged = sd_merged[key] + param_unmerged = sd_unmerged[key] + assert torch.allclose(param_merged, param_unmerged) + + +class TestTargetingAuxiliaryTrainingWrapper: + """AuxiliaryTrainingWrapper such as ModulesToSaveWrapper and TrainableTokensWrapper are + in general not to be targeted by PEFT methods such as adapters. For example, a ModulesToSaveWrapper's children + modules should not be targeted by `LoraConfig(target_modules='all-linear')`, among other things. + """ + + @pytest.fixture + def plain_model_cls(self): + class PlainModel(nn.Module): + def __init__(self, i, o): + super().__init__() + self.layer1 = nn.Linear(i, o) + + def forward(self, x): + return self.layer1(x) + + return PlainModel + + @pytest.fixture + def nested_model_cls(self, plain_model_cls): + class NestedModel(nn.Module): + def __init__(self): + super().__init__() + self.layer1 = nn.Linear(10, 20) + self.layer2 = nn.Linear(20, 5) + self.layer3 = plain_model_cls(5, 10) + + def forward(self, x): + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + return x + + return NestedModel + + def test_nested_ignores_modules_to_save(self, nested_model_cls, plain_model_cls): + # Make sure that `target_modules` is not targeting the nested modules of a module marked as module to save. 
+ model = nested_model_cls() + config = LoraConfig( + target_modules=["layer1"], + modules_to_save=["layer3"], + ) + + peft_model = get_peft_model(model, config) + assert isinstance(peft_model.model.layer3.modules_to_save.default, plain_model_cls) + + def test_targeting_module_to_save_raises(self, nested_model_cls): + model = nested_model_cls() + config = LoraConfig( + target_modules=["layer1"], + modules_to_save=["layer1"], + ) + msg = "No modules were targeted for adaptation. This might be caused by a combination" + with pytest.raises(ValueError, match=msg): + get_peft_model(model, config) + + def test_modules_to_save_targets_tuner_layer_raises(self): + # See e.g. issue 2027 and 2477 + # Prevent users from (accidentally) targeting the same layer both with a tuner and modules_to_save. Normally, PEFT + # will not target the same layer with both a tuner and ModulesToSaveWrapper. However, if modules_to_save is + # automatically inferred, e.g. when using AutoModelForSequenceClassification, the ModulesToSaveWrapper is applied ex + # post, which can lead to the double wrapping. + model_id = "hf-internal-testing/tiny-random-OPTForCausalLM" + model = AutoModelForSequenceClassification.from_pretrained(model_id) + + # Note: target_modules="all-linear" would also work and is closer to the original issue, but let's explicitly target + # "score" here in case that "all-linear" will be fixed to no longer target the score layer. + peft_config = LoraConfig(target_modules=["score"], task_type="SEQ_CLS") + + # Since the `score` layer is in `model.modules_to_save` it should be ignored when targeted, + # therefore the layer should not be adapted. + msg = "No modules were targeted for adaptation. 
This might be caused by a combination" + with pytest.raises(ValueError, match=msg) as e: + get_peft_model(model, peft_config) + + def test_targeting_trainable_tokens_raises(self): + model_id = "hf-internal-testing/tiny-random-OPTForCausalLM" + model = AutoModelForSequenceClassification.from_pretrained(model_id) + + peft_config = LoraConfig(target_modules=["embed_tokens"], task_type="SEQ_CLS", trainable_token_indices=[0, 1]) + + # While this message might not be the most helpful message, at least it is not silently failing + msg = "trainable_token_indices cannot be applied to modules of type " + with pytest.raises(TypeError, match=msg) as e: + get_peft_model(model, peft_config) + + +class TestAdapterTargeting: + """Make sure that already existing adapters cannot be targeted to avoid conflicts.""" + + @pytest.fixture + def base_model_cls(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.l1 = torch.nn.Linear(10, 20) + self.l2 = torch.nn.Conv2d(1, 1, 2) + + def forward(self, x): + return self.l2(self.l1(x)) + + return M + + @pytest.mark.parametrize( + "config_cls, config_kwargs", + [ + (LoraConfig, {"target_modules": "l1.*"}), + (LoraConfig, {"target_modules": "l2.*"}), + (VeraConfig, {"target_modules": "l1.*"}), + (VeraConfig, {"target_modules": "(l1|vera_A).*"}), # also target the shared layer + ], + ) + def test_self_targeting_is_ignored(self, base_model_cls, config_cls, config_kwargs): + base_model = base_model_cls() + config1 = config_cls(**config_kwargs) + config2 = config_cls(**config_kwargs) + + adapter1_name = "ADAPTER_1_512858" # sufficiently unique names to make reliable testing easier + adapter2_name = "ADAPTER_2_845781" + + peft_model = get_peft_model(base_model, config1, adapter_name=adapter1_name) + state_dict_keys_1 = peft_model.state_dict().keys() + + peft_model.add_adapter(adapter2_name, config2) + state_dict_keys_2 = peft_model.state_dict().keys() + + # Ideally there should be no new modules targeted beyond existing 
ModuleDicts. Therefore the keys + # of the new state dict should only differ after the adapter name portion of the keys - not before. + # Expected: + # - a.b.{adapter_1}.xyz + # - a.b.{adapter_2}.xyz + # We're not expecting this to happen and test against it: + # - a.b.{adapter_1}.xyz + # - a.{adapter_2}.xyz + def remove_adapter_portion(adapter_name, key): + if key.endswith(f".{adapter_name}"): + return key.removesuffix(f".{adapter_name}") + return key.split(f".{adapter_name}.")[0] + + adapter_invariant_keys1 = {remove_adapter_portion(adapter1_name, key) for key in state_dict_keys_1} + adapter_invariant_keys2 = { + remove_adapter_portion(adapter2_name, remove_adapter_portion(adapter1_name, key)) + for key in state_dict_keys_2 + } + + assert adapter_invariant_keys1 == adapter_invariant_keys2 + + +class TestGetNoSplitModules: + # Ensure that children are considered when determining _no_split_modules + # see https://github.com/huggingface/transformers/pull/38141 + + def test_get_no_split_modules_simple(self): + # choose a model where recursively visiting children is *not* required + model_id = "facebook/opt-125m" + model = AutoModelForCausalLM.from_pretrained(model_id) + assert model._no_split_modules == ["OPTDecoderLayer"] + no_split_modules = _get_no_split_modules(model) + assert no_split_modules == {"OPTDecoderLayer"} + + def test_get_no_split_modules_recursive(self): + # choose a model where recursively visiting children is required + model_id = "hf-internal-testing/tiny-random-LlavaForConditionalGeneration" + model = LlavaForConditionalGeneration.from_pretrained(model_id) + # sanity check: just visiting the model itself is not enough: + assert model._no_split_modules == [] + + no_split_modules = _get_no_split_modules(model) + assert no_split_modules == {"CLIPEncoderLayer", "LlamaDecoderLayer"} diff --git a/peft/tests/test_poly.py b/peft/tests/test_poly.py new file mode 100644 index 0000000000000000000000000000000000000000..8e9a2a351c8b13fd08bd21001951c04875fb789f --- /dev/null +++ b/peft/tests/test_poly.py 
@@ -0,0 +1,100 @@ +#!/usr/bin/env python3 + +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile +import unittest + +import torch +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + +from peft import PeftModel, PolyConfig, TaskType, get_peft_model + + +class TestPoly(unittest.TestCase): + def test_poly(self): + torch.manual_seed(0) + model_name_or_path = "google/flan-t5-small" + + atol, rtol = 1e-6, 1e-6 + r = 8 # rank of lora in poly + n_tasks = 3 # number of tasks + n_skills = 2 # number of skills (loras) + n_splits = 4 # number of heads + lr = 1e-2 + num_epochs = 10 + + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) + base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path) + + peft_config = PolyConfig( + task_type=TaskType.SEQ_2_SEQ_LM, + poly_type="poly", + r=r, + n_tasks=n_tasks, + n_skills=n_skills, + n_splits=n_splits, + ) + + model = get_peft_model(base_model, peft_config) + + # generate some dummy data + text = os.__doc__.splitlines() + assert len(text) > 10 + inputs = tokenizer(text, return_tensors="pt", padding=True) + inputs["task_ids"] = torch.arange(len(text)) % n_tasks + inputs["labels"] = tokenizer((["A", "B"] * 100)[: len(text)], return_tensors="pt")["input_ids"] + + # simple training loop + model.train() + optimizer = torch.optim.Adam(model.parameters(), lr=lr) + losses = [] + for _ in range(num_epochs): + outputs = 
model(**inputs) + loss = outputs.loss + loss.backward() + optimizer.step() + optimizer.zero_grad() + losses.append(loss.item()) + + # loss improved by at least 50% + assert losses[-1] < (0.5 * losses[0]) + + # check that saving and loading works + torch.manual_seed(0) + model.eval() + logits_before = model(**inputs).logits + tokens_before = model.generate(**inputs) + + with model.disable_adapter(): + logits_disabled = model(**inputs).logits + tokens_disabled = model.generate(**inputs) + + assert not torch.allclose(logits_before, logits_disabled, atol=atol, rtol=rtol) + assert not torch.allclose(tokens_before, tokens_disabled, atol=atol, rtol=rtol) + + # saving and loading + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path) + loaded = PeftModel.from_pretrained(base_model, tmp_dir) + + torch.manual_seed(0) + output_after = loaded(**inputs).logits + tokens_after = loaded.generate(**inputs) + assert torch.allclose(logits_before, output_after, atol=atol, rtol=rtol) + assert torch.allclose(tokens_before, tokens_after, atol=atol, rtol=rtol) diff --git a/peft/tests/test_randlora.py b/peft/tests/test_randlora.py new file mode 100644 index 0000000000000000000000000000000000000000..5fb7edb6a5e7c1fac2e5b717fbd6ffffe2eb7154 --- /dev/null +++ b/peft/tests/test_randlora.py @@ -0,0 +1,301 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# This test file is for tests specific to RandLora, since Randlora has some specific challenges due to the shared weights. +# These tests are copied from the test_vera.py file + +import os + +import pytest +import torch +from accelerate.utils.imports import is_bf16_available +from safetensors import safe_open +from torch import nn + +from peft import PeftModel, RandLoraConfig, get_peft_model + + +class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.relu = nn.ReLU() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.lin1 = nn.Linear(20, 20, bias=bias) # lin1 and lin2 have same shape + self.lin2 = nn.Linear(20, 20, bias=bias) + self.lin3 = nn.Linear(20, 2, bias=bias) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = self.lin0(X) + X = self.relu(X) + X = self.lin1(X) + X = self.relu(X) + X = self.lin2(X) + X = self.relu(X) + X = self.lin3(X) + X = self.sm(X) + return X + + +# Tests copied from the TestVera class in test_vera.py. +# Changes to the code file should be reflected here. 
+class TestRandLora: + @pytest.fixture + def mlp(self): + torch.manual_seed(0) + model = MLP() + return model + + @pytest.fixture + def mlp_same_prng(self, mlp): + torch.manual_seed(0) + + config = RandLoraConfig(target_modules=["lin1", "lin2"], init_weights=False) + # creates a default RandLora adapter + peft_model = get_peft_model(mlp, config) + config2 = RandLoraConfig(target_modules=["lin1", "lin2"], init_weights=False) + peft_model.add_adapter("other", config2) + return peft_model + + def test_multiple_adapters_same_prng_weights(self, mlp_same_prng): + # we can have multiple adapters with the same prng key, in which case the weights should be shared + assert ( + mlp_same_prng.base_model.model.lin1.randlora_A["default"] + is mlp_same_prng.base_model.model.lin1.randlora_A["other"] + ) + assert ( + mlp_same_prng.base_model.model.lin1.randlora_B["default"] + is mlp_same_prng.base_model.model.lin1.randlora_B["other"] + ) + assert ( + mlp_same_prng.base_model.model.lin2.randlora_A["default"] + is mlp_same_prng.base_model.model.lin2.randlora_A["other"] + ) + assert ( + mlp_same_prng.base_model.model.lin2.randlora_B["default"] + is mlp_same_prng.base_model.model.lin2.randlora_B["other"] + ) + + input = torch.randn(5, 10) + mlp_same_prng.set_adapter("default") + output_default = mlp_same_prng(input) + mlp_same_prng.set_adapter("other") + output_other = mlp_same_prng(input) + assert not torch.allclose(output_default, output_other, atol=1e-3, rtol=1e-3) + + def test_multiple_adapters_different_prng_raises(self): + # we cannot have multiple adapters with different prng keys + model = MLP() + config = RandLoraConfig(target_modules=["lin1", "lin2"], init_weights=False) + # creates a default RandLora adapter + peft_model = get_peft_model(model, config) + config2 = RandLoraConfig(target_modules=["lin1", "lin2"], init_weights=False, projection_prng_key=123) + + msg = ( + r"RandLora PRNG initialisation key must be the same for all adapters. 
Got config.projection_prng_key=123 but " + r"previous config had 0" + ) + with pytest.raises(ValueError, match=msg): + peft_model.add_adapter("other", config2) + + def test_multiple_adapters_save_load_save_projection_true(self, mlp_same_prng, tmp_path): + # check saving and loading works with multiple adapters and saved projection weights + torch.manual_seed(0) + input = torch.randn(5, 10) + mlp_same_prng.set_adapter("default") + output_default = mlp_same_prng(input) + mlp_same_prng.set_adapter("other") + output_other = mlp_same_prng(input) + + # sanity check + assert not torch.allclose(output_default, output_other, atol=1e-3, rtol=1e-3) + + save_path = tmp_path / "randlora" + mlp_same_prng.save_pretrained(save_path) + assert os.path.exists(save_path / "adapter_config.json") + assert os.path.exists(save_path / "other" / "adapter_config.json") + + torch.manual_seed(0) + mlp = MLP() + peft_model = PeftModel.from_pretrained(mlp, save_path) + peft_model.load_adapter(save_path / "other", "other") + + peft_model.set_adapter("default") + output_default_loaded = peft_model(input) + peft_model.set_adapter("other") + output_other_loaded = peft_model(input) + + assert torch.allclose(output_default, output_default_loaded, atol=1e-3, rtol=1e-3) + assert torch.allclose(output_other, output_other_loaded, atol=1e-3, rtol=1e-3) + + def test_multiple_adapters_save_load_save_projection_false(self, mlp, tmp_path): + # check saving and loading works with multiple adapters without saved projection weights + torch.manual_seed(1) + config = RandLoraConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + # creates a default RandLora adapter + peft_model = get_peft_model(mlp, config, adapter_name="first") + config2 = RandLoraConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + peft_model.add_adapter("second", config2) + + input = torch.randn(5, 10) + peft_model.set_adapter("first") + output_first = peft_model(input) + 
peft_model.set_adapter("second") + output_second = peft_model(input) + + # sanity check + assert not torch.allclose(output_first, output_second, atol=1e-3, rtol=1e-3) + + save_path = tmp_path / "randlora" + peft_model.save_pretrained(save_path) + assert os.path.exists(save_path / "first" / "adapter_config.json") + assert os.path.exists(save_path / "second" / "adapter_config.json") + + torch.manual_seed(0) + mlp = MLP() + peft_model = PeftModel.from_pretrained(mlp, save_path / "first", adapter_name="first") + peft_model.load_adapter(save_path / "second", "second") + + peft_model.set_adapter("first") + output_first_loaded = peft_model(input) + peft_model.set_adapter("second") + output_second_loaded = peft_model(input) + + assert torch.allclose(output_first, output_first_loaded, atol=1e-3, rtol=1e-3) + assert torch.allclose(output_second, output_second_loaded, atol=1e-3, rtol=1e-3) + + def test_multiple_adapters_save_projection_true_contains_randlora_A_randlora_B(self, mlp_same_prng, tmp_path): + # check that the state_dicts contain the projection weights + save_path = tmp_path / "randlora" + mlp_same_prng.save_pretrained(save_path) + + sd_default = {} + with safe_open(save_path / "adapter_model.safetensors", framework="pt", device="cpu") as f: + for key in f.keys(): + sd_default[key] = f.get_tensor(key) + + assert any("randlora_A" in key for key in sd_default) + assert any("randlora_B" in key for key in sd_default) + # default rank for RandLora is 32 + assert sd_default["base_model.randlora_A"].shape == (32, 1, 20) + assert sd_default["base_model.randlora_B"].shape == (20, 1, 32) + + sd_other = {} + with safe_open(save_path / "other" / "adapter_model.safetensors", framework="pt", device="cpu") as f: + for key in f.keys(): + sd_other[key] = f.get_tensor(key) + + assert any("randlora_A" in key for key in sd_other) + assert any("randlora_B" in key for key in sd_other) + assert sd_other["base_model.randlora_A"].shape == (32, 1, 20) + assert 
sd_other["base_model.randlora_B"].shape == (20, 1, 32) + + def test_multiple_adapters_save_projection_false_contains_no_randlora_A_randlora_B(self, mlp, tmp_path): + torch.manual_seed(1) + config = RandLoraConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + # creates a default RandLora adapter + peft_model = get_peft_model(mlp, config, adapter_name="first") + config2 = RandLoraConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + peft_model.add_adapter("second", config2) + + save_path = tmp_path / "randlora" + peft_model.save_pretrained(save_path) + + sd_default = {} + with safe_open(save_path / "first" / "adapter_model.safetensors", framework="pt", device="cpu") as f: + for key in f.keys(): + sd_default[key] = f.get_tensor(key) + + assert not any("randlora_A" in key for key in sd_default) + assert not any("randlora_B" in key for key in sd_default) + + sd_other = {} + with safe_open(save_path / "second" / "adapter_model.safetensors", framework="pt", device="cpu") as f: + for key in f.keys(): + sd_other[key] = f.get_tensor(key) + + assert not any("randlora_A" in key for key in sd_other) + assert not any("randlora_B" in key for key in sd_other) + + def test_randlora_A_randlora_B_share_memory(self, mlp_same_prng): + randlora_A = mlp_same_prng.randlora_A["default"] + randlora_B = mlp_same_prng.randlora_B["default"] + + # these tensors should share the same data + assert randlora_A.data_ptr() == mlp_same_prng.base_model.model.lin1.randlora_A["default"].data_ptr() + assert randlora_B.data_ptr() == mlp_same_prng.base_model.model.lin1.randlora_B["default"].data_ptr() + assert randlora_A.data_ptr() == mlp_same_prng.base_model.model.lin2.randlora_A["default"].data_ptr() + assert randlora_B.data_ptr() == mlp_same_prng.base_model.model.lin2.randlora_B["default"].data_ptr() + # sanity check: these tensors shouldn't share the same data + assert randlora_A.data_ptr() != randlora_B.data_ptr() + + def 
test_randlora_lambda_dont_share_memory(self, mlp_same_prng): + # sanity check: these tensors shouldn't share the same data + assert ( + mlp_same_prng.base_model.model.lin1.randlora_lambda["default"].data_ptr() + != mlp_same_prng.base_model.model.lin1.randlora_lambda["other"].data_ptr() + ) + assert ( + mlp_same_prng.base_model.model.lin1.randlora_lambda["default"].data_ptr() + != mlp_same_prng.base_model.model.lin2.randlora_lambda["default"].data_ptr() + ) + assert ( + mlp_same_prng.base_model.model.lin1.randlora_lambda["other"].data_ptr() + != mlp_same_prng.base_model.model.lin2.randlora_lambda["other"].data_ptr() + ) + assert ( + mlp_same_prng.base_model.model.lin1.randlora_gamma["default"].data_ptr() + != mlp_same_prng.base_model.model.lin1.randlora_gamma["other"].data_ptr() + ) + assert ( + mlp_same_prng.base_model.model.lin1.randlora_gamma["default"].data_ptr() + != mlp_same_prng.base_model.model.lin2.randlora_gamma["default"].data_ptr() + ) + assert ( + mlp_same_prng.base_model.model.lin1.randlora_gamma["other"].data_ptr() + != mlp_same_prng.base_model.model.lin2.randlora_gamma["other"].data_ptr() + ) + + def test_randlora_different_shapes(self, mlp): + config = RandLoraConfig(target_modules=["lin0", "lin3"], init_weights=False) + mlp_different_shapes = get_peft_model(mlp, config) + + randlora_A = mlp_different_shapes.randlora_A["default"] + randlora_B = mlp_different_shapes.randlora_B["default"] + + # sanity check + assert mlp.lin0.base_layer.weight.shape != mlp.lin3.base_layer.weight.shape + + # lin0 has the largest output dimension, lin3 has the largest input dimension + # randlora_A should have the shape of (rank, largest_in), randlora_B should have the shape of (largest_out, rank) + assert randlora_A.shape == (config.r, 1, mlp.lin3.in_features) + assert randlora_B.shape == (mlp.lin0.out_features, 1, config.r) + + # should not raise + input = torch.randn(5, 10) + mlp_different_shapes(input) + + @pytest.mark.parametrize("dtype", [torch.float32, 
torch.float16, torch.bfloat16]) + def test_randlora_dtypes(self, dtype): + if dtype == torch.bfloat16: + # skip if bf16 is not supported on hardware, see #1872 + if not is_bf16_available(): + pytest.skip("bfloat16 not supported on this system, skipping the test") + + model = MLP().to(dtype) + config = RandLoraConfig(target_modules=["lin1", "lin2"], init_weights=False) + peft_model = get_peft_model(model, config) + inputs = torch.randn(5, 10).to(dtype) + output = peft_model(inputs) # should not raise + assert output.dtype == dtype diff --git a/peft/tests/test_seq_classifier.py b/peft/tests/test_seq_classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..eb0a3d38a49c2b4309e2d9076e5b40943b470671 --- /dev/null +++ b/peft/tests/test_seq_classifier.py @@ -0,0 +1,306 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License governing permissions and limitations under the License. 
+ +import pytest +import torch +from transformers import AutoModelForSequenceClassification + +from peft import ( + AdaLoraConfig, + BOFTConfig, + BoneConfig, + C3AConfig, + FourierFTConfig, + HRAConfig, + IA3Config, + LoraConfig, + MissConfig, + OFTConfig, + PrefixTuningConfig, + PromptEncoderConfig, + PromptTuningConfig, + PromptTuningInit, + RoadConfig, + ShiraConfig, + VBLoRAConfig, + VeraConfig, + WaveFTConfig, + get_peft_model, +) +from peft.utils.other import ModulesToSaveWrapper + +from .testing_common import PeftCommonTester +from .testing_utils import hub_online_once + + +PEFT_SEQ_CLS_MODELS_TO_TEST = [ + "hf-internal-testing/tiny-random-BertForSequenceClassification", + "hf-internal-testing/tiny-random-RobertaForSequenceClassification", + "trl-internal-testing/tiny-LlamaForSequenceClassification-3.2", +] + + +ALL_CONFIGS = [ + ( + AdaLoraConfig, + { + "task_type": "SEQ_CLS", + "target_modules": None, + "total_step": 1, + }, + ), + ( + BOFTConfig, + { + "task_type": "SEQ_CLS", + "target_modules": None, + }, + ), + ( + BoneConfig, + { + "task_type": "SEQ_CLS", + "target_modules": None, + "r": 2, + }, + ), + ( + MissConfig, + { + "task_type": "SEQ_CLS", + "target_modules": None, + "r": 2, + }, + ), + ( + FourierFTConfig, + { + "task_type": "SEQ_CLS", + "n_frequency": 10, + "target_modules": None, + }, + ), + ( + HRAConfig, + { + "task_type": "SEQ_CLS", + "target_modules": None, + }, + ), + ( + IA3Config, + { + "task_type": "SEQ_CLS", + "target_modules": None, + "feedforward_modules": None, + }, + ), + ( + LoraConfig, + { + "task_type": "SEQ_CLS", + "r": 8, + "lora_alpha": 32, + "target_modules": None, + "lora_dropout": 0.05, + "bias": "none", + }, + ), + # LoRA + trainable tokens + ( + LoraConfig, + { + "task_type": "SEQ_CLS", + "r": 8, + "lora_alpha": 32, + "target_modules": None, + "lora_dropout": 0.05, + "bias": "none", + "trainable_token_indices": [0, 1, 3], + }, + ), + ( + OFTConfig, + { + "task_type": "SEQ_CLS", + "target_modules": None, + }, + ), + ( 
+ PrefixTuningConfig, + { + "task_type": "SEQ_CLS", + "num_virtual_tokens": 10, + }, + ), + ( + PromptEncoderConfig, + { + "task_type": "SEQ_CLS", + "num_virtual_tokens": 10, + "encoder_hidden_size": 32, + }, + ), + ( + PromptTuningConfig, + { + "task_type": "SEQ_CLS", + "num_virtual_tokens": 10, + }, + ), + ( + RoadConfig, + { + "task_type": "SEQ_CLS", + "variant": "road_1", + "group_size": 2, + }, + ), + ( + ShiraConfig, + { + "r": 1, + "task_type": "SEQ_CLS", + "target_modules": None, + "init_weights": False, + }, + ), + ( + VBLoRAConfig, + { + "task_type": "SEQ_CLS", + "target_modules": None, + "vblora_dropout": 0.05, + "vector_length": 1, + "num_vectors": 2, + }, + ), + ( + VeraConfig, + { + "task_type": "SEQ_CLS", + "r": 8, + "target_modules": None, + "vera_dropout": 0.05, + "projection_prng_key": 0xFF, + "d_initial": 0.1, + "save_projection": True, + "bias": "none", + }, + ), + ( + C3AConfig, + { + "task_type": "SEQ_CLS", + "block_size": 1, + "target_modules": None, + }, + ), + ( + WaveFTConfig, + { + "task_type": "SEQ_CLS", + "n_frequency": 8, + "target_modules": None, + }, + ), +] + + +class TestSequenceClassificationModels(PeftCommonTester): + r""" + Tests for basic coverage of AutoModelForSequenceClassification and classification-specific cases. Most of the + functionality is probably already covered by other tests. 
+ """ + + transformers_class = AutoModelForSequenceClassification + + def skipTest(self, reason=""): + # for backwards compatibility with unittest style test classes + pytest.skip(reason) + + def prepare_inputs_for_testing(self): + input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) + attention_mask = torch.tensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) + return {"input_ids": input_ids, "attention_mask": attention_mask} + + @pytest.mark.parametrize("model_id", PEFT_SEQ_CLS_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_attributes_parametrized(self, model_id, config_cls, config_kwargs): + self._test_model_attr(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_SEQ_CLS_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_adapter_name(self, model_id, config_cls, config_kwargs): + self._test_adapter_name(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_SEQ_CLS_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_prepare_for_training_parametrized(self, model_id, config_cls, config_kwargs): + self._test_prepare_for_training(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_SEQ_CLS_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_prompt_tuning_text_prepare_for_training(self, model_id, config_cls, config_kwargs): + if config_cls != PromptTuningConfig: + pytest.skip(f"This test does not apply to {config_cls}") + config_kwargs = config_kwargs.copy() + config_kwargs["prompt_tuning_init"] = PromptTuningInit.TEXT + config_kwargs["prompt_tuning_init_text"] = "This is a test prompt." 
+ config_kwargs["tokenizer_name_or_path"] = model_id + self._test_prepare_for_training(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_SEQ_CLS_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained(self, model_id, config_cls, config_kwargs): + self._test_save_pretrained(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_SEQ_CLS_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained_pickle(self, model_id, config_cls, config_kwargs): + self._test_save_pretrained(model_id, config_cls, config_kwargs.copy(), safe_serialization=False) + + @pytest.mark.parametrize("model_id", PEFT_SEQ_CLS_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained_selected_adapters(self, model_id, config_cls, config_kwargs): + self._test_save_pretrained_selected_adapters(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_SEQ_CLS_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained_selected_adapters_pickle(self, model_id, config_cls, config_kwargs): + self._test_save_pretrained_selected_adapters( + model_id, config_cls, config_kwargs.copy(), safe_serialization=False + ) + + @pytest.mark.parametrize("model_id", PEFT_SEQ_CLS_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_from_pretrained_config_construction(self, model_id, config_cls, config_kwargs): + self._test_from_pretrained_config_construction(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id", PEFT_SEQ_CLS_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) + def test_modules_to_save_correctly_set(self, model_id, config_cls, config_kwargs): + # tests for a regression, introduced via #2220, 
where modules_to_save was not applied to prompt learning methods + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + base_model = model.get_base_model() + # classifier layer is called either "classifier" or "score" + classifier = getattr(base_model, "classifier", getattr(base_model, "score", None)) + if classifier is None: + raise ValueError(f"Could not determine classifier layer name for {model_id}, please fix the test") + assert isinstance(classifier, ModulesToSaveWrapper) diff --git a/peft/tests/test_shira.py b/peft/tests/test_shira.py new file mode 100644 index 0000000000000000000000000000000000000000..9845ee426ea85f4d2e91ce8d95dc43c54e1ce437 --- /dev/null +++ b/peft/tests/test_shira.py @@ -0,0 +1,278 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This test file is for tests specific to SHiRA. + +import os + +import pytest +import torch +from accelerate.utils.imports import is_bf16_available +from torch import nn + +from peft import PeftModel, ShiraConfig, get_peft_model + + +def custom_random_mask_function_with_custom_kwargs(custom_arg): + def mask_fn(base_layer, r): + """ + This mask function is similar to the random_mask provided in src/peft/tuners/shira/mask_functions.py except the + seed is derived from custom_kwargs. 
Please use this as an example to create your own custom sparse masks that + may use custom_kwargs. Remember, for a pretrained weight with shape m, n, mask_fn must return only one mask + (shape: m, n) which must be binary 0 or 1 with num_shira_parameters = r(m+n) for linear layers. Device and + dtype of mask must be same as base layer's weight's device and dtype. + """ + new_seed = custom_arg + shape = base_layer.weight.shape + num_shira_weights = r * (shape[0] + shape[1]) + random_generator = torch.Generator() + random_generator.manual_seed(new_seed) + + idx = (torch.randperm(base_layer.weight.numel(), generator=random_generator)[:num_shira_weights]).to( + base_layer.weight.device + ) + val = torch.ones_like(idx.type(base_layer.weight.dtype)) + mask = torch.zeros_like(base_layer.weight.view(1, -1)) + mask = mask.scatter_(1, idx.unsqueeze(0), val.unsqueeze(0)).view(shape) + + return mask + + return mask_fn + + +class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.relu = nn.ReLU() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.lin1 = nn.Linear(20, 40, bias=bias) # lin1 and lin2 have same shape + self.lin2 = nn.Linear(40, 30, bias=bias) + self.lin3 = nn.Linear(30, 10, bias=bias) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = self.lin0(X) + X = self.relu(X) + X = self.lin1(X) + X = self.relu(X) + X = self.lin2(X) + X = self.relu(X) + X = self.lin3(X) + X = self.sm(X) + return X + + +class TestShira: + @pytest.fixture + def mlp(self): + torch.manual_seed(0) + model = MLP() + return model + + def test_mlp_single_adapter_shapes(self, mlp): + # torch.manual_seed(0) + + r = 2 + config = ShiraConfig(r=r, target_modules=["lin1", "lin2"]) + # creates a default SHiRA adapter + peft_model = get_peft_model(mlp, config) + + shira_weight1_size = peft_model.base_model.model.lin1.shira_weight["default"].shape[0] + shira_weight2_size = peft_model.base_model.model.lin2.shira_weight["default"].shape[0] + shira_indices1_size = 
peft_model.base_model.model.lin1.shira_indices["default"].shape[1] + shira_indices2_size = peft_model.base_model.model.lin2.shira_indices["default"].shape[1] + + base_weight1_size = peft_model.base_model.model.lin1.base_layer.weight.shape + base_weight2_size = peft_model.base_model.model.lin2.base_layer.weight.shape + + delta_weight1_shape = peft_model.base_model.model.lin1.get_delta_weight("default").shape + delta_weight2_shape = peft_model.base_model.model.lin2.get_delta_weight("default").shape + + assert shira_weight1_size == r * (base_weight1_size[0] + base_weight1_size[1]) + assert shira_weight2_size == r * (base_weight2_size[0] + base_weight2_size[1]) + + assert shira_weight1_size == shira_indices1_size + assert shira_weight2_size == shira_indices2_size + + assert delta_weight1_shape == base_weight1_size + assert delta_weight2_shape == base_weight2_size + + return peft_model + + def test_multiple_adapters_save_load(self, mlp, tmp_path): + # check saving and loading works with multiple adapters + # note, the random seeds in the below two configs are not the default values. + # so it will lead to different random sparse masks between saving and loading. + # our goal is to make sure that loaded indices are exactly the same as the saved indices regardless of what initial random mask gets generated. + # we will also make sure that parameters are saved and loaded correctly, and the output remains the same. 
+ config = ShiraConfig(r=2, target_modules=["lin1", "lin2"], random_seed=56) + # creates a default SHiRA adapter + peft_model = get_peft_model(mlp, config, adapter_name="first") + config2 = ShiraConfig(r=3, target_modules=["lin1", "lin2", "lin3"], random_seed=67) + peft_model.add_adapter("second", config2) + + assert torch.all(peft_model.base_model.model.lin1.shira_weight["first"] == 0) + assert torch.all(peft_model.base_model.model.lin2.shira_weight["first"] == 0) + assert torch.all(peft_model.base_model.model.lin1.shira_weight["second"] == 0) + assert torch.all(peft_model.base_model.model.lin2.shira_weight["second"] == 0) + assert torch.all(peft_model.base_model.model.lin3.shira_weight["second"] == 0) + + shira_assign_val1_f = torch.randn_like(peft_model.base_model.model.lin1.shira_weight["first"]) + peft_model.base_model.model.lin1.shira_weight["first"] = shira_assign_val1_f + shira_indices1_f = peft_model.base_model.model.lin1.shira_indices["first"] + shira_assign_val2_f = torch.randn_like(peft_model.base_model.model.lin2.shira_weight["first"]) + peft_model.base_model.model.lin2.shira_weight["first"] = shira_assign_val2_f + shira_indices2_f = peft_model.base_model.model.lin2.shira_indices["first"] + + shira_assign_val1_s = torch.randn_like(peft_model.base_model.model.lin1.shira_weight["second"]) + peft_model.base_model.model.lin1.shira_weight["second"] = shira_assign_val1_s + shira_indices1_s = peft_model.base_model.model.lin1.shira_indices["second"] + shira_assign_val2_s = torch.randn_like(peft_model.base_model.model.lin2.shira_weight["second"]) + peft_model.base_model.model.lin2.shira_weight["second"] = shira_assign_val2_s + shira_indices2_s = peft_model.base_model.model.lin2.shira_indices["second"] + shira_assign_val3_s = torch.randn_like(peft_model.base_model.model.lin3.shira_weight["second"]) + peft_model.base_model.model.lin3.shira_weight["second"] = shira_assign_val3_s + shira_indices3_s = peft_model.base_model.model.lin3.shira_indices["second"] + + 
input = torch.randn(5, 10) + peft_model.set_adapter("first") + output_first = peft_model(input) + peft_model.set_adapter("second") + output_second = peft_model(input) + + # sanity check + assert not torch.allclose(output_first, output_second, atol=1e-3, rtol=1e-3) + + save_path = os.path.join(tmp_path, "shira") + peft_model.save_pretrained(save_path) + assert os.path.exists(os.path.join(save_path, "first", "adapter_config.json")) + assert os.path.exists(os.path.join(save_path, "second", "adapter_config.json")) + del peft_model + + torch.manual_seed(0) + mlp = MLP() + peft_model = PeftModel.from_pretrained(mlp, os.path.join(save_path, "first"), adapter_name="first") + peft_model.load_adapter(os.path.join(save_path, "second"), "second") + + peft_model.set_adapter("first") + output_first_loaded = peft_model(input) + peft_model.set_adapter("second") + output_second_loaded = peft_model(input) + + assert torch.allclose(output_first, output_first_loaded) + assert torch.allclose(output_second, output_second_loaded) + + assert torch.all(shira_assign_val1_f == peft_model.base_model.model.lin1.shira_weight["first"]) + assert torch.all(shira_assign_val2_f == peft_model.base_model.model.lin2.shira_weight["first"]) + assert torch.all(shira_indices1_f == peft_model.base_model.model.lin1.shira_indices["first"]) + assert torch.all(shira_indices2_f == peft_model.base_model.model.lin2.shira_indices["first"]) + assert torch.all(shira_assign_val1_s == peft_model.base_model.model.lin1.shira_weight["second"]) + assert torch.all(shira_assign_val2_s == peft_model.base_model.model.lin2.shira_weight["second"]) + assert torch.all(shira_assign_val3_s == peft_model.base_model.model.lin3.shira_weight["second"]) + assert torch.all(shira_indices1_s == peft_model.base_model.model.lin1.shira_indices["second"]) + assert torch.all(shira_indices2_s == peft_model.base_model.model.lin2.shira_indices["second"]) + assert torch.all(shira_indices3_s == 
peft_model.base_model.model.lin3.shira_indices["second"]) + + return peft_model + + def test_save_load_custom_mask_function(self, mlp, tmp_path): + # we want to see if saving and loading works when a custom mask is involved + config = ShiraConfig(r=2, mask_type="custom", target_modules=["lin1", "lin2"], init_weights=False) + custom_arg = 120 + custom_mask_fn = custom_random_mask_function_with_custom_kwargs(custom_arg) + config.mask_fn = custom_mask_fn + + # create a custom mask SHiRA adapter + peft_model = get_peft_model(mlp, config, adapter_name="first") + + shira_assign_val1_f = peft_model.base_model.model.lin1.shira_weight["first"] + shira_indices1_f = peft_model.base_model.model.lin1.shira_indices["first"] + shira_assign_val2_f = peft_model.base_model.model.lin2.shira_weight["first"] + shira_indices2_f = peft_model.base_model.model.lin2.shira_indices["first"] + + input = torch.randn(5, 10) + peft_model.set_adapter("first") + output_first = peft_model(input) + + save_path = os.path.join(tmp_path, "shira") + peft_model.save_pretrained(save_path) + assert os.path.exists(os.path.join(save_path, "first", "adapter_config.json")) + del peft_model + + torch.manual_seed(0) + mlp = MLP() + peft_model = PeftModel.from_pretrained(mlp, os.path.join(save_path, "first"), adapter_name="first") + + peft_model.set_adapter("first") + output_first_loaded = peft_model(input) + + assert torch.allclose(output_first, output_first_loaded) + + assert torch.all(shira_assign_val1_f == peft_model.base_model.model.lin1.shira_weight["first"]) + assert torch.all(shira_assign_val2_f == peft_model.base_model.model.lin2.shira_weight["first"]) + assert torch.all(shira_indices1_f == peft_model.base_model.model.lin1.shira_indices["first"]) + assert torch.all(shira_indices2_f == peft_model.base_model.model.lin2.shira_indices["first"]) + + return peft_model + + def test_save_load_default_random_mask_with_seed_function(self, mlp, tmp_path): + # we want to see if saving and loading works when a random 
mask is involved but the random seed is fixed. + config = ShiraConfig(r=2, target_modules=["lin1", "lin2"], random_seed=567, init_weights=False) + + # create a custom mask SHiRA adapter + peft_model = get_peft_model(mlp, config, adapter_name="first") + + shira_assign_val1_f = peft_model.base_model.model.lin1.shira_weight["first"] + shira_indices1_f = peft_model.base_model.model.lin1.shira_indices["first"] + shira_assign_val2_f = peft_model.base_model.model.lin2.shira_weight["first"] + shira_indices2_f = peft_model.base_model.model.lin2.shira_indices["first"] + + input = torch.randn(5, 10) + peft_model.set_adapter("first") + output_first = peft_model(input) + + save_path = os.path.join(tmp_path, "shira") + peft_model.save_pretrained(save_path) + assert os.path.exists(os.path.join(save_path, "first", "adapter_config.json")) + del peft_model + + torch.manual_seed(0) + mlp = MLP() + peft_model = PeftModel.from_pretrained(mlp, os.path.join(save_path, "first"), adapter_name="first") + + peft_model.set_adapter("first") + output_first_loaded = peft_model(input) + + assert torch.allclose(output_first, output_first_loaded) + + assert torch.all(shira_assign_val1_f == peft_model.base_model.model.lin1.shira_weight["first"]) + assert torch.all(shira_assign_val2_f == peft_model.base_model.model.lin2.shira_weight["first"]) + assert torch.all(shira_indices1_f == peft_model.base_model.model.lin1.shira_indices["first"]) + assert torch.all(shira_indices2_f == peft_model.base_model.model.lin2.shira_indices["first"]) + + return peft_model + + @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) + def test_shira_dtypes(self, dtype): + if dtype == torch.bfloat16: + # skip if bf16 is not supported on hardware, see #1872 + if not is_bf16_available(): + pytest.skip("bfloat16 not supported on this system, skipping the test") + + model = MLP().to(dtype) + config = ShiraConfig(r=2, target_modules=["lin1", "lin2"]) + peft_model = get_peft_model(model, config) + 
inputs = torch.randn(5, 10).to(dtype) + output = peft_model(inputs) # should not raise + assert output.dtype == dtype diff --git a/peft/tests/test_stablediffusion.py b/peft/tests/test_stablediffusion.py new file mode 100644 index 0000000000000000000000000000000000000000..8eb18dc9a682806bab9a1e5a120160487f525ca1 --- /dev/null +++ b/peft/tests/test_stablediffusion.py @@ -0,0 +1,387 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +from dataclasses import asdict, replace + +import numpy as np +import pytest +from diffusers import StableDiffusionPipeline + +from peft import ( + BOFTConfig, + HRAConfig, + LoHaConfig, + LoKrConfig, + LoraConfig, + OFTConfig, + get_peft_model, + get_peft_model_state_dict, + inject_adapter_in_model, + set_peft_model_state_dict, +) +from peft.tuners.tuners_utils import BaseTunerLayer + +from .testing_common import PeftCommonTester +from .testing_utils import set_init_weights_false, temp_seed + + +PEFT_DIFFUSERS_SD_MODELS_TO_TEST = ["hf-internal-testing/tiny-sd-pipe"] +DIFFUSERS_CONFIGS = [ + ( + LoraConfig, + { + "text_encoder": { + "r": 8, + "lora_alpha": 32, + "target_modules": ["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + "lora_dropout": 0.0, + "bias": "none", + "init_lora_weights": False, + }, + "unet": { + "r": 8, + "lora_alpha": 32, + "target_modules": [ + "proj_in", + "proj_out", + "to_k", + "to_q", + "to_v", + "to_out.0", + "ff.net.0.proj", + "ff.net.2", 
+ ], + "lora_dropout": 0.0, + "bias": "none", + "init_lora_weights": False, + }, + }, + ), + ( + LoHaConfig, + { + "text_encoder": { + "r": 8, + "alpha": 32, + "target_modules": ["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + "rank_dropout": 0.0, + "module_dropout": 0.0, + "init_weights": False, + }, + "unet": { + "r": 8, + "alpha": 32, + "target_modules": [ + "proj_in", + "proj_out", + "to_k", + "to_q", + "to_v", + "to_out.0", + "ff.net.0.proj", + "ff.net.2", + ], + "rank_dropout": 0.0, + "module_dropout": 0.0, + "init_weights": False, + }, + }, + ), + ( + LoKrConfig, + { + "text_encoder": { + "r": 8, + "alpha": 32, + "target_modules": ["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + "rank_dropout": 0.0, + "module_dropout": 0.0, + "init_weights": False, + }, + "unet": { + "r": 8, + "alpha": 32, + "target_modules": [ + "proj_in", + "proj_out", + "to_k", + "to_q", + "to_v", + "to_out.0", + "ff.net.0.proj", + "ff.net.2", + ], + "rank_dropout": 0.0, + "module_dropout": 0.0, + "init_weights": False, + }, + }, + ), + ( + OFTConfig, + { + "text_encoder": { + "r": 1, + "oft_block_size": 0, + "target_modules": ["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + "module_dropout": 0.0, + "init_weights": False, + "use_cayley_neumann": False, + }, + "unet": { + "r": 1, + "oft_block_size": 0, + "target_modules": [ + "proj_in", + "proj_out", + "to_k", + "to_q", + "to_v", + "to_out.0", + "ff.net.0.proj", + "ff.net.2", + ], + "module_dropout": 0.0, + "init_weights": False, + "use_cayley_neumann": False, + }, + }, + ), + ( + BOFTConfig, + { + "text_encoder": { + "boft_block_num": 1, + "boft_block_size": 0, + "target_modules": ["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + "boft_dropout": 0.0, + "init_weights": False, + }, + "unet": { + "boft_block_num": 1, + "boft_block_size": 0, + "target_modules": [ + "proj_in", + "proj_out", + "to_k", + "to_q", + "to_v", + "to_out.0", + "ff.net.0.proj", + "ff.net.2", + ], + "boft_dropout": 0.0, + 
"init_weights": False, + }, + }, + ), + ( + HRAConfig, + { + "text_encoder": { + "r": 8, + "target_modules": ["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + "init_weights": False, + }, + "unet": { + "r": 8, + "target_modules": [ + "proj_in", + "proj_out", + "to_k", + "to_q", + "to_v", + "to_out.0", + "ff.net.0.proj", + "ff.net.2", + ], + "init_weights": False, + }, + }, + ), +] + + +def skip_if_not_lora(config_cls): + if config_cls != LoraConfig: + pytest.skip("Skipping test because it is only applicable to LoraConfig") + + +class TestStableDiffusionModel(PeftCommonTester): + r""" + Tests that diffusers StableDiffusion model works with PEFT as expected. + """ + + transformers_class = StableDiffusionPipeline + sd_model = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-sd-pipe") + + def instantiate_sd_peft(self, model_id, config_cls, config_kwargs): + # Instantiate StableDiffusionPipeline + if model_id == "hf-internal-testing/tiny-sd-pipe": + # in CI, this model often times out on the hub, let's cache it + model = copy.deepcopy(self.sd_model) + else: + model = self.transformers_class.from_pretrained(model_id) + + config_kwargs = config_kwargs.copy() + text_encoder_kwargs = config_kwargs.pop("text_encoder") + unet_kwargs = config_kwargs.pop("unet") + # the remaining config kwargs should be applied to both configs + for key, val in config_kwargs.items(): + text_encoder_kwargs[key] = val + unet_kwargs[key] = val + + # Instantiate text_encoder adapter + config_text_encoder = config_cls(**text_encoder_kwargs) + model.text_encoder = get_peft_model(model.text_encoder, config_text_encoder) + + # Instantiate unet adapter + config_unet = config_cls(**unet_kwargs) + model.unet = get_peft_model(model.unet, config_unet) + + # Move model to device + model = model.to(self.torch_device) + + return model + + def prepare_inputs_for_testing(self): + return { + "prompt": "a high quality digital photo of a cute corgi", + "num_inference_steps": 3, + } + + 
@pytest.mark.parametrize("model_id", PEFT_DIFFUSERS_SD_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", DIFFUSERS_CONFIGS) + def test_merge_layers(self, model_id, config_cls, config_kwargs): + if (config_cls == LoKrConfig) and (self.torch_device not in ["cuda", "xpu"]): + pytest.skip("Merging test with LoKr fails without GPU") + + # Instantiate model & adapters + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + model = self.instantiate_sd_peft(model_id, config_cls, config_kwargs) + + # Generate output for peft modified StableDiffusion + dummy_input = self.prepare_inputs_for_testing() + with temp_seed(seed=42): + peft_output = np.array(model(**dummy_input).images[0]).astype(np.float32) + + # Merge adapter and model + if config_cls not in [LoHaConfig, OFTConfig, HRAConfig]: + # TODO: Merging the text_encoder is leading to issues on CPU with PyTorch 2.1 + model.text_encoder = model.text_encoder.merge_and_unload() + model.unet = model.unet.merge_and_unload() + + # Generate output for peft merged StableDiffusion + with temp_seed(seed=42): + merged_output = np.array(model(**dummy_input).images[0]).astype(np.float32) + + # Images are in uint8 drange, so use large atol + assert np.allclose(peft_output, merged_output, atol=1.0) + + @pytest.mark.parametrize("model_id", PEFT_DIFFUSERS_SD_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", DIFFUSERS_CONFIGS) + def test_merge_layers_safe_merge(self, model_id, config_cls, config_kwargs): + if (config_cls == LoKrConfig) and (self.torch_device not in ["cuda", "xpu"]): + pytest.skip("Merging test with LoKr fails without GPU") + + # Instantiate model & adapters + model = self.instantiate_sd_peft(model_id, config_cls, config_kwargs) + + # Generate output for peft modified StableDiffusion + dummy_input = self.prepare_inputs_for_testing() + with temp_seed(seed=42): + peft_output = np.array(model(**dummy_input).images[0]).astype(np.float32) + + # Merge adapter and model + if 
config_cls not in [LoHaConfig, OFTConfig, HRAConfig]: + # TODO: Merging the text_encoder is leading to issues on CPU with PyTorch 2.1 + model.text_encoder = model.text_encoder.merge_and_unload(safe_merge=True) + model.unet = model.unet.merge_and_unload(safe_merge=True) + + # Generate output for peft merged StableDiffusion + with temp_seed(seed=42): + merged_output = np.array(model(**dummy_input).images[0]).astype(np.float32) + + # Images are in uint8 drange, so use large atol + assert np.allclose(peft_output, merged_output, atol=1.0) + + @pytest.mark.parametrize("model_id", PEFT_DIFFUSERS_SD_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", DIFFUSERS_CONFIGS) + def test_add_weighted_adapter_base_unchanged(self, model_id, config_cls, config_kwargs): + skip_if_not_lora(config_cls) + # Instantiate model & adapters + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + model = self.instantiate_sd_peft(model_id, config_cls, config_kwargs) + + # Get current available adapter config + text_encoder_adapter_name = next(iter(model.text_encoder.peft_config.keys())) + unet_adapter_name = next(iter(model.unet.peft_config.keys())) + text_encoder_adapter_config = replace(model.text_encoder.peft_config[text_encoder_adapter_name]) + unet_adapter_config = replace(model.unet.peft_config[unet_adapter_name]) + + # Create weighted adapters + model.text_encoder.add_weighted_adapter([unet_adapter_name], [0.5], "weighted_adapter_test") + model.unet.add_weighted_adapter([unet_adapter_name], [0.5], "weighted_adapter_test") + + # Assert that base adapters config did not change + assert asdict(text_encoder_adapter_config) == asdict(model.text_encoder.peft_config[text_encoder_adapter_name]) + assert asdict(unet_adapter_config) == asdict(model.unet.peft_config[unet_adapter_name]) + + @pytest.mark.parametrize("model_id", PEFT_DIFFUSERS_SD_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", DIFFUSERS_CONFIGS) + def test_disable_adapter(self, 
model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_disable_adapter(model_id, config_cls, config_kwargs) + + @pytest.mark.parametrize("model_id", PEFT_DIFFUSERS_SD_MODELS_TO_TEST) + @pytest.mark.parametrize("config_cls,config_kwargs", DIFFUSERS_CONFIGS) + def test_load_model_low_cpu_mem_usage(self, model_id, config_cls, config_kwargs): + # Instantiate model & adapters + pipe = self.instantiate_sd_peft(model_id, config_cls, config_kwargs) + + te_state_dict = get_peft_model_state_dict(pipe.text_encoder) + unet_state_dict = get_peft_model_state_dict(pipe.unet) + + del pipe + pipe = self.instantiate_sd_peft(model_id, config_cls, config_kwargs) + + config_kwargs = config_kwargs.copy() + text_encoder_kwargs = config_kwargs.pop("text_encoder") + unet_kwargs = config_kwargs.pop("unet") + # the remaining config kwargs should be applied to both configs + for key, val in config_kwargs.items(): + text_encoder_kwargs[key] = val + unet_kwargs[key] = val + + config_text_encoder = config_cls(**text_encoder_kwargs) + config_unet = config_cls(**unet_kwargs) + + # check text encoder + inject_adapter_in_model(config_text_encoder, pipe.text_encoder, low_cpu_mem_usage=True) + # sanity check that the adapter was applied: + assert any(isinstance(module, BaseTunerLayer) for module in pipe.text_encoder.modules()) + + assert "meta" in {p.device.type for p in pipe.text_encoder.parameters()} + set_peft_model_state_dict(pipe.text_encoder, te_state_dict, low_cpu_mem_usage=True) + assert "meta" not in {p.device.type for p in pipe.text_encoder.parameters()} + + # check unet + inject_adapter_in_model(config_unet, pipe.unet, low_cpu_mem_usage=True) + # sanity check that the adapter was applied: + assert any(isinstance(module, BaseTunerLayer) for module in pipe.unet.modules()) + + assert "meta" in {p.device.type for p in pipe.unet.parameters()} + set_peft_model_state_dict(pipe.unet, unet_state_dict, low_cpu_mem_usage=True) + assert 
"meta" not in {p.device.type for p in pipe.unet.parameters()} diff --git a/peft/tests/test_target_parameters.py b/peft/tests/test_target_parameters.py new file mode 100644 index 0000000000000000000000000000000000000000..adffbce0d5b82f85bfe1ba36f5761807a97cbd87 --- /dev/null +++ b/peft/tests/test_target_parameters.py @@ -0,0 +1,507 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import torch +from torch import nn +from transformers import AutoModelForCausalLM + +from peft import LoraConfig, TaskType, get_peft_model + +from .testing_common import PeftCommonTester +from .testing_utils import hub_online_once, set_init_weights_false + + +ALL_CONFIGS = [ + ########## + # Llama4 # + ########## + # target down_proj + ( + "trl-internal-testing/tiny-Llama4ForCausalLM", + LoraConfig, + { + "task_type": TaskType.CAUSAL_LM, + "target_modules": [], + "lora_dropout": 0.0, + "target_parameters": [ + "feed_forward.experts.down_proj", + ], + }, + ), + # target gate_up_proj and down_proj, but not on the same module + ( + "trl-internal-testing/tiny-Llama4ForCausalLM", + LoraConfig, + { + "task_type": TaskType.CAUSAL_LM, + "target_modules": [], + "lora_dropout": 0.0, + "target_parameters": [ + "0.feed_forward.experts.gate_up_proj", + "1.feed_forward.experts.down_proj", + ], + }, + ), + # target down_proj and gate_up_proj on the same module + ( + "trl-internal-testing/tiny-Llama4ForCausalLM", + LoraConfig, + { + 
"task_type": "CAUSAL_LM", + "r": 8, + "lora_alpha": 32, + "target_modules": None, + "lora_dropout": 0.0, + "bias": "none", + "target_parameters": [ + "feed_forward.experts.down_proj", + "feed_forward.experts.gate_up_proj", + ], + }, + ), + # target q_proj, v_proj as modules, and down_proj as parameter + ( + "trl-internal-testing/tiny-Llama4ForCausalLM", + LoraConfig, + { + "task_type": TaskType.CAUSAL_LM, + "target_modules": ["q_proj", "v_proj"], + "lora_dropout": 0.0, + "target_parameters": [ + "feed_forward.experts.down_proj", + ], + }, + ), + ########### + # gpt-oss # + ########### + # target down_proj + ( + "trl-internal-testing/tiny-GptOssForCausalLM", + LoraConfig, + { + "task_type": TaskType.CAUSAL_LM, + "target_modules": [], + "lora_dropout": 0.0, + "target_parameters": [ + "mlp.experts.down_proj", + ], + }, + ), + # target gate_up_proj and down_proj, but not on the same module + ( + "trl-internal-testing/tiny-GptOssForCausalLM", + LoraConfig, + { + "task_type": TaskType.CAUSAL_LM, + "target_modules": [], + "lora_dropout": 0.0, + "target_parameters": [ + "0.mlp.experts.gate_up_proj", + "1.mlp.experts.down_proj", + ], + }, + ), + # target down_proj and gate_up_proj on the same module + ( + "trl-internal-testing/tiny-GptOssForCausalLM", + LoraConfig, + { + "task_type": "CAUSAL_LM", + "r": 8, + "lora_alpha": 32, + "target_modules": None, + "lora_dropout": 0.0, + "bias": "none", + "target_parameters": [ + "mlp.experts.down_proj", + "mlp.experts.gate_up_proj", + ], + }, + ), + # target q_proj, v_proj as modules, and down_proj as parameter + ( + "trl-internal-testing/tiny-GptOssForCausalLM", + LoraConfig, + { + "task_type": TaskType.CAUSAL_LM, + "target_modules": ["q_proj", "v_proj"], + "lora_dropout": 0.0, + "target_parameters": [ + "mlp.experts.down_proj", + ], + }, + ), +] + + +class MyAutoModelForCausalLM(AutoModelForCausalLM): + @classmethod + def from_pretrained(cls, *args, **kwargs): + torch.manual_seed(0) + model = 
AutoModelForCausalLM.from_pretrained(*args, **kwargs) + + # check that we load the original model, not, say, a trained checkpoint + if args[0] == "trl-internal-testing/tiny-Llama4ForCausalLM": + # model contains weights with values ~1e36 or nan, so we need to reinitialize with sane values + with torch.no_grad(): + for param in model.parameters(): + param.data = torch.randn(param.shape) + return model + + +class TestDecoderModelsTargetParameters(PeftCommonTester): + # This is more or less a copy of TestDecoderModels at the time of the PR being added. Unnecessary code is removed, + # like code required for testing non-LoRA methods. The tests being included are not selected to test specific + # functionality of targeting nn.Parameters, they (together with the tests in test_custom_models.py) just ensure that + # generally, nothing is broken. + transformers_class = MyAutoModelForCausalLM + + def skipTest(self, reason=""): + # for backwards compatibility with unittest style test classes + pytest.skip(reason) + + def prepare_inputs_for_testing(self): + input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) + attention_mask = torch.tensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) + return {"input_ids": input_ids, "attention_mask": attention_mask} + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_attributes_parametrized(self, model_id, config_cls, config_kwargs): + self._test_model_attr(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_adapter_name(self, model_id, config_cls, config_kwargs): + self._test_adapter_name(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_prepare_for_training_parametrized(self, model_id, config_cls, config_kwargs): + self._test_prepare_for_training(model_id, config_cls, config_kwargs.copy()) + + 
@pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained(self, model_id, config_cls, config_kwargs): + self._test_save_pretrained(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained_pickle(self, model_id, config_cls, config_kwargs): + self._test_save_pretrained(model_id, config_cls, config_kwargs.copy(), safe_serialization=False) + + @pytest.mark.skip(reason="Multiple adapters with target_parameters are not supported yet.") + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained_selected_adapters(self, model_id, config_cls, config_kwargs): + self._test_save_pretrained_selected_adapters(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.skip(reason="Multiple adapters with target_parameters are not supported yet.") + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_save_pretrained_selected_adapters_pickle(self, model_id, config_cls, config_kwargs): + self._test_save_pretrained_selected_adapters( + model_id, config_cls, config_kwargs.copy(), safe_serialization=False + ) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_from_pretrained_config_construction(self, model_id, config_cls, config_kwargs): + self._test_from_pretrained_config_construction(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_merge_layers(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_merge_layers(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.skip(reason="Multiple adapters with target_parameters are not supported yet.") + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_merge_layers_multi(self, model_id, config_cls, 
config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_merge_layers_multi(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_merge_layers_nan(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_merge_layers_nan(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.skip(reason="Multiple adapters with target_parameters are not supported yet.") + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_mixed_adapter_batches(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + msg = "lora.ParamWrapper does not support mixed adapter batches yet." + with pytest.raises(ValueError, match=msg): + self._test_mixed_adapter_batches(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.skip(reason="Multiple adapters with target_parameters are not supported yet.") + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_generate_with_mixed_adapter_batches(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + msg = "lora.ParamWrapper does not support mixed adapter batches yet." 
+ with pytest.raises(ValueError, match=msg): + self._test_generate_with_mixed_adapter_batches_and_beam_search(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_generate(self, model_id, config_cls, config_kwargs): + self._test_generate(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_generate_pos_args(self, model_id, config_cls, config_kwargs): + self._test_generate_pos_args(model_id, config_cls, config_kwargs.copy(), raises_err=False) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_merge_layers_fp16(self, model_id, config_cls, config_kwargs): + self._test_merge_layers_fp16(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_generate_half_prec(self, model_id, config_cls, config_kwargs): + self._test_generate_half_prec(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_training_decoders(self, model_id, config_cls, config_kwargs): + self._test_training(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_training_decoders_gradient_checkpointing(self, model_id, config_cls, config_kwargs): + self._test_training_gradient_checkpointing(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_inference_safetensors(self, model_id, config_cls, config_kwargs): + self._test_inference_safetensors(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_peft_model_device_map(self, model_id, config_cls, config_kwargs): + self._test_peft_model_device_map(model_id, config_cls, 
config_kwargs.copy()) + + @pytest.mark.skip(reason="Multiple adapters with target_parameters are not supported yet.") + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_delete_adapter(self, model_id, config_cls, config_kwargs): + self._test_delete_adapter(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.skip(reason="Multiple adapters with target_parameters are not supported yet.") + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_delete_inactive_adapter(self, model_id, config_cls, config_kwargs): + self._test_delete_inactive_adapter(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_adding_multiple_adapters_with_bias_raises(self, model_id, config_cls, config_kwargs): + self._test_adding_multiple_adapters_with_bias_raises(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_unload_adapter(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_unload_adapter(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.skip(reason="Multiple adapters with target_parameters are not supported yet.") + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_weighted_combination_of_adapters(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + msg = "add_weighted_adapter does not support targeting nn.Parameter" + with pytest.raises(ValueError, match=msg): + self._test_weighted_combination_of_adapters(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_training_prompt_learning_tasks(self, model_id, config_cls, config_kwargs): + self._test_training_prompt_learning_tasks(model_id, config_cls, 
config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_disable_adapter(self, model_id, config_cls, config_kwargs): + config_kwargs = set_init_weights_false(config_cls, config_kwargs) + self._test_disable_adapter(model_id, config_cls, config_kwargs.copy()) + + @pytest.mark.parametrize("model_id,config_cls,config_kwargs", ALL_CONFIGS) + def test_passing_input_embeds_works(self, model_id, config_cls, config_kwargs): + self._test_passing_input_embeds_works("", model_id, config_cls, config_kwargs.copy()) + + +class TestTargetParameters: + # Tests specifically designed for target_parameters + def test_targeting_module_and_targeting_param_equivalent(self): + # Test that using LoRA with target_modules vs target_parameters yields identical results. + # note: we purposely target the gate_proj because its weight is not square (unlike q_proj, ...), this makes it + # easier to catch shape errors + torch.manual_seed(0) + model_id = "hf-internal-testing/tiny-random-LlamaForCausalLM" + with hub_online_once(model_id): + model0 = AutoModelForCausalLM.from_pretrained(model_id) + x = torch.arange(10).view(2, 5) + with torch.inference_mode(): + out_base = model0(x, output_hidden_states=True).hidden_states[-1] + + # targeting the module + config0 = LoraConfig(target_modules=["gate_proj"], init_lora_weights=False) + model0 = get_peft_model(model0, config0) + + # targeting the parameter + model1 = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-LlamaForCausalLM") + config1 = LoraConfig(target_modules=[], target_parameters=["gate_proj.weight"], init_lora_weights=False) + model1 = get_peft_model(model1, config1) + + gate_proj_0_0 = model0.base_model.model.model.layers[0].mlp.gate_proj + gate_proj_0_1 = model0.base_model.model.model.layers[1].mlp.gate_proj + gate_proj_1_0 = model1.base_model.model.model.layers[0].mlp.gate_proj + gate_proj_1_1 = model1.base_model.model.model.layers[1].mlp.gate_proj + + # ensure 
that the randomly initialized LoRA weights are identical + gate_proj_1_0.lora_A.default.weight.data.copy_(gate_proj_0_0.lora_A.default.weight.data) + gate_proj_1_1.lora_A.default.weight.data.copy_(gate_proj_0_1.lora_A.default.weight.data) + gate_proj_1_0.lora_B.default.weight.data.copy_(gate_proj_0_0.lora_B.default.weight.data) + gate_proj_1_1.lora_B.default.weight.data.copy_(gate_proj_0_1.lora_B.default.weight.data) + + with torch.inference_mode(): + out_lora_0 = model0(x, output_hidden_states=True).hidden_states[-1] + out_lora_1 = model1(x, output_hidden_states=True).hidden_states[-1] + + # sanity check: basemodel outputs should be different + atol, rtol = 1e-6, 1e-6 + assert not torch.allclose(out_base, out_lora_0, atol=atol, rtol=rtol) + + # LoRA outputs should be the same + assert torch.allclose(out_lora_0, out_lora_1, atol=atol, rtol=rtol) + + def test_target_multiple_parameters_on_same_module(self, monkeypatch): + # test that if we target multiple nn.Parameters on the same module, all of them are being used during the + # forward pass + torch.manual_seed(0) + model_id = "trl-internal-testing/tiny-Llama4ForCausalLM" + with hub_online_once(model_id): + x = torch.arange(10).view(2, 5) + model = MyAutoModelForCausalLM.from_pretrained(model_id) + shape_gate_up_proj = model.model.layers[0].feed_forward.experts.gate_up_proj.shape + shape_down_proj = model.model.layers[0].feed_forward.experts.down_proj.shape + num_layers = len(model.model.layers) + + target_parameters = ["feed_forward.experts.gate_up_proj", "feed_forward.experts.down_proj"] + num_params = len(target_parameters) + config = LoraConfig(target_parameters=target_parameters, init_lora_weights=False) + model = get_peft_model(model, config) + + # CHECK FORWARD CALLS + + # log the weights seen during the forward call + weights = [] + + def mock_forward(self, W): + weights.append(W) + return orig_forward(self, W) + + from peft.tuners.lora.layer import _LoraParameterProxy + + orig_forward = 
_LoraParameterProxy.forward + monkeypatch.setattr(_LoraParameterProxy, "forward", mock_forward) + + num_steps = 3 + with torch.inference_mode(): + for _ in range(num_steps): + out_base = model(x, output_hidden_states=True).hidden_states[-1] + + actual_call_count = len(weights) + # Note: We call forward twice per step, once to create the parametrization and once for the actual forward + # step. This may be a bit wasteful but it's not clear how to prevent this and overall is probably negligible + num_forward_per_step = 2 + # Since https://github.com/huggingface/transformers/pull/39501, one of the parameters is accessed twice per + # forward call, so add +1. + expected_call_count = num_steps * num_layers * (1 + num_params * num_forward_per_step) + assert actual_call_count == expected_call_count + + actual_shapes = {W.shape for W in weights} + expected_shapes = {shape_gate_up_proj, shape_down_proj} + assert actual_shapes == expected_shapes + + # CHECK WEIGHT UPDATES + + lora_weights_before = { + k: v.clone() for k, v in model.named_parameters() if "lora_A.default" in k or "lora_B.default" in k + } + # sanity check: + assert len(lora_weights_before) == 2 * num_layers * num_params + # train + optim = torch.optim.SGD(model.parameters(), lr=0.01) + for _ in range(10): + optim.zero_grad() + out = model(x) + loss = out.logits.sum() + loss.backward() + optim.step() + + lora_weights_after = { + k: v for k, v in model.named_parameters() if "lora_A.default" in k or "lora_B.default" in k + } + assert lora_weights_before.keys() == lora_weights_after.keys() + atol, rtol = 0.1, 0.1 + for key in lora_weights_before.keys(): + assert not torch.allclose(lora_weights_before[key], lora_weights_after[key], atol=atol, rtol=rtol) + + def test_target_parameters_works_with_existing_parametrization(self): + # When a parameter is already parametrized, we want the LoRA parametrization to work with it correctly. 
+ class MyLinear(nn.Linear): + # For testing purposes, define a linear layer with 2 parameters: weight and other_weight. + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + nn.init.ones_(self.weight) + self.other_weight = nn.Parameter(torch.ones(self.weight.shape)) + + class MyModule(nn.Module): + def __init__(self): + super().__init__() + self.lin = MyLinear(2, 2, bias=False) + + def forward(self, x): + return self.lin(x) + + class MyParametrization(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x + 1 + + # base model + model = MyModule() + x = torch.ones((2, 2)) + + # sanity check: result should be 1*1 + 1*1 == 2 + output_base = model(x) + assert torch.all(output_base == 2) + + # add parametrization to the weight + nn.utils.parametrize.register_parametrization(model.lin, "weight", MyParametrization()) + + # result should be (1+1)*1 + (1+1)*1 == 4 + output_parametrized = model(x) + assert torch.all(output_parametrized == 4) + + # add LoRA parametrization to the weight + config = LoraConfig(r=2, lora_alpha=6, target_parameters=["lin.weight"], init_lora_weights=False) + model = get_peft_model(model, config) + # manually set LoRA weights to ones + nn.init.ones_(model.base_model.model.lin.lora_A["default"].weight) + nn.init.ones_(model.base_model.model.lin.lora_B["default"].weight) + + output_lora = model(x) + # delta_weight should be: (1+1) * lora_scale = (1+1) * (alpha / rank) = 2 * (6 / 2) = 6 + # result should be: (1+1+6)*1 + (1+1+6)*1 == 8 + 8 == 16 + assert torch.all(output_lora == 16) + + # calling twice should yield the same result + output_lora2 = model(x) + assert torch.allclose(output_lora, output_lora2) + + # add another LoRA parametrization to other_weight, should have no effect on the output + config = LoraConfig(r=2, lora_alpha=6, target_parameters=["lin.other_weight"], init_lora_weights=False) + model.add_adapter("other", config) + + output_other_lora = model(x) + # delta_weight should 
be: (1+1) * lora_scale = (1+1) * (alpha / rank) = 2 * (6 / 2) = 6 + # result should be: (1+1+6)*1 + (1+1+6)*1 == 8 + 8 == 16 + assert torch.all(output_other_lora == output_lora) + + # after unloading, the output should be the same as before LoRA was applied + unloaded = model.unload() + output_unloaded = unloaded(x) + assert torch.all(output_unloaded == output_parametrized) diff --git a/peft/tests/test_torch_compile.py b/peft/tests/test_torch_compile.py new file mode 100644 index 0000000000000000000000000000000000000000..a7ad045b91e0229dadbb2d13f9bf46925f0b2412 --- /dev/null +++ b/peft/tests/test_torch_compile.py @@ -0,0 +1,599 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The intent of the tests contained in this file is to check as many PEFT features as possible with torch.compile. This +# is thus a document on how well torch.compile is supported by PEFT. Currently, we know that certain features do not +# work with torch.compile. The corresponding tests should be marked with `@pytest.mark.xfail(strict=True)`. +# +# When adding a new test that fails with torch.compile, please make sure first that it does NOT fail without +# torch.compile. 
+ +import gc +import os + +import pytest +import torch +from accelerate.utils.memory import clear_device_cache +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + DataCollatorForLanguageModeling, + Trainer, + TrainerCallback, + TrainingArguments, +) + +from peft import ( + AdaLoraConfig, + BOFTConfig, + BoneConfig, + HRAConfig, + IA3Config, + LNTuningConfig, + LoHaConfig, + LoKrConfig, + LoraConfig, + MissConfig, + OFTConfig, + PeftModel, + TaskType, + VBLoRAConfig, + VeraConfig, + get_peft_model, +) + +from .testing_utils import load_dataset_english_quotes, require_bitsandbytes + + +# only run (very slow) torch.compile tests when explicitly asked to +if os.environ.get("PEFT_DEBUG_WITH_TORCH_COMPILE") != "1": + pytest.skip(allow_module_level=True) + + +# Mapping: name of the setting -> (Peft config instance, torch.compile kwargs) +SETTINGS = { + "adalora": (AdaLoraConfig(task_type=TaskType.CAUSAL_LM, total_step=5), {}), + "boft": (BOFTConfig(task_type=TaskType.CAUSAL_LM), {}), + "dora": (LoraConfig(task_type=TaskType.CAUSAL_LM, use_dora=True), {}), + "ia3": (IA3Config(task_type=TaskType.CAUSAL_LM), {}), + "ln_tuning": (LNTuningConfig(task_type=TaskType.CAUSAL_LM, target_modules=["final_layer_norm"]), {}), + "loha": (LoHaConfig(task_type=TaskType.CAUSAL_LM, target_modules=["q_proj", "v_proj"]), {}), + "lokr": pytest.param( + (LoKrConfig(task_type=TaskType.CAUSAL_LM, target_modules=["q_proj", "v_proj"]), {}), + ), + "lora": (LoraConfig(task_type=TaskType.CAUSAL_LM), {}), + "lora-target-embeddings": pytest.param( + (LoraConfig(task_type=TaskType.CAUSAL_LM, target_modules=["embed_tokens"]), {}), + ), + "lora-with-modules-to-save": (LoraConfig(task_type=TaskType.CAUSAL_LM, modules_to_save=["embed_tokens"]), {}), + "oft": (OFTConfig(task_type=TaskType.CAUSAL_LM, target_modules=["q_proj", "v_proj"]), {}), + "vblora": (VBLoRAConfig(task_type=TaskType.CAUSAL_LM, target_modules=["q_proj", "v_proj"], vector_length=2), {}), + "vera": 
(VeraConfig(task_type=TaskType.CAUSAL_LM), {}), + "hra": (HRAConfig(task_type=TaskType.CAUSAL_LM, target_modules=["q_proj", "v_proj"]), {}), + "bone": (BoneConfig(task_type=TaskType.CAUSAL_LM, target_modules=["q_proj", "v_proj"], r=2), {}), + "bone-bat": ( + BoneConfig(task_type=TaskType.CAUSAL_LM, target_modules=["q_proj", "v_proj"], r=2, init_weights="bat"), + {}, + ), + "miss": (MissConfig(task_type=TaskType.CAUSAL_LM, target_modules=["q_proj", "v_proj"], r=2), {}), + "miss-bat": ( + MissConfig(task_type=TaskType.CAUSAL_LM, target_modules=["q_proj", "v_proj"], r=2, init_weights="bat"), + {}, + ), + "miss-mini": ( + MissConfig(task_type=TaskType.CAUSAL_LM, target_modules=["q_proj", "v_proj"], r=2, init_weights="mini"), + {}, + ), +} + + +@pytest.mark.single_gpu_tests +class TestTorchCompileCausalLM: + """ + Tests for using torch.compile with causal LM. + + Tip: When adding a new test, set `fake_compile = True` below. With this setting, torch.compile is being skipped. + This is useful for two reasons: + + - compile is slow, so to quickly iterate on the test, it's best to disable it and only enable it at the very end + - even if you expect the test to fail with compile, as compile does not work with every PEFT feature, it still MUST + succeed without compile, otherwise the test is incorrect. + + Before creating the PR, disable `fake_compile`. + """ + + fake_compile = False + model_id = "hf-internal-testing/tiny-random-OPTForCausalLM" + max_train_loss = 15.0 # generous threshold for maximum loss after training + + @pytest.fixture(autouse=True) + def teardown(self): + r""" + Efficient mechanism to free GPU memory after each test. 
Based on + https://github.com/huggingface/transformers/issues/21094 + """ + clear_device_cache(garbage_collection=True) + gc.collect() + + @pytest.fixture(scope="class") + def tokenizer(self): + return AutoTokenizer.from_pretrained(self.model_id) + + @pytest.fixture(scope="class") + def data(self, tokenizer): + def tokenize(samples): + # For some reason, the max sequence length is not honored by the tokenizer, resulting in IndexErrors. Thus, + # manually ensure that sequences are not too long. + tokenized = tokenizer(samples["quote"]) + tokenized["input_ids"] = [input_ids[: tokenizer.model_max_length] for input_ids in tokenized["input_ids"]] + tokenized["attention_mask"] = [ + input_ids[: tokenizer.model_max_length] for input_ids in tokenized["attention_mask"] + ] + return tokenized + + data = load_dataset_english_quotes() + data = data.map(tokenize, batched=True) + # We need to manually remove unused columns. This is because we cannot use remove_unused_columns=True in the + # Trainer, as this leads to errors with torch.compile. We also cannot just leave them in, as they contain + # strings. Therefore, manually remove all unused columns. 
+ data = data.remove_columns(["quote", "author", "tags"]) + return data + + def compile(self, model, compile_kwargs): + compile_kwargs = compile_kwargs.copy() + # those are only for the Trainer arguments + compile_kwargs.pop("torch_compile_backend", None) + compile_kwargs.pop("torch_compile_mode", None) + if self.fake_compile: + return model + return torch.compile(model, **compile_kwargs) + + @pytest.mark.parametrize("settings", SETTINGS.values(), ids=SETTINGS.keys()) + def test_causal_lm_training_trainer_compile(self, settings, tokenizer, data, tmp_path): + r"""Train a PEFT model with torch.compile using Trainer""" + tmp_dir = tmp_path / "model" + config, compile_kwargs = settings + + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + device_map="auto", + ) + model = get_peft_model(model, config) + + # record outputs before training + model.eval() + sample = torch.tensor(data["train"][:1]["input_ids"]).to(model.device) + with torch.inference_mode(): + output_before = model(sample) + model.train() + + train_kwargs = { + "per_device_train_batch_size": 4, + "max_steps": 5, + "learning_rate": 1e-3, + "logging_steps": 1, + "output_dir": tmp_dir, + "seed": 0, + } + + if isinstance(config, AdaLoraConfig): + train_kwargs["learning_rate"] = 1e-2 + + training_args = TrainingArguments( + torch_compile=not self.fake_compile, + torch_compile_backend=compile_kwargs.get("torch_compile_backend", None), + torch_compile_mode=compile_kwargs.get("torch_compile_mode", None), + **train_kwargs, + ) + trainer = Trainer( + model=model, + train_dataset=data["train"], + args=training_args, + data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), + ) + model.config.use_cache = False + + if isinstance(config, AdaLoraConfig): + + class OptimizerStepCallback(TrainerCallback): + def on_optimizer_step(self, args, state, control, **kwargs): + model.update_and_allocate(state.global_step) + + trainer.add_callback(OptimizerStepCallback()) + + 
trainer.train() + + model.eval() + atol, rtol = 1e-4, 1e-4 + with torch.inference_mode(): + output_after = model(sample) + tokens_after = model.generate(sample) + assert torch.isfinite(output_after.logits).all() + # sanity check: model was updated + assert not torch.allclose(output_before.logits, output_after.logits, atol=atol, rtol=rtol) + assert trainer.state.log_history[-1]["train_loss"] < self.max_train_loss + + # check saving the model and loading it without compile + model.save_pretrained(tmp_path) + del model + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained(self.model_id, device_map="auto") + model = PeftModel.from_pretrained(model, tmp_path) + with torch.inference_mode(): + output_loaded = model(sample) + tokens_loaded = model.generate(sample) + assert torch.allclose(output_after.logits, output_loaded.logits, atol=atol, rtol=rtol) + assert (tokens_after == tokens_loaded).all() + + @pytest.mark.parametrize("settings", SETTINGS.values(), ids=SETTINGS.keys()) + def test_causal_lm_training_pytorch_compile(self, settings, tokenizer, data, tmp_path): + r"""Train a PEFT model with torch.compile using PyTorch training loop""" + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + device_map="auto", + ) + config, compile_kwargs = settings + model = get_peft_model(model, config) + if isinstance(config, AdaLoraConfig): + model.base_model.peft_config["default"].total_step = 5 + model = self.compile(model, compile_kwargs) + + # record outputs before training + model.eval() + sample = torch.tensor(data["train"][:1]["input_ids"]).to(model.device) + with torch.inference_mode(): + output_before = model(sample) + model.train() + + model.config.use_cache = False + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3) + batch_size = 4 + losses = [] + max_steps = 5 * batch_size + for i in range(0, max_steps, batch_size): + batch = tokenizer.pad(data["train"][i : i + batch_size], return_tensors="pt").to(model.device) + 
# add targets + batch["labels"] = batch["input_ids"].clone() + optimizer.zero_grad() + outputs = model(**batch) + loss = outputs.loss + loss.backward() + optimizer.step() + losses.append(loss.item()) + if isinstance(config, AdaLoraConfig): + model.base_model.update_and_allocate(i) + + model.eval() + with torch.inference_mode(): + output_after = model(sample) + tokens_after = model.generate(sample) + assert torch.isfinite(output_after.logits).all() + atol, rtol = 1e-4, 1e-4 + # sanity check: model was updated + assert not torch.allclose(output_before.logits, output_after.logits, atol=atol, rtol=rtol) + assert losses[-1] < self.max_train_loss + + # check saving the model and loading it without compile + model.save_pretrained(tmp_path) + del model + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained(self.model_id, device_map="auto") + model = PeftModel.from_pretrained(model, tmp_path) + with torch.inference_mode(): + output_loaded = model(sample) + tokens_loaded = model.generate(sample) + assert torch.allclose(output_after.logits, output_loaded.logits, atol=atol, rtol=rtol) + assert (tokens_after == tokens_loaded).all() + + @require_bitsandbytes + def test_causal_lm_training_lora_bnb_compile(self, tokenizer, data, tmp_path): + r"""Train a bnb quantized LoRA model with torch.compile using PyTorch training loop""" + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ) + config = LoraConfig(task_type=TaskType.CAUSAL_LM) + model = get_peft_model(model, config) + model = self.compile(model, {}) + + # record outputs before training + model.eval() + sample = torch.tensor(data["train"][:1]["input_ids"]).to(model.device) + with torch.inference_mode(): + output_before = model(sample) + model.train() + + model.config.use_cache = False + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3) + batch_size = 4 + losses = [] + max_steps = 5 * 
batch_size + for i in range(0, max_steps, batch_size): + batch = tokenizer.pad(data["train"][i : i + batch_size], return_tensors="pt").to(model.device) + # add targets + batch["labels"] = batch["input_ids"].clone() + optimizer.zero_grad() + outputs = model(**batch) + loss = outputs.loss + loss.backward() + optimizer.step() + losses.append(loss.item()) + + model.eval() + with torch.inference_mode(): + output_after = model(sample) + assert torch.isfinite(output_after.logits).all() + atol, rtol = 5e-4, 5e-4 + # sanity check: model was updated + assert not torch.allclose(output_before.logits, output_after.logits, atol=atol, rtol=rtol) + assert losses[-1] < self.max_train_loss + + # check saving the model and loading it without compile + model.save_pretrained(tmp_path) + del model + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained( + self.model_id, device_map="auto", quantization_config=BitsAndBytesConfig(load_in_4bit=True) + ) + model = PeftModel.from_pretrained(model, tmp_path) + + with torch.inference_mode(): + # after loading, outputs are float32 for some reason + output_loaded = model(sample) + assert torch.allclose(output_after.logits, output_loaded.logits, atol=atol, rtol=rtol) + + @require_bitsandbytes + def test_causal_lm_multiple_lora_adapter_compile(self, tokenizer, data): + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ).eval() + sample = torch.tensor(data["train"][:1]["input_ids"]).to(model.device) + with torch.inference_mode(): + output_base = model(sample) + + config = LoraConfig(task_type=TaskType.CAUSAL_LM, init_lora_weights=False) + model = get_peft_model(model, config) + model.add_adapter("other", config) + model = self.compile(model, {}) + model.eval() + + with torch.inference_mode(): + output_default_adapter = model(sample) + model.set_adapter("other") + with torch.inference_mode(): + output_other_adapter = 
model(sample) + + atol, rtol = 1e-4, 1e-4 + # outputs of the base model != output of default adapter != output of other adapter + assert not torch.allclose(output_base.logits, output_default_adapter.logits, atol=atol, rtol=rtol) + assert not torch.allclose(output_base.logits, output_other_adapter.logits, atol=atol, rtol=rtol) + assert not torch.allclose(output_default_adapter.logits, output_other_adapter.logits, atol=atol, rtol=rtol) + + # now delete the other adapter + model.delete_adapter("other") + model.set_adapter("default") + with torch.inference_mode(): + output_after_delete = model(sample) + + # outputs after delete == output of default adapter + assert torch.allclose(output_default_adapter.logits, output_after_delete.logits, atol=atol, rtol=rtol) + + def test_causal_lm_disable_lora_adapter_compile(self, tokenizer, data): + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ).eval() + sample = torch.tensor(data["train"][:1]["input_ids"]).to(model.device) + with torch.inference_mode(): + output_base = model(sample) + + config = LoraConfig(task_type=TaskType.CAUSAL_LM, init_lora_weights=False) + model = get_peft_model(model, config).eval() + model = self.compile(model, {}) + output_lora = model(sample) + + with model.disable_adapter(): + with torch.inference_mode(): + output_disabled = model(sample) + + atol, rtol = 5e-4, 5e-4 + # outputs of the base model == output disabled adapter != output of lora adapter + assert torch.allclose(output_base.logits, output_disabled.logits, atol=atol, rtol=rtol) + assert not torch.allclose(output_base.logits, output_lora.logits, atol=atol, rtol=rtol) + + @require_bitsandbytes + def test_causal_lm_merging_lora_adapter_compile(self, tokenizer, data): + # merge the adapter + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + device_map="auto", + 
quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ).eval() + sample = torch.tensor(data["train"][:1]["input_ids"]).to(model.device) + with torch.inference_mode(): + output_base = model(sample) + + config = LoraConfig(task_type=TaskType.CAUSAL_LM, init_lora_weights=False) + model = get_peft_model(model, config).eval() + with torch.inference_mode(): + output_lora = model(sample) + + model.merge_adapter() + with torch.inference_mode(): + output_merged = model(sample) + + # merging is less precise, be more tolerant + atol, rtol = 1e-1, 1e-1 + # outputs of the base model != output of lora adapter == output of merged adapter + assert not torch.allclose(output_base.logits, output_lora.logits, atol=atol, rtol=rtol) + assert torch.allclose(output_lora.logits, output_merged.logits, atol=atol, rtol=rtol) + + @require_bitsandbytes + def test_causal_lm_merging_multiple_lora_adapters_compile(self, tokenizer, data): + # merge multiple adapters at once + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ).eval() + sample = torch.tensor(data["train"][:1]["input_ids"]).to(model.device) + with torch.inference_mode(): + output_base = model(sample) + + config = LoraConfig(task_type=TaskType.CAUSAL_LM, init_lora_weights=False) + model = get_peft_model(model, config).eval() + model.add_adapter("other", config) + with torch.inference_mode(): + output_default = model(sample) + + model.set_adapter("other") + with torch.inference_mode(): + output_other = model(sample) + + model.base_model.merge_adapter(["default", "other"]) + with torch.inference_mode(): + output_merged = model(sample) + + # merging is less precise, be more tolerant + atol, rtol = 1e-1, 1e-1 + # outputs of the base model != output of default adapter != output of other adapter + assert not torch.allclose(output_base.logits, output_default.logits, atol=atol, rtol=rtol) + assert not 
torch.allclose(output_base.logits, output_other.logits, atol=atol, rtol=rtol) + assert not torch.allclose(output_default.logits, output_other.logits, atol=atol, rtol=rtol) + # outputs of merged adapter != all others + assert not torch.allclose(output_base.logits, output_merged.logits, atol=atol, rtol=rtol) + assert not torch.allclose(output_default.logits, output_merged.logits, atol=atol, rtol=rtol) + assert not torch.allclose(output_other.logits, output_merged.logits, atol=atol, rtol=rtol) + + @require_bitsandbytes + def test_causal_lm_merge_and_unload_lora_adapter_compile(self, tokenizer, data): + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ).eval() + sample = torch.tensor(data["train"][:1]["input_ids"]).to(model.device) + with torch.inference_mode(): + output_base = model(sample) + + config = LoraConfig(task_type=TaskType.CAUSAL_LM, init_lora_weights=False) + model = get_peft_model(model, config).eval() + model = self.compile(model, {}) + with torch.inference_mode(): + output_lora = model(sample) + + unloaded = model.merge_and_unload() + with torch.inference_mode(): + output_unloaded = unloaded(sample) + + # merging is less precise, be more tolerant + atol, rtol = 1e-1, 1e-1 + # outputs of the base model != output of lora adapter == output of unloaded adapter + assert not torch.allclose(output_base.logits, output_lora.logits, atol=atol, rtol=rtol) + assert torch.allclose(output_lora.logits, output_unloaded.logits, atol=atol, rtol=rtol) + + @require_bitsandbytes + def test_causal_lm_mixed_batch_lora_adapter_compile(self, tokenizer, data): + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ).eval() + + # we need at least 3 samples for this to work! 
+ sample = { + "input_ids": torch.arange(12).reshape(3, 4).to("cuda"), + "attention_mask": torch.ones(3, 4).long().to("cuda"), + } + + with torch.inference_mode(): + output_base = model(**sample) + + config = LoraConfig(task_type=TaskType.CAUSAL_LM, init_lora_weights=False) + model = get_peft_model(model, config).eval() + with torch.inference_mode(): + output_default = model(**sample) + + model.add_adapter("other", config) + model.set_adapter("other") + with torch.inference_mode(): + output_other = model(**sample) + + model = self.compile(model, {}) + + # set adapter_indices so that it alternates between 0 (base), lora 1, and lora 2 + adapter_names = ["__base__", "default", "other"] + with torch.inference_mode(): + output_mixed = model(**sample, adapter_names=adapter_names) + + atol, rtol = 5e-4, 5e-4 + # outputs of the base model != output of lora adapter 1 != output of other adapter + assert not torch.allclose(output_base.logits, output_default.logits, atol=atol, rtol=rtol) + assert not torch.allclose(output_default.logits, output_other.logits, atol=atol, rtol=rtol) + assert not torch.allclose(output_other.logits, output_mixed.logits, atol=atol, rtol=rtol) + # outputs of mixed adapter is mix of all 3 + assert torch.allclose(output_base.logits[0], output_mixed.logits[0], atol=atol, rtol=rtol) + assert torch.allclose(output_default.logits[1], output_mixed.logits[1], atol=atol, rtol=rtol) + assert torch.allclose(output_other.logits[2], output_mixed.logits[2], atol=atol, rtol=rtol) + + @require_bitsandbytes + def test_causal_lm_add_weighted_adapter_lora_adapter_compile(self, tokenizer, data): + torch.manual_seed(0) + model = AutoModelForCausalLM.from_pretrained( + self.model_id, + device_map="auto", + quantization_config=BitsAndBytesConfig(load_in_4bit=True), + ).eval() + sample = torch.tensor(data["train"][:1]["input_ids"]).to(model.device) + with torch.inference_mode(): + output_base = model(sample) + + config = LoraConfig(task_type=TaskType.CAUSAL_LM, 
init_lora_weights=False) + model = get_peft_model(model, config).eval() + model.add_adapter("other", config) + with torch.inference_mode(): + output_default = model(sample) + + model.set_adapter("other") + with torch.inference_mode(): + output_other = model(sample) + + model.add_weighted_adapter(["default", "other"], [0.5, 0.5], adapter_name="combined") + model.set_adapter("combined") + with torch.inference_mode(): + output_combined = model(sample) + + atol, rtol = 1e-4, 1e-4 + # outputs of the base model != output of default adapter != output of other adapter + assert not torch.allclose(output_base.logits, output_default.logits, atol=atol, rtol=rtol) + assert not torch.allclose(output_base.logits, output_other.logits, atol=atol, rtol=rtol) + assert not torch.allclose(output_default.logits, output_other.logits, atol=atol, rtol=rtol) + # outputs of combined adapter != all others + assert not torch.allclose(output_base.logits, output_combined.logits, atol=atol, rtol=rtol) + assert not torch.allclose(output_default.logits, output_combined.logits, atol=atol, rtol=rtol) + assert not torch.allclose(output_other.logits, output_combined.logits, atol=atol, rtol=rtol) diff --git a/peft/tests/test_trainable_tokens.py b/peft/tests/test_trainable_tokens.py new file mode 100644 index 0000000000000000000000000000000000000000..38b32b06ed35f89c730af9d11945f9c7924d1f67 --- /dev/null +++ b/peft/tests/test_trainable_tokens.py @@ -0,0 +1,920 @@ +# Copyright 2025-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import copy + +import pytest +import torch +from safetensors.torch import load_file as safe_load_file +from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer + +from peft import AutoPeftModel, LoraConfig, PeftModel, TrainableTokensConfig, get_peft_model +from peft.tuners.trainable_tokens.layer import TrainableTokensLayer +from peft.utils import TrainableTokensWrapper, get_peft_model_state_dict + +from .testing_utils import hub_online_once + + +class ModelEmb(torch.nn.Module): + def __init__(self): + super().__init__() + self.emb = torch.nn.Embedding(100, 10) + self.lin0 = torch.nn.Linear(10, 1) + + def forward(self, x): + return self.lin0(self.emb(x)) + + def get_input_embeddings(self): + return self.emb + + +class ModelEmbedIn(torch.nn.Module): + def __init__(self): + super().__init__() + self.embed_in = torch.nn.Embedding(100, 10) + self.lin0 = torch.nn.Linear(10, 1) + + def forward(self, x): + return self.lin0(self.embed_in(x)) + + def get_input_embeddings(self): + return self.embed_in + + +class ModelEmbedMultiple(torch.nn.Module): + def __init__(self): + super().__init__() + self.embed_in = torch.nn.Embedding(100, 10) + self.embed_in_2 = torch.nn.Embedding(100, 10) + self.lin0 = torch.nn.Linear(10, 1) + + def forward(self, x): + return self.lin0(self.embed_in(x) + self.embed_in_2(x)) + + def get_input_embeddings(self): + return self.embed_in + + +class ModelEmbedInNoGet(torch.nn.Module): + def __init__(self): + super().__init__() + self.embed_in = torch.nn.Embedding(100, 10) + self.lin0 = torch.nn.Linear(10, 1) + + def forward(self, x): + return self.lin0(self.embed_in(x)) + + +class TestTrainableTokens: + @pytest.fixture + def model_id(self): + return "trl-internal-testing/tiny-random-LlamaForCausalLM" + + @pytest.fixture + def model_multi_embedding(self): + class 
MultiEmbeddingMLP(torch.nn.Module): + def __init__(self): + super().__init__() + self.emb_text = torch.nn.Embedding(10, 5) + self.emb_image = torch.nn.Embedding(8, 5) + self.lin0 = torch.nn.Linear(5, 10) + self.lin1 = torch.nn.Linear(10, 20) + + def forward(self, x_text, x_image): + x_text = self.emb_text(x_text) + x_image = self.emb_image(x_image) + y = self.lin0(torch.concat([x_text, x_image], dim=1).view(-1, 5)) + y = self.lin1(y) + return y, (x_text, x_image) + + return MultiEmbeddingMLP() + + @pytest.fixture + def model(self, model_id): + with hub_online_once(model_id): + # This must not be a yield fixture so that we don't carry the hub_online_once + # behavior over to the rest of the test that uses this fixture + return AutoModelForCausalLM.from_pretrained(model_id) + + @pytest.fixture + def tokenizer(self, model_id): + return AutoTokenizer.from_pretrained(model_id) + + def simulate_training(self, trainable_tokens_layer, adapter_name="default"): + """Simulates training of trainable_tokens adapter layer by assigning random + values to the delta tokens. 
+ """ + trainable_tokens_layer.trainable_tokens_delta[adapter_name].data = torch.rand_like( + trainable_tokens_layer.trainable_tokens_delta[adapter_name].data + ) + + def test_stand_alone_usage(self, model, tokenizer, tmp_path): + original_model = copy.deepcopy(model) + + peft_config = TrainableTokensConfig(target_modules=["embed_tokens"], token_indices=[0, 1, 3]) + peft_model = get_peft_model(model, peft_config) + save_path = tmp_path / "stand_alone_usage" + + # simulate normal use but take care to use the tokens that we expect to be modified + # (+1 that we don't expect to be modified) + X = { + "input_ids": torch.tensor([[0, 1, 2, 3]]), + "attention_mask": torch.tensor([[1, 1, 1, 1]]), + } + + idcs_to_modify = peft_config.token_indices + idcs_to_keep = [i for i in X["input_ids"][0].tolist() if i not in idcs_to_modify] + + self.simulate_training(peft_model.model.model.embed_tokens) + output_train = peft_model(output_hidden_states=True, **X) + + peft_model.save_pretrained(save_path) + peft_model_org = peft_model + + # check whether the token indices differ from the base model after loading the model + # from the checkpoint. + peft_model = AutoPeftModel.from_pretrained(save_path) + output_load = peft_model(output_hidden_states=True, **X) + output_orig = original_model(output_hidden_states=True, **X) + + # on the way, make sure that the embedding matrix itself was not modified + assert torch.allclose( + peft_model.model.model.embed_tokens.weight, + peft_model_org.model.model.embed_tokens.weight, + ) + + W_load = output_load.hidden_states[0] + W_orig = output_orig.hidden_states[0] + W_train = output_train.hidden_states[0] + + # all PEFT model embed outputs must equal the outputs during 'training' to make sure + # that saving/loading works properly. 
+ assert torch.allclose(W_load, W_train) + + assert not torch.allclose(W_load[:, idcs_to_modify], W_orig[:, idcs_to_modify]) + assert torch.allclose(W_load[:, idcs_to_keep], W_orig[:, idcs_to_keep]) + + @pytest.mark.parametrize( + "peft_config", + [ + LoraConfig( + target_modules="all-linear", + trainable_token_indices={"embed_tokens": [0, 1, 3]}, + ), + ], + ) + def test_combined_with_peft_method_usage(self, model, tokenizer, peft_config, tmp_path): + original_model = copy.deepcopy(model) + peft_model = get_peft_model(model, peft_config) + save_path = tmp_path / "combined_usage" + + # simulate normal use but take care to use the tokens that we expect to be modified + # (+2 that we don't expect to be modified) + X = { + "input_ids": torch.tensor([[0, 1, 2, 3, 4]]), + "attention_mask": torch.tensor([[1, 1, 1, 1, 1]]), + } + + idcs_to_modify = peft_config.trainable_token_indices["embed_tokens"] + idcs_to_keep = [i for i in X["input_ids"][0].tolist() if i not in idcs_to_modify] + + self.simulate_training(peft_model.model.model.embed_tokens.token_adapter) + output_train = peft_model(output_hidden_states=True, **X) + + peft_model.save_pretrained(save_path) + peft_model_org = peft_model + + # check whether the token indices differ from the base model + peft_model = AutoPeftModel.from_pretrained(save_path) + output_load = peft_model(output_hidden_states=True, **X) + output_orig = original_model(output_hidden_states=True, **X) + + W_load = output_load.hidden_states[0] + W_orig = output_orig.hidden_states[0] + W_train = output_train.hidden_states[0] + + # all PEFT model embed outputs must equal the outputs during 'training' to make sure + # that saving/loading works properly. 
+ assert torch.allclose(W_load, W_train) + + assert not torch.allclose(W_load[:, idcs_to_modify], W_orig[:, idcs_to_modify]) + assert torch.allclose(W_load[:, idcs_to_keep], W_orig[:, idcs_to_keep]) + + def test_basic_training(self, model, tokenizer): + # ensure that the model can be trained and backpropagation works + config = TrainableTokensConfig( + target_modules=["embed_tokens"], + token_indices=[0, 10], + ) + + model = get_peft_model(model, config) + optimizer = torch.optim.AdamW(model.parameters(), lr=1) + + initial_delta = model.model.model.embed_tokens.trainable_tokens_delta.default.clone() + initial_originals = model.model.model.embed_tokens.trainable_tokens_original.default.clone() + + X = { + "input_ids": torch.tensor([[0, 1, 2, 3, 4]]), + "attention_mask": torch.tensor([[1, 1, 1, 1, 1]]), + } + + for step in range(3): + optimizer.zero_grad() + y_pred = model(**X) + loss = y_pred.logits.mean() + loss.backward() + optimizer.step() + + assert torch.allclose( + model.model.model.embed_tokens.trainable_tokens_original.default, + initial_originals, + ) + assert not torch.allclose( + model.model.model.embed_tokens.trainable_tokens_delta.default, + initial_delta, + ) + + @pytest.mark.parametrize( + "peft_config", + [ + LoraConfig( + target_modules="all-linear", + trainable_token_indices={"embed_tokens": [0, 1, 3]}, + ), + ], + ) + def test_disable_adapters_with_merging(self, model, tokenizer, peft_config): + X = { + "input_ids": torch.tensor([[0, 1, 2, 3, 4]]), + "attention_mask": torch.tensor([[1, 1, 1, 1, 1]]), + } + + model = get_peft_model(model, peft_config) + model.eval() + + outputs_before = model(**X).logits + + model.train() + lr = 0.01 + optimizer = torch.optim.Adam(model.parameters(), lr=lr) + + # train at least 3 steps for all parameters to be updated (probably this is required because of symmetry + # breaking of some LoRA layers that are initialized with constants) + for _ in range(3): + optimizer.zero_grad() + y_pred = model(**X) + loss = 
y_pred.logits.mean() + loss.backward() + optimizer.step() + + model.eval() + outputs_unmerged = model(**X).logits + model.merge_adapter() + outputs_after = model(**X).logits + + with model.disable_adapter(): + outputs_disabled = model(**X).logits + + # check that after leaving the disable_adapter context, everything is enabled again + outputs_enabled_after_disable = model(**X).logits + + atol, rtol = 1e-5, 1e-5 # tolerances higher than defaults since merging introduces some numerical instability + + # check that there is a difference in results after training + assert not torch.allclose(outputs_before, outputs_after, atol=atol, rtol=rtol) + + # unmerged or merged should make no difference + assert torch.allclose(outputs_after, outputs_unmerged, atol=atol, rtol=rtol) + + # check that disabling adapters gives the same results as before training + assert torch.allclose(outputs_before, outputs_disabled, atol=atol, rtol=rtol) + + # check that enabling + disabling adapters does not change the results + assert torch.allclose(outputs_after, outputs_enabled_after_disable, atol=atol, rtol=rtol) + + @pytest.mark.parametrize( + "peft_config", + [ + LoraConfig( + target_modules="all-linear", + trainable_token_indices={"embed_tokens": [0, 1, 3]}, + ), + ], + ) + def test_safe_merge_with_adapter(self, model, tokenizer, peft_config): + X = { + "input_ids": torch.tensor([[0, 1, 2, 3]]), + "attention_mask": torch.tensor([[1, 1, 1, 1]]), + } + + model = model.eval() + logits_base = model(**X).logits + + model = get_peft_model(model, peft_config).eval() + logits_peft = model(**X).logits + + atol, rtol = 1e-6, 1e-6 # default + + model_unloaded = model.merge_and_unload(safe_merge=True) + logits_unloaded = model_unloaded(**X).logits + + # check that the logits are the same after unloading + assert torch.allclose(logits_peft, logits_unloaded, atol=atol, rtol=rtol) + + @pytest.mark.parametrize( + "peft_config", + [ + LoraConfig( + target_modules="all-linear", + 
trainable_token_indices={"embed_tokens": [0, 1, 3]}, + ), + ], + ) + def test_load_multiple_adapters(self, model, peft_config, tmp_path): + # tests if having more than one adapter (even with just the same config) works + # keep an unmodified copy of the base model, the saved adapter is loaded onto it below + original_model = copy.deepcopy(model) + model = get_peft_model(model, peft_config) + + model.save_pretrained(tmp_path) + del model + + model = original_model + model = PeftModel.from_pretrained(model, tmp_path) + # load the same checkpoint two more times under different adapter names + load_result1 = model.load_adapter(tmp_path, adapter_name="other") + load_result2 = model.load_adapter(tmp_path, adapter_name="yet-another") + + # none of the additional loads may report missing keys + assert load_result1.missing_keys == [] + assert load_result2.missing_keys == [] + + @pytest.mark.parametrize( + "peft_config_factory", + [ + lambda token_indices: LoraConfig( + target_modules="all-linear", + trainable_token_indices={"embed_tokens": token_indices}, + ), + ], + ) + def test_multiple_adapters_different_token_indices(self, model, peft_config_factory, tmp_path): + # tests if multiple adapters with different token indices work + original_model = copy.deepcopy(model) + + # the two index sets overlap in token 2 + token_indices_1 = [0, 1, 2] + token_indices_2 = [2, 3, 4] + + peft_config_1 = peft_config_factory(token_indices_1) + peft_config_2 = peft_config_factory(token_indices_2) + + model = get_peft_model(model, peft_config_1, adapter_name="adapter_1") + model.add_adapter("adapter_2", peft_config_2) + + # "train" adapter 1 + model.set_adapter("adapter_1") + self.simulate_training(model.model.model.embed_tokens.token_adapter, "adapter_1") + + # "train" adapter 2 + model.set_adapter("adapter_2") + self.simulate_training(model.model.model.embed_tokens.token_adapter, "adapter_2") + + # now we infer on adapter 1 and on adapter 2 and check if the requested indices are changed for + # each adapter. e.g., for adapter 1, only the tokens in token_indices_1 should be changed.
+ X = { + "input_ids": torch.tensor([list(set(token_indices_1 + token_indices_2))]), + "attention_mask": torch.tensor([[1] * (len(set(token_indices_1 + token_indices_2)))]), + } + + original_output = original_model(output_hidden_states=True, **X).hidden_states[0] + + # infer with adapter 1, embeddings for token indices 1 should be changed, no others. + model.set_adapter("adapter_1") + adapter_1_output = model(output_hidden_states=True, **X).hidden_states[0] + + idcs_to_modify = token_indices_1 + idcs_to_keep = [i for i in X["input_ids"][0].tolist() if i not in idcs_to_modify] + + assert not torch.allclose(adapter_1_output[:, idcs_to_modify], original_output[:, idcs_to_modify]) + assert torch.allclose(adapter_1_output[:, idcs_to_keep], original_output[:, idcs_to_keep]) + + # infer with adapter 2, embeddings for token indices 2 should be changed, no others. + model.set_adapter("adapter_2") + adapter_2_output = model(output_hidden_states=True, **X).hidden_states[0] + + idcs_to_modify = token_indices_2 + idcs_to_keep = [i for i in X["input_ids"][0].tolist() if i not in idcs_to_modify] + + assert not torch.allclose(adapter_2_output[:, idcs_to_modify], original_output[:, idcs_to_modify]) + assert torch.allclose(adapter_2_output[:, idcs_to_keep], original_output[:, idcs_to_keep]) + + @pytest.mark.parametrize( + "peft_config_factory", + [ + lambda token_indices: LoraConfig( + target_modules="all-linear", + trainable_token_indices={"embed_tokens": token_indices}, + ), + ], + ) + def test_multiple_adapters_overlapping_token_indices_merging(self, model, peft_config_factory, tmp_path): + # tests that merging multiple adapters that have overlapping indices is not defined at the moment + # and would yield undefined behavior. note that merging a single adapter is fine. 
+ original_model = copy.deepcopy(model) + + token_indices_1 = [0, 1, 2] + token_indices_2 = [2, 3, 4] + + peft_config_1 = peft_config_factory(token_indices_1) + peft_config_2 = peft_config_factory(token_indices_2) + + model = get_peft_model(model, peft_config_1, adapter_name="adapter_1") + model.add_adapter("adapter_2", peft_config_2) + + with pytest.raises(ValueError) as e: + model.merge_and_unload(adapter_names=["adapter_1", "adapter_2"]) + assert "are already defined and would result in undefined merging behavior" in str(e) + + @pytest.mark.parametrize( + "peft_config_factory", + [ + lambda targets, token_indices: LoraConfig( + target_modules=targets, + trainable_token_indices={"embed_tokens": token_indices}, + ), + ], + ) + def test_multiple_adapters_mixed_forward(self, model, peft_config_factory, tmp_path): + # tests if multiple adapters with different token indices work + original_model = copy.deepcopy(model) + + token_indices_1 = [0, 1, 2] + token_indices_2 = [2, 3, 4] + + peft_config_1 = peft_config_factory(".*q_proj", token_indices_1) + peft_config_2 = peft_config_factory(".*o_proj", token_indices_2) + + model = get_peft_model(model, peft_config_1, adapter_name="adapter_1") + model.add_adapter("adapter_2", peft_config_2) + + # "train" adapter 1 + model.set_adapter("adapter_1") + self.simulate_training(model.model.model.embed_tokens.token_adapter, "adapter_1") + + # "train" adapter 2 + model.set_adapter("adapter_2") + self.simulate_training(model.model.model.embed_tokens.token_adapter, "adapter_2") + + # forward(adapter_names=...) is not available in train mode + model.eval() + + # Build a batch of 2 items, each the same input sequence but each sequence will be passed to a different + # adapter via mixed batch forward. 
+ input_sequence = list(set(token_indices_1 + token_indices_2)) + X = { + "input_ids": torch.tensor([input_sequence, input_sequence]), + "attention_mask": torch.tensor([[1] * len(input_sequence), [1] * len(input_sequence)]), + } + batch_adapter_names = ["adapter_1", "adapter_2"] + + original_output = original_model(output_hidden_states=True, **X) + mixed_output = model(output_hidden_states=True, adapter_names=batch_adapter_names, **X) + + # check that the active adapter is still the last activated adapter, adapter_2 + assert model.model.model.embed_tokens.token_adapter.active_adapter == ["adapter_2"] + + adapter_1_output = mixed_output.hidden_states[0][0:1] + original_output_1 = original_output.hidden_states[0][0:1] + adapter_2_output = mixed_output.hidden_states[0][1:2] + original_output_2 = original_output.hidden_states[0][1:2] + + idcs_to_modify = token_indices_1 + idcs_to_keep = [i for i in X["input_ids"][0].tolist() if i not in idcs_to_modify] + + assert not torch.allclose(adapter_1_output[:, idcs_to_modify], original_output_1[:, idcs_to_modify]) + assert torch.allclose(adapter_1_output[:, idcs_to_keep], original_output_1[:, idcs_to_keep]) + + idcs_to_modify = token_indices_2 + idcs_to_keep = [i for i in X["input_ids"][0].tolist() if i not in idcs_to_modify] + + assert not torch.allclose(adapter_2_output[:, idcs_to_modify], original_output_2[:, idcs_to_modify]) + assert torch.allclose(adapter_2_output[:, idcs_to_keep], original_output_2[:, idcs_to_keep]) + + def test_stand_alone_raises_target_layer_not_found(self, model): + config = TrainableTokensConfig(target_modules=["doesnt_exist"], token_indices=[0, 1, 3]) + with pytest.raises(ValueError) as e: + model = get_peft_model(model, config) + assert "Target modules ['doesnt_exist'] not found in the base model." 
in str(e) + + @pytest.mark.parametrize( + "peft_config, target_layer_name", + [ + (LoraConfig(trainable_token_indices={"does-not-exist": [0, 1, 2]}), "does-not-exist"), + ], + ) + def test_combined_with_peft_raises_target_layer_not_found(self, model, peft_config, target_layer_name): + # same as test_stand_alone_raises_target_layer_not_found but tests the peft method integration + with pytest.raises(ValueError) as e: + model = get_peft_model(model, peft_config) + assert f"Target modules {{{repr(target_layer_name)}}} not found in the base model." in str(e) + + def test_multiple_targets(self, model_multi_embedding): + # tests the ability of targeting two modules with the same token indices + original_model = copy.deepcopy(model_multi_embedding) + config = TrainableTokensConfig(target_modules=["emb_text", "emb_image"], token_indices=[0, 1]) + peft_model = get_peft_model(model_multi_embedding, config) + + self.simulate_training(peft_model.model.emb_text) + self.simulate_training(peft_model.model.emb_image) + + X = { + "x_text": torch.tensor([[0, 1, 2]]), + "x_image": torch.tensor([[0, 1, 2]]), + } + + _, (emb_text_orig, emb_image_orig) = original_model(**X) + _, (emb_text_peft, emb_image_peft) = peft_model(**X) + + assert not torch.allclose(emb_text_orig[:, [0, 1]], emb_text_peft[:, [0, 1]]) + assert torch.allclose(emb_text_orig[:, [2]], emb_text_peft[:, [2]]) + assert not torch.allclose(emb_image_orig[:, [0, 1]], emb_image_peft[:, [0, 1]]) + assert torch.allclose(emb_image_orig[:, [2]], emb_image_peft[:, [2]]) + + @pytest.mark.parametrize( + "peft_config", + [ + LoraConfig( + target_modules="all-linear", + trainable_token_indices={"embed_tokens": [0, 1, 3]}, + ), + ], + ) + def test_no_embeddings_in_save_with_combined_usage(self, model, tokenizer, peft_config, tmp_path): + # make sure that in combined use the only state dict key is that of the token deltas and nothing more + + peft_model = get_peft_model(model, peft_config) + state_dict = get_peft_model_state_dict( + 
model=peft_model, + state_dict=None, + adapter_name="default", + ) + + embedding_keys = [n for n in state_dict.keys() if "embed_tokens" in n] + assert embedding_keys == ["base_model.model.model.embed_tokens.token_adapter.trainable_tokens_delta"] + + @pytest.fixture() + def model_weight_untied(self, model): + return model + + @pytest.fixture() + def model_id_weight_tied(self): + return "facebook/opt-125m" + + @pytest.fixture() + def model_weight_tied(self, model_id_weight_tied): + return AutoModelForCausalLM.from_pretrained(model_id_weight_tied) + + @pytest.mark.parametrize( + "peft_config", + [ + LoraConfig( + target_modules="all-linear", + trainable_token_indices={"embed_tokens": [0, 1, 3]}, + ), + ], + ) + def test_weight_tying_noop_when_model_is_untied(self, model_weight_untied, peft_config, tmp_path): + # test if the weight tying is affected as well when we modified the embedding. + assert model_weight_untied._tied_weights_keys + assert not model_weight_untied.config.tie_word_embeddings + + peft_model = get_peft_model(model_weight_untied, peft_config) + assert hasattr(peft_model.model.model.embed_tokens, "token_adapter") + assert not hasattr(peft_model.model.lm_head, "token_adapter") + + @pytest.mark.parametrize( + "peft_config", + [ + LoraConfig( + target_modules="all-linear", + trainable_token_indices={"embed_tokens": [0, 1, 3]}, + ), + ], + ) + def test_weight_tying_applied_when_model_is_tied(self, model_weight_tied, peft_config, tmp_path): + # test if the weight tying is affected as well when we modified the embedding. + assert model_weight_tied._tied_weights_keys + assert model_weight_tied.config.tie_word_embeddings + + peft_model = get_peft_model(model_weight_tied, peft_config) + + # make it so that the input embeddings diverge. when the weights are tied this should + # reflect in the output embeddings as well. 
+ self.simulate_training(peft_model.model.model.decoder.embed_tokens.token_adapter) + + # we have to find out if the input embedding tying is doing its job during forward. + # for this we can leverage the fact that emb_out(1/emb_in(x)) is embed_dim on the + # diagonal iff emb_in.weight == emb_out.weight. + token_indices = [0, 1, 2, 3] + emb_dim = 768 + emb_in = peft_model.model.model.decoder.embed_tokens(torch.tensor([token_indices])) + emb_out = peft_model.model.lm_head(1 / emb_in) + + assert torch.allclose(torch.diag(emb_out[0]), torch.tensor([emb_dim] * len(token_indices)).float()) + + # make sure that the state dict does not include weight-tied weights. + state_dict = get_peft_model_state_dict(peft_model) + assert not [key for key in state_dict if any(tied_key in key for tied_key in peft_model._tied_weights_keys)] + + # make sure that merging and unloading restores the weight-tying. + merged_model = peft_model.merge_and_unload() + + assert merged_model.model.decoder.embed_tokens.weight.data_ptr() == merged_model.lm_head.weight.data_ptr() + + def test_weight_tying_applied_when_model_is_tied_standalone(self, model_weight_tied): + # since weight tying is currently not supported make sure that an error is raised when attempting + # to use a model that has tied input/output embeddings + assert model_weight_tied._tied_weights_keys + assert model_weight_tied.config.tie_word_embeddings + + peft_config = TrainableTokensConfig( + target_modules=["embed_tokens"], + token_indices=[0, 1, 3], + ) + + peft_model = get_peft_model(model_weight_tied, peft_config) + + # make it so that the input embeddings diverge. when the weights are tied this should + # reflect in the output embeddings as well. + self.simulate_training(peft_model.model.model.decoder.embed_tokens) + + # we have to find out if the input embedding tying is doing its job during forward. 
+ # for this we can leverage the fact that emb_out(1/emb_in(x)) is embed_dim on the + # diagonal iff emb_in.weight == emb_out.weight. + token_indices = [0, 1, 2, 3] + emb_dim = 768 + emb_in = peft_model.model.model.decoder.embed_tokens(torch.tensor([token_indices])) + emb_out = peft_model.model.lm_head(1 / emb_in) + + assert torch.allclose(torch.diag(emb_out[0]), torch.tensor([emb_dim] * len(token_indices)).float()) + + # make sure that the state dict does not include weight-tied weights. + state_dict = get_peft_model_state_dict(peft_model) + assert not [key for key in state_dict if any(tied_key in key for tied_key in peft_model._tied_weights_keys)] + + # make sure that merging and unloading restores the weight-tying. + merged_model = peft_model.merge_and_unload() + + assert merged_model.model.decoder.embed_tokens.weight.data_ptr() == merged_model.lm_head.weight.data_ptr() + + def test_weight_tying_normally_issues_warning(self, model_weight_tied, recwarn): + # When using models with weight tying and targeting the embedding or the tied layer should raise a warning. 
+ peft_config = LoraConfig(target_modules=["embed_tokens"]) + peft_model = get_peft_model(model_weight_tied, peft_config) + + warnings = [w.message.args[0] for w in recwarn] + warnings = [msg for msg in warnings if "Model with `tie_word_embeddings=True` and the" in msg] + assert warnings + + def test_weight_tying_state_dict_ignores_tied_weights(self, model_weight_tied): + # since weight tying is currently not supported make sure that an error is raised when attempting + # to use a model that has tied input/output embeddings + assert model_weight_tied._tied_weights_keys + assert model_weight_tied.config.tie_word_embeddings + + peft_config = TrainableTokensConfig( + target_modules=["embed_tokens"], + token_indices=[0, 1, 3], + ) + + peft_model = get_peft_model(model_weight_tied, peft_config) + + state_dict = peft_model.state_dict() + peft_state_dict = get_peft_model_state_dict(peft_model) + + # the state dict or the peft model state dict must not include tied adapter weights + state_dict_keys = [n for n, _ in state_dict.items() if "tied_adapter." in n] + peft_state_dict_keys = [n for n, _ in peft_state_dict.items() if "tied_adapter." in n] + + assert not state_dict_keys + assert not peft_state_dict_keys + + @pytest.mark.parametrize( + "peft_config", + [ + LoraConfig( + target_modules="all-linear", + trainable_token_indices={"shared": [0, 1, 3]}, + ), + ], + ) + def test_weight_tying_applied_when_model_is_tied_encoder_decoder(self, peft_config): + model_id = "hf-internal-testing/tiny-random-t5" + base_model = AutoModelForSeq2SeqLM.from_pretrained(model_id) + + peft_model = get_peft_model(base_model, peft_config) + + # make it so that the input embeddings diverge. when the weights are tied this should + # reflect in the output embeddings as well. + self.simulate_training(peft_model.model.shared.token_adapter) + + # we have to find out if the input embedding tying is doing its job during forward. 
+ # for this we can leverage the fact that emb_out(1/emb_in(x)) is embed_dim on the + # diagonal iff emb_in.weight == emb_out.weight. + token_indices = [0, 1, 2, 3] + emb_dim = base_model.config.d_model + emb_in = peft_model.model.encoder.embed_tokens(torch.tensor([token_indices])) + emb_out = peft_model.model.lm_head(1 / emb_in) + + assert torch.allclose(torch.diag(emb_out[0]), torch.tensor([emb_dim] * len(token_indices)).float()) + + # T5 has a decoder embedding layer, we can simply check if it's forward is equal to the encoder + # embedding forward. + emb_out = peft_model.model.decoder.embed_tokens(torch.tensor([token_indices])) + + assert torch.allclose(emb_in, emb_out) + + # make sure that the state dict does not include weight-tied weights. + state_dict = get_peft_model_state_dict(peft_model) + assert not [key for key in state_dict if any(tied_key in key for tied_key in peft_model._tied_weights_keys)] + + # make sure that merging and unloading restores the weight-tying. + merged_model = peft_model.merge_and_unload() + + assert merged_model.encoder.embed_tokens.weight.data_ptr() == merged_model.lm_head.weight.data_ptr() + assert ( + merged_model.encoder.embed_tokens.weight.data_ptr() == merged_model.decoder.embed_tokens.weight.data_ptr() + ) + + @pytest.mark.parametrize( + "peft_config", + [ + LoraConfig( + target_modules="all-linear", + trainable_token_indices={"embed_tokens": [0, 1, 3]}, + modules_to_save=["embed_tokens"], + ), + ], + ) + def test_modules_to_save_excludes_trainable_tokens(self, model, peft_config): + with pytest.raises(ValueError) as e: + get_peft_model(model, peft_config) + assert "The embedding layer is already marked to be trained fully" in str(e) + + def test_merge_and_unload_standalone(self, model): + # test basic functionality of merge_and_unload for standalone TrainableTokens + token_indices = [0, 1, 3] + + peft_config = TrainableTokensConfig( + target_modules=["embed_tokens"], + token_indices=token_indices, + ) + + peft_model = 
get_peft_model(model, peft_config) + + self.simulate_training(peft_model.model.model.embed_tokens) + expected_changed_weights = peft_model.model.model.embed_tokens.trainable_tokens_delta.default.data.clone() + + # make sure no TrainableTokensLayer is in the module + merged_model = peft_model.merge_and_unload() + for _, module in merged_model.named_modules(): + assert not isinstance(module, TrainableTokensLayer) + + # make sure that deltas are applied to the embedding matrix + assert torch.allclose(merged_model.model.embed_tokens.weight.data[token_indices], expected_changed_weights) + + def test_original_module_not_in_state_dict(self, model): + # Every AuxiliaryTrainingWrapper has an original_module attribute. Since the TrainableTokensWrapper is wrapping + # a TrainableTokensLayer and it already has a base layer which serves as the original module, we don't need that + # and so it should not come up in the state dict to save memory. + + peft_config = LoraConfig( + target_modules="all-linear", + trainable_token_indices={"embed_tokens": [0, 1, 3]}, + ) + + peft_model = get_peft_model(model, peft_config) + + # make sure that the original module is present and accessible even though + # we want to exclude it from the state dict. 
+ assert peft_model.model.model.embed_tokens.original_module + + state_dict = get_peft_model_state_dict(peft_model) + + assert not [k for k in state_dict if ".original_module.weight" in k] + + state_dict = peft_model.state_dict() + assert not [k for k in state_dict if ".original_module.weight" in k] + + @pytest.fixture + def model_emb(self): + return ModelEmb() + + @pytest.fixture + def model_embed_in(self): + return ModelEmbedIn() + + @pytest.fixture + def model_embed_in_no_get(self): + return ModelEmbedInNoGet() + + @pytest.fixture + def model_embed_multiple(self): + return ModelEmbedMultiple() + + @pytest.mark.parametrize( + "model_fixture_name, getter", + [ + ("model_emb", lambda model: model.emb), + ("model_embed_in", lambda model: model.embed_in), + ("model", lambda model: model.model.model.embed_tokens), + ], + ) + def test_default_embedding_name_is_inferred_standalone(self, model_fixture_name, getter, request): + # make sure that the auto targeting works when `target_module=None` + base_model = request.getfixturevalue(model_fixture_name) + + peft_config = TrainableTokensConfig(target_modules=None, token_indices=[0, 1, 3]) + peft_model = get_peft_model(base_model, peft_config) + + assert isinstance(getter(peft_model), TrainableTokensLayer) + + @pytest.mark.parametrize( + "model_fixture_name, getter", + [ + ("model_emb", lambda model: model.emb), + ("model_embed_in", lambda model: model.embed_in), + ("model", lambda model: model.model.model.embed_tokens), + ], + ) + def test_default_embedding_name_is_inferred_combined(self, model_fixture_name, getter, request): + # make sure that the auto targeting works when `target_module=None` + base_model = request.getfixturevalue(model_fixture_name) + + peft_config = LoraConfig(target_modules="all-linear", trainable_token_indices=[0, 1, 3]) + peft_model = get_peft_model(base_model, peft_config) + + assert isinstance(getter(peft_model), TrainableTokensWrapper) + + def test_default_embedding_name_cannot_be_inferred(self, 
model_embed_in_no_get): + # should default to default value `embed_tokens` which is not present in this model + base_model = model_embed_in_no_get + + peft_config = TrainableTokensConfig(target_modules=None, token_indices=[0, 1, 3]) + + with pytest.raises(ValueError) as e: + peft_model = get_peft_model(base_model, peft_config) + + assert "Target modules embed_tokens not found in the base model." in str(e) + + def test_embedding_name_is_used_when_given_standalone(self, model_embed_multiple): + peft_config = TrainableTokensConfig(target_modules="embed_in_2", token_indices=[0, 1, 3]) + peft_model = get_peft_model(model_embed_multiple, peft_config) + + assert isinstance(peft_model.model.embed_in_2, TrainableTokensLayer) + assert not isinstance(peft_model.model.embed_in, TrainableTokensLayer) + + def test_embedding_name_is_used_when_given_combined(self, model_embed_multiple): + peft_config = LoraConfig(target_modules="all-linear", trainable_token_indices={"embed_in_2": [0, 1, 3]}) + peft_model = get_peft_model(model_embed_multiple, peft_config) + + assert isinstance(peft_model.model.embed_in_2, TrainableTokensWrapper) + assert not isinstance(peft_model.model.embed_in, TrainableTokensWrapper) + + @pytest.mark.parametrize("resize_embedding", [True, False]) + @pytest.mark.parametrize( + "peft_config", + [ + LoraConfig(target_modules="all-linear", trainable_token_indices=[1, 2, 3]), + TrainableTokensConfig(target_modules=None, token_indices=[1, 2, 3]), + ], + ) + def test_save_pretrained_auto(self, model, resize_embedding, peft_config, tmp_path): + # make sure that embeddings are saved alongside trainable token weights but only when + # the we detect the embedding to be resized (as detected by save_embedding_layers="auto") + if resize_embedding: + model.resize_token_embeddings(model.config.vocab_size + 2) + peft_model = get_peft_model(model, peft_config) + + peft_model.save_pretrained(tmp_path, save_embedding_layers="auto") + state_dict = safe_load_file(tmp_path / 
"adapter_model.safetensors") + + if isinstance(peft_config, TrainableTokensConfig): + contains_embedding = "base_model.model.model.embed_tokens.base_layer.weight" in state_dict + else: + contains_embedding = "base_model.model.model.embed_tokens.token_adapter.base_layer.weight" in state_dict + + if resize_embedding: + assert contains_embedding + else: + assert not contains_embedding diff --git a/peft/tests/test_tuners_utils.py b/peft/tests/test_tuners_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..499a4a5502ed765c4b85170fa2a2eae85a45d05b --- /dev/null +++ b/peft/tests/test_tuners_utils.py @@ -0,0 +1,2182 @@ +#!/usr/bin/env python3 + +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import dataclasses +import re +import unittest +from copy import deepcopy + +import pytest +import torch +from diffusers import StableDiffusionPipeline +from parameterized import parameterized +from torch import nn +from transformers import ( + AutoModel, + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification, + BitsAndBytesConfig, +) +from transformers.pytorch_utils import Conv1D + +from peft import ( + AdaptionPromptConfig, + IA3Config, + LoHaConfig, + LoraConfig, + PeftModel, + PromptTuningConfig, + VeraConfig, + get_layer_status, + get_model_status, + get_peft_model, +) +from peft.tuners.lora.layer import LoraLayer +from peft.tuners.tuners_utils import ( + BaseTuner, + BaseTunerLayer, + _maybe_include_all_linear_layers, + check_target_module_exists, + inspect_matched_modules, +) +from peft.tuners.tuners_utils import ( + _find_minimal_target_modules as find_minimal_target_modules, +) +from peft.utils import INCLUDE_LINEAR_LAYERS_SHORTHAND, ModulesToSaveWrapper, infer_device +from peft.utils.constants import DUMMY_MODEL_CONFIG, MIN_TARGET_MODULES_FOR_OPTIMIZATION + +from .testing_utils import hub_online_once, require_bitsandbytes, require_non_cpu + + +# Implements tests for regex matching logic common for all BaseTuner subclasses, and +# tests for correct behaviour with different config kwargs for BaseTuners (Ex: feedforward for IA3, etc) and +# tests for utility function to include all linear layers + +REGEX_TEST_CASES = [ + # tuple of + # 1. key + # 2. target_modules + # 3. layers_to_transform + # 4. layers_pattern + # 5. 
expected result + # some basic examples + ("", [], None, None, False), + ("", ["foo"], None, None, False), + ("foo", [], None, None, False), + ("foo", ["foo"], None, None, True), + ("foo", ["bar"], None, None, False), + ("foo", ["foo", "bar"], None, None, True), + # with regex + ("foo", "foo", None, None, True), + ("foo", ".*oo", None, None, True), + ("foo", "fo.*", None, None, True), + ("foo", ".*bar.*", None, None, False), + ("foobar", ".*oba.*", None, None, True), + # with layers_to_transform + ("foo.bar.1.baz", ["baz"], [1], ["bar"], True), + ("foo.bar.1.baz", ["baz"], [0], ["bar"], False), + ("foo.bar.1.baz", ["baz"], [2], ["bar"], False), + ("foo.bar.10.baz", ["baz"], [0], ["bar"], False), + ("foo.bar.10.baz", ["baz"], [1], ["bar"], False), + ("foo.bar.1.baz", ["baz"], [0, 1, 2], ["bar"], True), + ("foo.bar.1.baz", ["baz", "spam"], [1], ["bar"], True), + ("foo.bar.1.baz", ["baz", "spam"], [0, 1, 2], ["bar"], True), + # empty layers_pattern + ("foo.whatever.1.baz", ["baz"], [1], [], True), + ("foo.whatever.1.baz", ["baz"], [0], [], False), + ("foo.whatever.1.baz", ["baz"], [1], "", True), + ("foo.whatever.1.baz", ["baz"], [0], "", False), + ("foo.whatever.1.baz", ["baz"], [1], None, True), + ("foo.whatever.1.baz", ["baz"], [0], None, False), + # some realistic examples: transformers model + ("transformer.h.1.attn.attention.q_proj.foo", ["q_proj"], None, [], False), + ("transformer.h.1.attn.attention.q_proj", [], None, [], False), + ("transformer.h.1.attn.attention.q_proj", ["q_proj"], None, [], True), + ("transformer.h.1.attn.attention.q_proj", ["q_proj", "v_proj"], None, [], True), + ("transformer.h.1.attn.attention.resid_dropout", ["q_proj", "v_proj"], None, [], False), + ("transformer.h.1.attn.attention.q_proj", ["q_proj"], [1], ["h"], True), + ("transformer.h.1.attn.attention.q_proj", ["q_proj"], [0], ["h"], False), + ("transformer.h.1.attn.attention.q_proj", ["q_proj"], [2], ["h"], False), + ("transformer.h.1.attn.attention.q_proj", ["q_proj"], [0, 1, 2], 
["h"], True), + ("transformer.h.1.attn.attention.q_proj", ["q_proj", "v_proj"], [0, 1, 2], ["h"], True), + ("foo.bar.q_proj", ["q_proj"], None, [], True), + ("foo.bar.1.baz", ["baz"], [1], ["foo"], False), + # other corner cases. For ex, below is a case where layers_pattern + # is one of the target nn.modules + ("foo.bar.1.baz", ["baz"], [1], ["baz"], False), + # here, layers_pattern is 'bar', but only keys that contain '.bar' are valid. + ("bar.1.baz", ["baz"], [1], ["bar"], False), + ("foo.bar.001.baz", ["baz"], [1], ["bar"], True), + ("foo.bar.1.spam.2.baz", ["baz"], [1], ["bar"], True), + ("foo.bar.2.spam.1.baz", ["baz"], [1], ["bar"], False), + # some realistic examples: module using nn.Sequential + # for the below test case, key should contain '.blocks' to be valid, because of how layers_pattern is matched + ("blocks.1.weight", ["weight"], [1], ["blocks"], False), + ("blocks.1.bias", ["weight"], [1], ["blocks"], False), + ("mlp.blocks.1.weight", ["weight"], [1], ["blocks"], True), + ("mlp.blocks.1.bias", ["weight"], [1], ["blocks"], False), +] + +MAYBE_INCLUDE_ALL_LINEAR_LAYERS_TEST_CASES = [ + # model_name, model_type, initial_target_modules, expected_target_modules + # test for a causal Llama model + ( + "HuggingFaceH4/tiny-random-LlamaForCausalLM", + "causal", + INCLUDE_LINEAR_LAYERS_SHORTHAND, + ["k_proj", "v_proj", "q_proj", "o_proj", "down_proj", "up_proj", "gate_proj"], + ), + # test for a Llama model without the LM head + ( + "HuggingFaceH4/tiny-random-LlamaForCausalLM", + "base", + INCLUDE_LINEAR_LAYERS_SHORTHAND, + ["k_proj", "v_proj", "q_proj", "o_proj", "down_proj", "up_proj", "gate_proj"], + ), + # test for gpt2 with Conv1D layers + ("hf-internal-testing/tiny-random-gpt2", "causal", INCLUDE_LINEAR_LAYERS_SHORTHAND, ["c_attn", "c_proj", "c_fc"]), + # test for T5 model + ( + "hf-internal-testing/tiny-random-t5", + "seq2seq", + INCLUDE_LINEAR_LAYERS_SHORTHAND, + ["k", "q", "v", "o", "wi", "wo"], + ), + # test for GPTNeoX. 
output module list should exclude classification head - which is named as "embed_out" instead of the usual "lm_head" for GPTNeoX + ( + "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", + "causal", + INCLUDE_LINEAR_LAYERS_SHORTHAND, + ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"], + ), +] + +# tests for a few args that should remain unchanged +MAYBE_INCLUDE_ALL_LINEAR_LAYERS_TEST_INTERNALS = [ + # initial_target_modules, expected_target_modules + (["k_proj"], ["k_proj"]), + # test with target_modules as None + (None, None), + # test with target_modules as a regex expression + (".*(q_proj|v_proj)$", ".*(q_proj|v_proj)$"), +] + +BNB_QUANTIZATIONS = [("4bit",), ("8bit",)] +BNB_TEST_CASES = [(x + y) for x in MAYBE_INCLUDE_ALL_LINEAR_LAYERS_TEST_CASES for y in BNB_QUANTIZATIONS] + + +class PeftCustomKwargsTester(unittest.TestCase): + r""" + Test if the PeftModel is instantiated with correct behaviour for custom kwargs. This includes: + - test if regex matching works correctly + - test if adapters handle custom kwargs the right way e.g. IA3 for `feedforward_modules` + + """ + + transformers_class_map = {"causal": AutoModelForCausalLM, "seq2seq": AutoModelForSeq2SeqLM, "base": AutoModel} + + @parameterized.expand(REGEX_TEST_CASES) + def test_regex_matching_valid(self, key, target_modules, layers_to_transform, layers_pattern, expected_result): + # We use a LoRA Config for testing, but the regex matching function is common for all BaseTuner subclasses. + # example model_id for config initialization. 
key is matched only against the target_modules given, so this can be any model + model_id = "peft-internal-testing/tiny-OPTForCausalLM-lora" + config = LoraConfig( + base_model_name_or_path=model_id, + target_modules=target_modules, + layers_pattern=layers_pattern, + layers_to_transform=layers_to_transform, + ) + actual_result = bool(check_target_module_exists(config, key)) + assert actual_result == expected_result + + def test_module_matching_lora(self): + # peft models that have a module matching method to inspect the matching modules to allow + # users to easily debug their configuration. Here we only test a single case, not all possible combinations of + # configs that could exist. This is okay as the method calls `check_target_module_exists` internally, which + # has been extensively tested above. + model_id = "hf-internal-testing/tiny-random-BloomForCausalLM" + with hub_online_once(model_id): + model = AutoModel.from_pretrained(model_id) + # by default, this model matches query_key_value + config = LoraConfig() + peft_model = get_peft_model(model, config) + + output = inspect_matched_modules(peft_model) # inspects default adapter for peft_model + matched = output["matched"] + expected = [ + "h.0.self_attention.query_key_value", + "h.1.self_attention.query_key_value", + "h.2.self_attention.query_key_value", + "h.3.self_attention.query_key_value", + "h.4.self_attention.query_key_value", + ] + assert matched == expected # module lists should match exactly + + # no overlap with matched modules + unmatched = output["unmatched"] + for key in expected: + assert key not in unmatched + + def test_feedforward_matching_ia3(self): + model_id = "hf-internal-testing/tiny-random-T5ForConditionalGeneration" + with hub_online_once(model_id): + model = AutoModelForSeq2SeqLM.from_pretrained(model_id) + # simple example for just one t5 block for testing + config_kwargs = { + "target_modules": ".*encoder.*block.0.*(SelfAttention|EncDecAttention|DenseReluDense).(k|q|v|wo|wi)$", + 
"feedforward_modules": ["wo", "wi"], + } + config = IA3Config(base_model_name_or_path=model_id, **config_kwargs) + peft_model = get_peft_model(model, config) + output = inspect_matched_modules(peft_model) # inspects default adapter for peft_model + matched = output["matched"] + expected = [ + "encoder.block.0.layer.0.SelfAttention.q", + "encoder.block.0.layer.0.SelfAttention.k", + "encoder.block.0.layer.0.SelfAttention.v", + "encoder.block.0.layer.1.DenseReluDense.wi", + "encoder.block.0.layer.1.DenseReluDense.wo", + ] + expected_feedforward = [ + "encoder.block.0.layer.1.DenseReluDense.wi", + "encoder.block.0.layer.1.DenseReluDense.wo", + ] + assert matched == expected # not required since we do similar checks above, but just to be sure + module_dict = dict(model.named_modules()) + for key in matched: + module = module_dict[key] + if key in expected_feedforward: + assert module.is_feedforward + else: # other IA3 modules should not be marked as feedforward + assert not module.is_feedforward + + @parameterized.expand(MAYBE_INCLUDE_ALL_LINEAR_LAYERS_TEST_CASES) + def test_maybe_include_all_linear_layers_lora( + self, model_id, model_type, initial_target_modules, expected_target_modules + ): + with hub_online_once(model_id): + model = self.transformers_class_map[model_type].from_pretrained(model_id) + config_cls = LoraConfig + self._check_match_with_expected_target_modules( + model_id, model, config_cls, initial_target_modules, expected_target_modules + ) + + @parameterized.expand(BNB_TEST_CASES) + @require_non_cpu + @require_bitsandbytes + def test_maybe_include_all_linear_layers_lora_bnb( + self, model_id, model_type, initial_target_modules, expected_target_modules, quantization + ): + if quantization == "4bit": + config_kwargs = {"quantization_config": BitsAndBytesConfig(load_in_4bit=True)} + elif quantization == "8bit": + config_kwargs = {"quantization_config": BitsAndBytesConfig(load_in_8bit=True)} + + with hub_online_once(model_id): + model = 
self.transformers_class_map[model_type].from_pretrained( + model_id, device_map="auto", **config_kwargs + ) + config_cls = LoraConfig + self._check_match_with_expected_target_modules( + model_id, model, config_cls, initial_target_modules, expected_target_modules + ) + + def _check_match_with_expected_target_modules( + self, model_id, model, config_cls, initial_target_modules, expected_target_modules + ): + """ + Helper function for the test for `_maybe_include_all_linear_layers` + """ + actual_config = config_cls(base_model_name_or_path=model_id, target_modules=initial_target_modules) + expected_config = config_cls(base_model_name_or_path=model_id, target_modules=expected_target_modules) + model_copy = deepcopy(model) + actual_model = get_peft_model(model, peft_config=actual_config) + expected_model = get_peft_model(model_copy, peft_config=expected_config) + expected_model_module_dict = dict(expected_model.named_modules()) + # compare the two models and assert that all layers are of the same type + for name, actual_module in actual_model.named_modules(): + expected_module = expected_model_module_dict[name] + assert type(actual_module) is type(expected_module) + + def test_maybe_include_all_linear_layers_ia3_loha(self): + model_id, initial_target_modules, expected_target_modules = ( + "HuggingFaceH4/tiny-random-LlamaForCausalLM", + INCLUDE_LINEAR_LAYERS_SHORTHAND, + ["k_proj", "v_proj", "q_proj", "o_proj", "down_proj", "up_proj", "gate_proj"], + ) + with hub_online_once(model_id): + model_ia3 = AutoModelForCausalLM.from_pretrained(model_id) + model_loha = deepcopy(model_ia3) + config_classes = [IA3Config, LoHaConfig] + models = [model_ia3, model_loha] + for config_cls, model in zip(config_classes, models): + self._check_match_with_expected_target_modules( + model_id, model, config_cls, initial_target_modules, expected_target_modules + ) + + @parameterized.expand(MAYBE_INCLUDE_ALL_LINEAR_LAYERS_TEST_INTERNALS) + def 
def test_maybe_include_all_linear_does_not_target_classifier_head(self):
    """With `target_modules="all-linear"` on a SEQ_CLS model, the classification head must not get an adapter.

    See issue 2027. The head (`score`) is expected to be wrapped as a module to save instead of being targeted by
    LoRA.
    """
    model_id = "HuggingFaceH4/tiny-random-LlamaForCausalLM"
    linear_types = (nn.Linear, Conv1D)
    with hub_online_once(model_id):
        base_model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=10)
        # sanity check: the head starts out as a plain linear layer
        assert isinstance(base_model.score, nn.Linear)

        # count before wrapping, the head is included in this number
        linear_count = sum(isinstance(module, linear_types) for module in base_model.modules())

        lora_config = LoraConfig(task_type="SEQ_CLS", target_modules="all-linear")
        peft_model = get_peft_model(base_model, lora_config)
        head = peft_model.base_model.score
        assert isinstance(head, ModulesToSaveWrapper)

        # the bug was that these were lora.Linear instances
        assert isinstance(head.original_module, nn.Linear)
        assert isinstance(head.modules_to_save["default"], nn.Linear)

        # every linear layer except the head must have been targeted by LoRA
        lora_count = sum(isinstance(module, LoraLayer) for module in peft_model.modules())
        assert lora_count == linear_count - 1
def test_add_second_adapter_with_all_linear_works(self):
    """Adding a second `all-linear` adapter through `add_adapter` must not create nested adapter layers.

    See 2390. Similar to `test_all_linear_nested_targets_correct_layers`, but uses `add_adapter` instead of calling
    `get_peft_model` on an already adapted model.
    """
    model_id = "HuggingFaceH4/tiny-random-LlamaForCausalLM"
    with hub_online_once(model_id):
        model = AutoModelForCausalLM.from_pretrained(model_id)

        # important: don't reuse the first config, since config.target_modules will be overwritten, which would
        # make the test pass trivially.
        config0 = LoraConfig(target_modules=INCLUDE_LINEAR_LAYERS_SHORTHAND)
        config1 = LoraConfig(target_modules=INCLUDE_LINEAR_LAYERS_SHORTHAND)

        model = get_peft_model(model, config0)
        model.add_adapter(adapter_name="other", peft_config=config1)

        # both configs should result in the same target modules being chosen (remember that config.target_modules
        # will be replaced by the actual set of target_modules)
        assert config0.target_modules == config1.target_modules

        for layer in model.base_model.model.model.layers:
            # all linear projections of a decoder layer; o_proj is part of the all-linear expansion as well, so it
            # is checked together with the others
            projs = (
                layer.self_attn.q_proj,
                layer.self_attn.k_proj,
                layer.self_attn.v_proj,
                layer.self_attn.o_proj,
                layer.mlp.gate_proj,
                layer.mlp.up_proj,
                layer.mlp.down_proj,
            )
            for proj in projs:
                # the targeted layer itself, which in the base model was the nn.Linear layer, is now a LoraLayer
                assert isinstance(proj, LoraLayer)
                # all children of that layer are still normal nn.Linear layers, i.e. nothing was nested
                assert isinstance(proj.base_layer, nn.Linear)
                for adapter_name in ("default", "other"):
                    assert isinstance(proj.lora_A[adapter_name], nn.Linear)
                    assert isinstance(proj.lora_B[adapter_name], nn.Linear)
class TestTargetedModuleNames(unittest.TestCase):
    """Verify that `targeted_module_names` lists exactly the modules an adapter was applied to.

    Only LoRA and IA³ are exercised here; this is considered sufficient, the other tuners are not tested.
    """

    def test_one_targeted_module_regex(self):
        config = LoraConfig(target_modules="lin0")
        peft_model = get_peft_model(MLP(), config)
        assert peft_model.targeted_module_names == ["lin0"]

    def test_two_targeted_module_regex(self):
        config = LoraConfig(target_modules="lin.*")
        peft_model = get_peft_model(MLP(), config)
        assert peft_model.targeted_module_names == ["lin0", "lin1"]

    def test_one_targeted_module_list(self):
        config = LoraConfig(target_modules=["lin0"])
        peft_model = get_peft_model(MLP(), config)
        assert peft_model.targeted_module_names == ["lin0"]

    def test_two_targeted_module_list(self):
        config = LoraConfig(target_modules=["lin0", "lin1"])
        peft_model = get_peft_model(MLP(), config)
        assert peft_model.targeted_module_names == ["lin0", "lin1"]

    def test_ia3_targeted_module_regex(self):
        config = IA3Config(target_modules=".*lin.*", feedforward_modules=".*lin.*")
        peft_model = get_peft_model(MLP(), config)
        assert peft_model.targeted_module_names == ["lin0", "lin1"]

    def test_ia3_targeted_module_list(self):
        config = IA3Config(target_modules=["lin0", "lin1"], feedforward_modules=["lin0", "lin1"])
        peft_model = get_peft_model(MLP(), config)
        assert peft_model.targeted_module_names == ["lin0", "lin1"]

    def test_realistic_example(self):
        # no target_modules are passed; the expectation below is one query_key_value layer per transformer block
        model_id = "hf-internal-testing/tiny-random-BloomForCausalLM"
        with hub_online_once(model_id):
            base_model = AutoModelForCausalLM.from_pretrained(model_id)
            peft_model = get_peft_model(base_model, LoraConfig(task_type="CAUSAL_LM"))
            num_blocks = len(peft_model.base_model.transformer.h)
            expected = [f"transformer.h.{i}.self_attention.query_key_value" for i in range(num_blocks)]
            assert peft_model.targeted_module_names == expected
class TestTargetedParameterNames(unittest.TestCase):
    """Verify that `targeted_parameter_names` reflects what was requested via `target_parameters`.

    This is only implemented for LoRA. Regex matching is currently not implemented.
    """

    def test_one_targeted_parameters_list(self):
        config = LoraConfig(target_parameters=["lin0.weight"])
        peft_model = get_peft_model(MLP(), config)
        assert peft_model.targeted_parameter_names == ["lin0.weight"]

    def test_two_targeted_parameters_list(self):
        config = LoraConfig(target_parameters=["lin0.weight", "lin1.weight"])
        peft_model = get_peft_model(MLP(), config)
        assert peft_model.targeted_parameter_names == ["lin0.weight", "lin1.weight"]

    def test_realistic_example(self):
        # target_modules is explicitly empty, only the v_proj weights are requested through target_parameters
        model_id = "trl-internal-testing/tiny-random-LlamaForCausalLM"
        with hub_online_once(model_id):
            base_model = AutoModelForCausalLM.from_pretrained(model_id)
            config = LoraConfig(target_modules=[], task_type="CAUSAL_LM", target_parameters=["v_proj.weight"])
            peft_model = get_peft_model(base_model, config)
            num_layers = len(peft_model.base_model.model.model.layers)
            expected = [f"model.layers.{i}.self_attn.v_proj.weight" for i in range(num_layers)]
            assert peft_model.targeted_parameter_names == expected
+ """ + + def test_two_excluded_module_regex(self): + model = MLP() + model = get_peft_model(model, LoraConfig(target_modules=("lin.*"), exclude_modules="lin0")) + assert model.targeted_module_names == ["lin1"] + + def test_two_excluded_module_list(self): + model = MLP() + model = get_peft_model(model, LoraConfig(target_modules=["lin0", "lin1"], exclude_modules="lin0")) + assert model.targeted_module_names == ["lin1"] + + def test_multiple_excluded_modules_list(self): + model = MLP() + model = get_peft_model(model, LoraConfig(target_modules=["lin0", "lin1"], exclude_modules=["lin0"])) + assert model.targeted_module_names == ["lin1"] + + def test_ia3_two_excluded_module_regex(self): + model = MLP() + model = get_peft_model( + model, IA3Config(target_modules=".*lin.*", feedforward_modules=".*lin.*", exclude_modules="lin0") + ) + assert model.targeted_module_names == ["lin1"] + + def test_ia3_multiple_excluded_modules_list(self): + model = MLP() + model = get_peft_model( + model, IA3Config(target_modules=["lin0", "lin1"], feedforward_modules=".*lin.*", exclude_modules=["lin1"]) + ) + assert model.targeted_module_names == ["lin0"] + + def test_all_modules_excluded(self): + model = MLP() + with pytest.raises(ValueError, match="All modules were excluded"): + get_peft_model( + model, + LoraConfig( + target_modules=["lin0", "lin1", "relu", "drop", "sm"], + exclude_modules=["lin0", "lin1", "relu", "drop", "sm"], + ), + ) + + def test_no_modules_matched(self): + model = MLP() + with pytest.raises(ValueError, match="Target modules .* not found in the base model"): + get_peft_model(model, LoraConfig(target_modules=["non_existent_module"])) + + def test_some_modules_excluded_some_unmatched(self): + model = MLP() + with pytest.raises(ValueError, match="No modules were targeted for adaptation"): + get_peft_model(model, LoraConfig(target_modules=["lin0", "non_existent_module"], exclude_modules=["lin0"])) + + def test_exclude_modules_not_used(self): + model = MLP() + with 
def test_realistic_example(self):
    """Excluding the `query_key_value` layer of block 2 leaves the same layer of all other blocks targeted."""
    model_id = "hf-internal-testing/tiny-random-BloomForCausalLM"
    excluded_name = "transformer.h.2.self_attention.query_key_value"
    with hub_online_once(model_id):
        base_model = AutoModelForCausalLM.from_pretrained(model_id)
        config = LoraConfig(task_type="CAUSAL_LM", exclude_modules="transformer.h.2.self_attention.query_key_value")
        peft_model = get_peft_model(base_model, config)

        # build the names for every block first, then drop the excluded one
        num_blocks = len(peft_model.base_model.transformer.h)
        all_names = [f"transformer.h.{i}.self_attention.query_key_value" for i in range(num_blocks)]
        expected = [name for name in all_names if name != excluded_name]
        assert peft_model.targeted_module_names == expected
def test_enabled_irregular(self, large_model):
    """A single disabled adapter layer must show up as disabled in the per-layer status.

    This is an invalid state, but it should still be reported correctly.
    """
    # disable only the first adapter layer encountered, all others stay enabled
    tuner_layers = (module for module in large_model.modules() if isinstance(module, BaseTunerLayer))
    first_tuner_layer = next(tuner_layers, None)
    if first_tuner_layer is not None:
        first_tuner_layer.enable_adapters(False)

    enabled_per_layer = [status.enabled for status in large_model.get_layer_status()]
    assert enabled_per_layer == [False, True, True, True]
def test_active_adapters_large(self, large_model):
    """The per-layer active adapter follows the adapter that is activated on the model.

    First checks the initial state ("default" active on every layer), then switches to "other" and checks that
    every layer reports the switch.
    """
    layer_status = large_model.get_layer_status()
    result = [status.active_adapters for status in layer_status]
    # note: as currently implemented, the active adapter can be an adapter that does not exist on this specific
    # layer, for instance, layer 3 (i.e. index 2) only has the "other" adapter but "default" is still shown as the
    # active adapter
    expected = [["default"], ["default"], ["default"], ["default"]]
    assert result == expected

    # switch to "other"
    large_model.set_adapter("other")
    layer_status = large_model.get_layer_status()
    result = [status.active_adapters for status in layer_status]
    expected = [["other"], ["other"], ["other"], ["other"]]
    # without this assertion the second half of the test would not check anything
    assert result == expected
def test_requires_grad_large(self, large_model):
    """Per-layer `requires_grad` follows the active adapter and explicit `set_requires_grad` calls."""

    def requires_grad_per_layer():
        # always query a fresh status so that changes made to the model are picked up
        return [status.requires_grad for status in large_model.get_layer_status()]

    # default is on layer 0, 1, and 3, other is on layer 0 and 2
    default_trainable = [{"default": True, "other": False}, {"default": True}, {"other": False}, {"default": True}]
    other_trainable = [{"default": False, "other": True}, {"default": False}, {"other": True}, {"default": False}]

    assert requires_grad_per_layer() == default_trainable

    # now activate "other"
    large_model.set_adapter("other")
    assert requires_grad_per_layer() == other_trainable

    # change requires grad, is now inconsistent with active/inactive adapter
    large_model.set_requires_grad("default", requires_grad=True)
    large_model.set_requires_grad("other", requires_grad=False)
    assert requires_grad_per_layer() == default_trainable
def test_devices_all_cpu_large(self, large_model):
    """Without moving anything, every adapter of every layer reports "cpu" as its only device."""
    # one entry per adapter layer, keyed by the adapters present on that layer
    adapters_per_layer = [("default", "other"), ("default",), ("other",), ("default",)]
    expected = [{adapter_name: ["cpu"] for adapter_name in names} for names in adapters_per_layer]

    devices_per_layer = [status.devices for status in large_model.get_layer_status()]
    assert devices_per_layer == expected
@require_non_cpu
def test_devices_cpu_and_gpu_large(self, large_model):
    """An adapter whose weights are spread over two devices reports both devices for that layer."""
    # move only lora_A of the "default" adapter on lin0 to the accelerator; its remaining weights and all other
    # layers are left where they are, so only the first entry is expected to list two devices
    large_model.model.lin0.lora_A["default"] = large_model.model.lin0.lora_A["default"].to(self.torch_device)
    layer_status = large_model.get_layer_status()
    result = [status.devices for status in layer_status]
    expected = [
        {"default": ["cpu", self.torch_device], "other": ["cpu"]},
        {"default": ["cpu"]},
        {"other": ["cpu"]},
        {"default": ["cpu"]},
    ]
    assert result == expected
def test_nb_params_large(self, large_model):
    """The model status reports the expected trainable and total parameter counts for the large fixture."""
    # NOTE(review): the numbers are consistent with the following breakdown, assuming the default LoRA rank is 8
    # and that only the active "default" adapter is trainable -- confirm against the LoRA layer implementations:
    #   base model: 2 * Linear(10, 10) + 2 * Conv2d(3, 10, 3) + 2 * Embedding(10, 10) = 220 + 560 + 200 = 980
    #   "default" (r=8) on lin0, conv1, emb0: 160 + 296 + 160 = 616
    #   "other" (r=16) on lin0, lin1: 320 + 320 = 640
    expected_trainable = 616
    expected_total = 2236

    status = large_model.get_model_status()
    assert status.trainable_params == expected_trainable
    assert status.total_params == expected_total
def test_model_active_adapters_irregular(self, large_model):
    """If layers disagree about the active adapter, the model status reports "irregular".

    This is an invalid state, but it should still be reported correctly.
    """
    # switch the active adapter of a single layer from "default" to "other"; all remaining layers keep "default"
    for module in large_model.modules():
        if isinstance(module, BaseTunerLayer) and module.active_adapters == ["default"]:
            # set the private attribute directly so that only this one layer is affected
            module._active_adapter = "other"
            assert module.active_adapters == ["other"]
            break

    model_status = large_model.get_model_status()
    assert model_status.active_adapters == "irregular"
def test_model_merged_adapters_large(self, large_model):
    """The model status lists exactly the adapters that are currently merged."""

    def merged_adapters():
        # always query a fresh status so that merge/unmerge calls are reflected
        return large_model.get_model_status().merged_adapters

    # nothing is merged initially
    assert merged_adapters() == []

    large_model.merge_adapter(["default"])
    assert merged_adapters() == ["default"]

    # unmerge before merging a different adapter
    large_model.unmerge_adapter()
    large_model.merge_adapter(["other"])
    assert merged_adapters() == ["other"]

    large_model.unmerge_adapter()
    large_model.merge_adapter(["default", "other"])
    assert merged_adapters() == ["default", "other"]
test_model_requires_grad_model_irregular(self, large_model): + # inject an embedding layer with requires_grad=False + # this is an invalid state, but we should still test it + lora_embedding_A = nn.Parameter(torch.zeros(10, 10)) + lora_embedding_B = nn.Parameter(torch.zeros(10, 10)) + lora_embedding_A.requires_grad = False + lora_embedding_B.requires_grad = False + large_model.base_model.model.lin0.lora_embedding_A["default"] = lora_embedding_A + large_model.base_model.model.lin0.lora_embedding_B["default"] = lora_embedding_B + + model_status = large_model.get_model_status() + assert model_status.requires_grad == {"default": "irregular", "other": False} + + def test_model_requires_irregular_with_modules_to_save(self, small_base_model_cls): + # check that modules_to_save are correctly reported in layer status + model = small_base_model_cls() + config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + model = get_peft_model(model, config) + + # set modules_to_save to requires_grad=False + model.lin1.modules_to_save.default.weight.requires_grad = False + + model_status = model.get_model_status() + # since lin1 is still requires_grad=True, the overall model status is "irregular" + assert model_status.requires_grad == {"default": "irregular"} + + def test_model_requires_irregular_with_trainable_tokens(self, small_base_emb_model_cls): + # check that trainable_token_indices are correctly reported in layer status + model = small_base_emb_model_cls() + config = LoraConfig(target_modules=["lin0"], trainable_token_indices={"emb": [0, 1, 2]}) + model = get_peft_model(model, config) + + # set trainable tokens to requires_grad=False + model.emb.token_adapter.trainable_tokens_delta.default.requires_grad = False + + model_status = model.get_model_status() + # since emb is still requires_grad=True, the overall model status is "irregular" + assert model_status.requires_grad == {"default": "irregular"} + + def test_model_available_adapters_small(self, small_model): + 
model_status = small_model.get_model_status() + assert model_status.available_adapters == ["default"] + + def test_model_available_adapters_large(self, large_model): + model_status = large_model.get_model_status() + assert model_status.available_adapters == ["default", "other"] + + def test_model_devices_all_cpu_small(self, small_model): + model_status = small_model.get_model_status() + assert model_status.devices == {"default": ["cpu"]} + + def test_model_devices_all_cpu_large(self, large_model): + model_status = large_model.get_model_status() + assert model_status.devices == {"default": ["cpu"], "other": ["cpu"]} + + @require_non_cpu + def test_model_devices_all_gpu_large(self, large_model): + large_model.to(self.torch_device) + model_status = large_model.get_model_status() + assert model_status.devices == {"default": [self.torch_device], "other": [self.torch_device]} + + @require_non_cpu + def test_model_devices_cpu_and_gpu_large(self, large_model): + # move only the "default" lora_A of lin0 to the accelerator, everything else stays on CPU + large_model.model.lin0.lora_A["default"] = large_model.model.lin0.lora_A["default"].to(self.torch_device) + model_status = large_model.get_model_status() + assert model_status.devices == {"default": ["cpu", self.torch_device], "other": ["cpu"]} + + def test_model_target_parameters(self, large_model): + # don't check each attribute, just the relevant ones + # first remove the normal LoRA layers + large_model = large_model.merge_and_unload() + config = LoraConfig(target_parameters=["lin0.weight", "lin1.weight"]) + large_model = get_peft_model(large_model, config) + model_status = large_model.get_model_status() + model_status = large_model.get_model_status() + assert model_status.adapter_model_type == "LoraModel" + assert model_status.peft_types == {"default": "LORA", "other": "LORA"} + assert model_status.num_adapter_layers == 2 + assert model_status.trainable_params == 2 * (8 * 10 + 10 * 8) + + def test_model_target_parameters_and_target_modules(self, large_model): + # don't check 
each attribute, just the relevant ones + # first remove the normal LoRA layers + large_model = large_model.merge_and_unload() + config = LoraConfig(target_parameters=["lin0.weight"], target_modules=["lin1"]) + large_model = get_peft_model(large_model, config) + model_status = large_model.get_model_status() + assert model_status.adapter_model_type == "LoraModel" + assert model_status.peft_types == {"default": "LORA", "other": "LORA"} + assert model_status.num_adapter_layers == 2 + assert model_status.trainable_params == 2 * (8 * 10 + 10 * 8) + + def test_model_status_with_modules_to_save(self, small_base_model_cls): + # check that modules_to_save are correctly reported in layer status + model = small_base_model_cls() + num_base_params = sum(p.numel() for p in small_base_model_cls().parameters()) + config = LoraConfig(target_modules=["lin0"], modules_to_save=["lin1"]) + model = get_peft_model(model, config) + model_status = model.get_model_status() + + assert model_status.base_model_type == "SmallModel" + assert model_status.adapter_model_type == "LoraModel" + assert model_status.peft_types == {"default": "LORA"} + # 2 x 80 for LoRA, 100 for modules_to_save.weight, 10 for modules_to_save.bias + assert model_status.trainable_params == 2 * 80 + 100 + 10 + assert model_status.total_params == 2 * 80 + 100 + 10 + num_base_params + assert model_status.num_adapter_layers == 2 # lin0 + lin1 + assert model_status.enabled is True + assert model_status.active_adapters == ["default"] + assert model_status.merged_adapters == [] + assert model_status.requires_grad == {"default": True} + assert model_status.available_adapters == ["default"] + assert model_status.devices == {"default": ["cpu"]} # all on CPU + + def test_model_status_with_trainable_tokens(self, small_base_emb_model_cls): + # check that trainable_token_indices are correctly reported in layer status + model = small_base_emb_model_cls() + num_base_params = sum(p.numel() for p in small_base_emb_model_cls().parameters()) 
+ config = LoraConfig(target_modules=["lin0"], trainable_token_indices={"emb": [0, 1, 2]}) + model = get_peft_model(model, config) + model_status = model.get_model_status() + + assert model_status.base_model_type == "SmallEmbModel" + assert model_status.adapter_model_type == "LoraModel" + assert model_status.peft_types == {"default": "LORA"} + # 2 x 80 for LoRA, 3 x 10 for trainable tokens + assert model_status.trainable_params == 2 * 80 + 3 * 10 + assert model_status.total_params == 2 * 80 + 3 * 10 + num_base_params + assert model_status.num_adapter_layers == 2 + assert model_status.enabled is True + assert model_status.active_adapters == ["default"] + assert model_status.merged_adapters == [] + assert model_status.requires_grad == {"default": True} + assert model_status.available_adapters == ["default"] + assert model_status.devices == {"default": ["cpu"]} # all on CPU + + def test_loha_model(self): + # ensure that this also works with non-LoRA, it's not necessary to test all tuners + class SmallModel(nn.Module): + def __init__(self): + super().__init__() + self.lin0 = nn.Linear(10, 10) + self.lin1 = nn.Linear(10, 10) + + base_model = SmallModel() + config = LoHaConfig(target_modules=["lin0", "lin1"], init_weights=False) + model = get_peft_model(base_model, config) + + model_status = model.get_model_status() + layer_status = model.get_layer_status() + + assert model_status.base_model_type == "SmallModel" + assert model_status.adapter_model_type == "LoHaModel" + assert model_status.peft_types == {"default": "LOHA"} + assert model_status.trainable_params == 640 + assert model_status.total_params == 860 + assert model_status.num_adapter_layers == 2 + assert model_status.enabled is True + assert model_status.active_adapters == ["default"] + assert model_status.merged_adapters == [] + assert model_status.requires_grad == {"default": True} + assert model_status.available_adapters == ["default"] + assert model_status.devices == {"default": ["cpu"]} + + layer_status0 = 
layer_status[0] + assert len(layer_status) == 2 + assert layer_status0.name == "model.lin0" + assert layer_status0.module_type == "loha.Linear" + assert layer_status0.enabled is True + assert layer_status0.active_adapters == ["default"] + assert layer_status0.merged_adapters == [] + assert layer_status0.requires_grad == {"default": True} + assert layer_status0.available_adapters == ["default"] + assert layer_status0.devices == {"default": ["cpu"]} + + @require_non_cpu + def test_vera_model(self): + # let's also test VeRA because it uses BufferDict + class SmallModel(nn.Module): + def __init__(self): + super().__init__() + self.lin0 = nn.Linear(10, 10) + self.lin1 = nn.Linear(10, 10) + + base_model = SmallModel() + config = VeraConfig(target_modules=["lin0", "lin1"], init_weights=False) + model = get_peft_model(base_model, config) + + # move the buffer dict to GPU + model.lin0.vera_A["default"] = model.lin0.vera_A["default"].to(self.torch_device) + + model_status = model.get_model_status() + layer_status = model.get_layer_status() + + assert model_status.base_model_type == "SmallModel" + assert model_status.adapter_model_type == "VeraModel" + assert model_status.peft_types == {"default": "VERA"} + assert model_status.trainable_params == 532 + assert model_status.total_params == 752 + assert model_status.num_adapter_layers == 2 + assert model_status.enabled is True + assert model_status.active_adapters == ["default"] + assert model_status.merged_adapters == [] + assert model_status.requires_grad == {"default": True} + assert model_status.available_adapters == ["default"] + assert model_status.devices == {"default": ["cpu", self.torch_device]} + + layer_status0 = layer_status[0] + assert len(layer_status) == 2 + assert layer_status0.name == "model.lin0" + assert layer_status0.module_type == "vera.Linear" + assert layer_status0.enabled is True + assert layer_status0.active_adapters == ["default"] + assert layer_status0.merged_adapters == [] + assert 
layer_status0.requires_grad == {"default": True} + assert layer_status0.available_adapters == ["default"] + assert layer_status0.devices == {"default": ["cpu", self.torch_device]} + + ################### + # non-PEFT models # + ################### + + def test_transformers_model(self): + model_id = "peft-internal-testing/gpt2-lora-random" + # note that loading through AutoModelForCausalLM.from_pretrained does not enable training mode, hence + # requires_grad=False + with hub_online_once(model_id): + model = AutoModelForCausalLM.from_pretrained(model_id) + model_status = get_model_status(model) + layer_status = get_layer_status(model) + + assert model_status.base_model_type == "GPT2LMHeadModel" + assert model_status.adapter_model_type == "None" + assert model_status.peft_types == {} + assert model_status.trainable_params == 0 + assert model_status.total_params == 124734720 + assert model_status.num_adapter_layers == 12 + assert model_status.enabled is True + assert model_status.active_adapters == ["default"] + assert model_status.merged_adapters == [] + assert model_status.requires_grad == {"default": False} + assert model_status.available_adapters == ["default"] + assert model_status.devices == {"default": ["cpu"]} + + layer_status0 = layer_status[0] + assert len(layer_status) == 12 + assert layer_status0.name == "transformer.h.0.attn.c_attn" + assert layer_status0.module_type == "lora.Linear" + assert layer_status0.enabled is True + assert layer_status0.active_adapters == ["default"] + assert layer_status0.merged_adapters == [] + assert layer_status0.requires_grad == {"default": False} + assert layer_status0.available_adapters == ["default"] + assert layer_status0.devices == {"default": ["cpu"]} + + def test_model_with_injected_layers(self, large_model): + model = large_model.base_model.model + model_status = get_model_status(model) + layer_status = get_layer_status(model) + + assert model_status.base_model_type == "other" + assert model_status.adapter_model_type 
== "None" + assert model_status.peft_types == {} + assert model_status.trainable_params == 616 + assert model_status.total_params == 2236 + assert model_status.num_adapter_layers == 4 + assert model_status.enabled is True + assert model_status.active_adapters == ["default"] + assert model_status.merged_adapters == [] + assert model_status.requires_grad == {"default": True, "other": False} + assert model_status.available_adapters == ["default", "other"] + assert model_status.devices == {"default": ["cpu"], "other": ["cpu"]} + + layer_status1 = layer_status[1] + assert len(layer_status) == 4 + assert layer_status1.name == "emb0" + assert layer_status1.module_type == "lora.Embedding" + assert layer_status1.enabled is True + assert layer_status1.active_adapters == ["default"] + assert layer_status1.merged_adapters == [] + assert layer_status1.requires_grad == {"default": True} + assert layer_status1.available_adapters == ["default"] + assert layer_status1.devices == {"default": ["cpu"]} + + ############### + # error cases # + ############### + + def test_vanilla_model_raises(self): + model = nn.Linear(10, 10) + # note: full error message is longer + with pytest.raises(ValueError, match="No adapter layers found in the model"): + get_layer_status(model) + + with pytest.raises(ValueError, match="No adapter layers found in the model"): + get_model_status(model) + + def test_transformer_model_without_adapter_raises(self): + model_id = "gpt2" + with hub_online_once(model_id): + model = AutoModelForCausalLM.from_pretrained(model_id) + # note: full error message is longer + with pytest.raises(ValueError, match="No adapter layers found in the model"): + get_layer_status(model) + + with pytest.raises(ValueError, match="No adapter layers found in the model"): + get_model_status(model) + + def test_prefix_tuning(self): + model_id = "hf-internal-testing/tiny-random-BartForConditionalGeneration" + with hub_online_once(model_id): + model = 
AutoModelForSeq2SeqLM.from_pretrained(model_id) + config = PromptTuningConfig(task_type="SEQ_2_SEQ_LM", num_virtual_tokens=10) + model = get_peft_model(model, config) + + # note: full error message is longer + with pytest.raises(TypeError, match=re.escape("get_layer_status() got an invalid PeftModel instance")): + model.get_layer_status() + + with pytest.raises(TypeError, match=re.escape("get_model_status() got an invalid PeftModel instance")): + model.get_model_status() + + def test_adaption_prompt(self): + model_id = "HuggingFaceH4/tiny-random-LlamaForCausalLM" + with hub_online_once(model_id): + model = AutoModelForCausalLM.from_pretrained(model_id) + config = AdaptionPromptConfig(adapter_layers=1, adapter_len=4) + model = get_peft_model(model, config) + + # note: full error message is longer + with pytest.raises(TypeError, match=re.escape("get_layer_status() got an invalid PeftModel instance")): + model.get_layer_status() + + with pytest.raises(TypeError, match=re.escape("get_model_status() got an invalid PeftModel instance")): + model.get_model_status() + + def test_mixed_model_raises(self): + class SimpleNet(nn.Module): + def __init__(self, bias=True): + super().__init__() + # note: out_features must be > rank or else OFT will be an identity transform + self.lin0 = nn.Linear(10, 20, bias=bias) + self.relu = nn.ReLU() + self.lin1 = nn.Linear(20, 16, bias=bias) + + def forward(self, X): + X = X.float() + X = self.lin0(X) + X = self.relu(X) + X = self.lin1(X) + return X + + base_model = SimpleNet() + config0 = LoraConfig(target_modules=["lin0"], init_lora_weights=False) + config1 = LoHaConfig(target_modules=["lin0", "lin1"], init_weights=False) + model = get_peft_model(base_model, config0, adapter_name="adapter0", mixed="mixed") + model.add_adapter("adapter1", config1) + + # note: full error message is longer + with pytest.raises(TypeError, match="get_layer_status is not supported for PeftMixedModel"): + model.get_layer_status() + + with pytest.raises(TypeError, 
match="get_model_status is not supported for PeftMixedModel"): + model.get_model_status() + + +# Tests for BaseTuner +class MockModelConfig: + config = {"mock_key": "mock_value"} + + def to_dict(self): + return self.config + + +@dataclasses.dataclass +class MockModelDataclassConfig: + mock_key: str + + +class ModelWithConfig(nn.Module): + def __init__(self): + self.config = MockModelConfig() + + +class ModelWithDictConfig(nn.Module): + def __init__(self): + self.config = MockModelConfig.config + + +class ModelWithDataclassConfig(nn.Module): + def __init__(self): + self.config = MockModelDataclassConfig(**MockModelConfig().to_dict()) + + +class ModelWithNoConfig(nn.Module): + pass + + +class TestBaseTunerGetModelConfig(unittest.TestCase): + def test_get_model_config_use_to_dict(self): + config = BaseTuner.get_model_config(ModelWithConfig()) + assert config == MockModelConfig.config + + def test_get_model_config_as_dict(self): + config = BaseTuner.get_model_config(ModelWithDictConfig()) + assert config == MockModelConfig.config + + def test_get_model_config_with_no_config(self): + config = BaseTuner.get_model_config(ModelWithNoConfig()) + assert config == DUMMY_MODEL_CONFIG + + def test_get_model_config_with_dataclass(self): + config = BaseTuner.get_model_config(ModelWithDataclassConfig()) + assert config == MockModelConfig.config + + +class TestBaseTunerWarnForTiedEmbeddings: + model_id = "HuggingFaceH4/tiny-random-LlamaForCausalLM" + warn_end_inject = "huggingface/peft/issues/2018." 
+ warn_end_merge = ( + "# Now use the original model but in untied format\n" + "model = AutoModelForCausalLM.from_pretrained(untied_model_dir)\n```\n" + ) + + def _get_peft_model(self, tie_word_embeddings, target_module): + with hub_online_once(self.model_id): + base_model = AutoModelForCausalLM.from_pretrained(self.model_id, tie_word_embeddings=tie_word_embeddings) + model = get_peft_model( + base_model, + LoraConfig(target_modules=[target_module]), + ) + return model + + def _is_warn_triggered(self, warning_list, endswith): + return any(str(warning.message).endswith(endswith) for warning in warning_list) + + def test_warn_for_tied_embeddings_inject(self, recwarn): + self._get_peft_model(tie_word_embeddings=True, target_module="lm_head") + assert self._is_warn_triggered(recwarn.list, self.warn_end_inject) + + def test_warn_for_tied_embeddings_merge(self, recwarn): + model = self._get_peft_model(tie_word_embeddings=True, target_module="lm_head") + model.merge_and_unload() + assert self._is_warn_triggered(recwarn.list, self.warn_end_merge) + + def test_no_warn_for_untied_embeddings_inject(self, recwarn): + self._get_peft_model(tie_word_embeddings=False, target_module="lm_head") + assert not self._is_warn_triggered(recwarn.list, self.warn_end_inject) + + def test_no_warn_for_untied_embeddings_merge(self, recwarn): + model_not_tied = self._get_peft_model(tie_word_embeddings=False, target_module="lm_head") + model_not_tied.merge_and_unload() + assert not self._is_warn_triggered(recwarn.list, self.warn_end_merge) + + def test_no_warn_for_no_target_module_inject(self, recwarn): + self._get_peft_model(tie_word_embeddings=True, target_module="q_proj") + assert not self._is_warn_triggered(recwarn.list, self.warn_end_inject) + + def test_no_warn_for_no_target_module_merge(self, recwarn): + model_no_target_module = self._get_peft_model(tie_word_embeddings=True, target_module="q_proj") + model_no_target_module.merge_and_unload() + assert not 
self._is_warn_triggered(recwarn.list, self.warn_end_merge) + + +class TestFindMinimalTargetModules: + @pytest.mark.parametrize( + "target_modules, other_module_names, expected", + [ + (["bar"], [], {"bar"}), + (["foo"], ["bar"], {"foo"}), + (["1.foo", "2.foo"], ["3.foo", "4.foo"], {"1.foo", "2.foo"}), + # Could also return "bar.baz" but we want the shorter one + (["bar.baz"], ["foo.bar"], {"baz"}), + (["1.foo", "2.foo", "bar.baz"], ["3.foo", "bar.bla"], {"1.foo", "2.foo", "baz"}), + # Case with longer suffix chains and nested suffixes + (["a.b.c", "d.e.f", "g.h.i"], ["j.k.l", "m.n.o"], {"c", "f", "i"}), + (["a.b.c", "d.e.f", "g.h.i"], ["a.b.x", "d.x.f", "x.h.i"], {"c", "e.f", "g.h.i"}), + # Case with multiple items that can be covered by a single suffix + (["foo.bar.baz", "qux.bar.baz"], ["baz.bar.foo"], {"baz"}), + # Realistic examples + # Only match k_proj + ( + ["model.decoder.layers.{i}.self_attn.k_proj" for i in range(12)], + ( + ["model.decoder.layers.{i}.self_attn" for i in range(12)] + + ["model.decoder.layers.{i}.self_attn.v_proj" for i in range(12)] + + ["model.decoder.layers.{i}.self_attn.q_proj" for i in range(12)] + ), + {"k_proj"}, + ), + # Match all k_proj except the one in layer 5 => no common suffix + ( + ["model.decoder.layers.{i}.self_attn.k_proj" for i in range(12) if i != 5], + ( + ["model.decoder.layers.5.self_attn.k_proj"] + + ["model.decoder.layers.{i}.self_attn" for i in range(12)] + + ["model.decoder.layers.{i}.self_attn.v_proj" for i in range(12)] + + ["model.decoder.layers.{i}.self_attn.q_proj" for i in range(12)] + ), + {"{i}.self_attn.k_proj" for i in range(12) if i != 5}, + ), + ], + ) + def test_find_minimal_target_modules(self, target_modules, other_module_names, expected): + # check all possible combinations of list and set + result = find_minimal_target_modules(target_modules, other_module_names) + assert result == expected + + result = find_minimal_target_modules(set(target_modules), other_module_names) + assert result == 
expected + + result = find_minimal_target_modules(target_modules, set(other_module_names)) + assert result == expected + + result = find_minimal_target_modules(set(target_modules), set(other_module_names)) + assert result == expected + + def test_find_minimal_target_modules_empty_raises(self): + with pytest.raises(ValueError, match="target_modules should be a list or set of strings"): + find_minimal_target_modules([], ["foo"]) + + with pytest.raises(ValueError, match="target_modules should be a list or set of strings"): + find_minimal_target_modules(set(), ["foo"]) + + def test_find_minimal_target_modules_contains_empty_string_raises(self): + target_modules = ["", "foo", "bar.baz"] + other_module_names = ["bar"] + with pytest.raises(ValueError, match="target_modules should not contain an empty string"): + find_minimal_target_modules(target_modules, other_module_names) + + def test_find_minimal_target_modules_string_raises(self): + target_modules = "foo" + other_module_names = ["bar"] + with pytest.raises(ValueError, match="target_modules should be a list or set of strings"): + find_minimal_target_modules(target_modules, other_module_names) + + @pytest.mark.parametrize( + "target_modules, other_module_names", + [ + (["foo"], ["foo"]), + (["foo.bar"], ["foo.bar"]), + (["foo.bar", "spam", "eggs"], ["foo.bar"]), + (["foo.bar", "spam"], ["foo.bar", "eggs"]), + (["foo.bar"], ["foo.bar", "spam", "eggs"]), + ], + ) + def test_find_minimal_target_modules_not_disjoint_raises(self, target_modules, other_module_names): + msg = ( + "target_modules and other_module_names contain common elements, this should not happen, please " + "open a GitHub issue at https://github.com/huggingface/peft/issues with the code to reproduce this issue" + ) + with pytest.raises(ValueError, match=msg): + find_minimal_target_modules(target_modules, other_module_names) + + def test_get_peft_model_applies_find_target_modules(self): + # Check that when calling get_peft_model, the target_module 
optimization is indeed applied if the length of + # target_modules is big enough. The resulting model itself should be unaffected. + torch.manual_seed(0) + model_id = "facebook/opt-125m" # must be big enough for optimization to trigger + with hub_online_once(model_id): + model = AutoModelForCausalLM.from_pretrained(model_id) + + # base case: specify target_modules in a minimal fashion + config = LoraConfig(init_lora_weights=False, target_modules=["q_proj", "v_proj"]) + model = get_peft_model(model, config) + + # this list contains all targeted modules listed separately + big_target_modules = [name for name, module in model.named_modules() if isinstance(module, LoraLayer)] + # sanity check + assert len(big_target_modules) > MIN_TARGET_MODULES_FOR_OPTIMIZATION + + # make a "checksum" of the model for comparison + model_check_sum_before = sum(p.sum() for p in model.parameters()) + + # strip prefix so that the names can be used as new target_modules + prefix_to_strip = "base_model.model.model." + big_target_modules = [name[len(prefix_to_strip) :] for name in big_target_modules] + + del model + + torch.manual_seed(0) + with hub_online_once(model_id): + model = AutoModelForCausalLM.from_pretrained(model_id) + # pass the big target_modules to config + config = LoraConfig(init_lora_weights=False, target_modules=big_target_modules) + model = get_peft_model(model, config) + + # check that target modules have been condensed + assert model.peft_config["default"].target_modules == {"q_proj", "v_proj"} + + # check that the resulting model is still the same + model_check_after = sum(p.sum() for p in model.parameters()) + assert model_check_sum_before == model_check_after + + def test_suffix_is_substring_of_other_suffix(self): + # This test is based on a real world bug found in diffusers. The issue was that we needed the suffix + # 'time_emb_proj' in the minimal target modules. 
However, if there already was the suffix 'proj' in the + # required_suffixes, 'time_emb_proj' would not be added because the test was `endswith(suffix)` and + # 'time_emb_proj' ends with 'proj'. The correct logic is to test if `endswith("." + suffix)`. The module names + # chosen here are only a subset of the hundreds of actual module names but this subset is sufficient to + # replicate the bug. + target_modules = [ + "down_blocks.1.attentions.0.transformer_blocks.0.ff.net.0.proj", + "mid_block.attentions.0.transformer_blocks.0.ff.net.0.proj", + "up_blocks.0.attentions.0.transformer_blocks.0.ff.net.0.proj", + "mid_block.attentions.0.proj_out", + "up_blocks.0.attentions.0.proj_out", + "down_blocks.1.attentions.0.proj_out", + "up_blocks.0.resnets.0.time_emb_proj", + "down_blocks.0.resnets.0.time_emb_proj", + "mid_block.resnets.0.time_emb_proj", + ] + other_module_names = [ + "conv_in", + "time_proj", + "time_embedding", + "time_embedding.linear_1", + "add_time_proj", + "add_embedding", + "add_embedding.linear_1", + "add_embedding.linear_2", + "down_blocks", + "down_blocks.0", + "down_blocks.0.resnets", + "down_blocks.0.resnets.0", + "up_blocks", + "up_blocks.0", + "up_blocks.0.attentions", + "up_blocks.0.attentions.0", + "up_blocks.0.attentions.0.norm", + "up_blocks.0.attentions.0.transformer_blocks", + "up_blocks.0.attentions.0.transformer_blocks.0", + "up_blocks.0.attentions.0.transformer_blocks.0.norm1", + "up_blocks.0.attentions.0.transformer_blocks.0.attn1", + ] + expected = {"time_emb_proj", "proj", "proj_out"} + result = find_minimal_target_modules(target_modules, other_module_names) + assert result == expected + + def test_get_peft_modules_module_name_is_suffix_of_another_module(self): + # Solves the following bug: + # https://github.com/huggingface/diffusers/pull/9622#issuecomment-2404789721 + + # The cause for the bug is as follows: When we have, say, a module called "bar.0.query" that we want to target + # and another module called "foo_bar.0.query" that 
we don't want to target, there was potential for an error. + # This is not caused by _find_minimal_target_modules directly, but rather the bug was inside of + # BaseTuner.inject_adapter and how the names_no_target were chosen. Those used to be chosen based on suffix. In + # our example, however, "bar.0.query" is a suffix of "foo_bar.0.query", therefore "foo_bar.0.query" was *not* + # added to names_no_target when it should have. As a consequence, during the optimization, it looks like "query" + # is safe to use as target_modules because we don't see that it wrongly matches "foo_bar.0.query". + + # ensure that we have sufficiently many modules to trigger the optimization + n_layers = MIN_TARGET_MODULES_FOR_OPTIMIZATION + 1 + + class InnerModule(nn.Module): + def __init__(self): + super().__init__() + self.query = nn.Linear(10, 10) + + class OuterModule(nn.Module): + def __init__(self): + super().__init__() + # note that "transformer_blocks" is a suffix of "single_transformer_blocks" + self.transformer_blocks = nn.ModuleList([InnerModule() for _ in range(n_layers)]) + self.single_transformer_blocks = nn.ModuleList([InnerModule() for _ in range(n_layers)]) + + # we want to match all "transformer_blocks" layers but not "single_transformer_blocks" + target_modules = [f"transformer_blocks.{i}.query" for i in range(n_layers)] + model = get_peft_model(OuterModule(), LoraConfig(target_modules=target_modules)) + + # sanity check: we should have n_layers PEFT layers in model.transformer_blocks + transformer_blocks = model.base_model.model.transformer_blocks + assert sum(isinstance(module, BaseTunerLayer) for module in transformer_blocks.modules()) == n_layers + + # we should not have any PEFT layers in model.single_transformer_blocks + single_transformer_blocks = model.base_model.model.single_transformer_blocks + assert not any(isinstance(module, BaseTunerLayer) for module in single_transformer_blocks.modules()) + + # target modules should *not* be simplified to "query" as 
that would match "single_transformer_blocks" too + assert model.peft_config["default"].target_modules != {"query"} + + def test_find_minimal_target_modules_does_not_error_with_ia3(self, tmp_path): + # See #2429 + # There is an issue with the compression of the target_modules attribute when using IA³. There, we additionally + # have the feedforward_modules attribute, which must be subset of target_modules. When target_modules is shrunk, + # the subset check will fail. This test ensures that this doesn't happen. + n_layers = MIN_TARGET_MODULES_FOR_OPTIMIZATION + 1 + + class InnerModule(nn.Module): + def __init__(self): + super().__init__() + self.query = nn.Linear(10, 10) + + class OuterModule(nn.Module): + def __init__(self): + super().__init__() + self.blocks = nn.ModuleList([InnerModule() for _ in range(n_layers)]) + + target_modules = [f"blocks.{i}.query" for i in range(n_layers)] + feedforward_modules = [f"blocks.{i}.query" for i in range(n_layers)] + # the subset check happens here + config = IA3Config(target_modules=target_modules, feedforward_modules=feedforward_modules) + # the optimization step happens here, after the subset check, so at first we're fine, but we will run into an + # issue after a save/load roundtrip + model = get_peft_model(OuterModule(), config) + model.save_pretrained(tmp_path) + del model + + # does not raise + PeftModel.from_pretrained(OuterModule(), tmp_path) + + +class TestRankAndAlphaPattern: + @pytest.fixture + def model(self): + # we always target the foo layers, the *bar* layers are used as a control group to ensure that they are not + # accidentally targeted + class Inner(nn.Module): + def __init__(self): + super().__init__() + self.foo = nn.Linear(1, 1) + self.barfoo = nn.Linear(1, 1) + + class Middle(nn.Module): + def __init__(self): + super().__init__() + self.foo = nn.Linear(1, 1) + self.foobar = nn.Linear(1, 1) + self.module = Inner() + + class Outer(nn.Module): + def __init__(self): + super().__init__() + self.foo = 
nn.Linear(1, 1) + self.bar = nn.Linear(1, 1) + self.module = Middle() + + # resulting model for overview: + # Outer( + # (foo): Linear(...) + # (bar): Linear(...) + # (module): Middle( + # (foo): Linear(...) + # (foobar): Linear(...) + # (module): Inner( + # (foo): Linear(...) + # (barfoo): Linear(...) + # ) + # ) + # ) + + return Outer() + + def test_no_rank_nor_alpha_pattern(self, model): + # sanity check the default case, no rank or alpha pattern + config = LoraConfig(target_modules="all-linear") + model = get_peft_model(model, config).base_model.model + # r is the default rank and alpha, thus scaling is 1.0 + assert model.foo.r["default"] == 8 + assert model.foo.scaling["default"] == 1.0 + assert model.bar.r["default"] == 8 + assert model.bar.scaling["default"] == 1.0 + assert model.module.foo.r["default"] == 8 + assert model.module.foo.scaling["default"] == 1.0 + assert model.module.foobar.r["default"] == 8 + assert model.module.foobar.scaling["default"] == 1.0 + assert model.module.module.foo.r["default"] == 8 + assert model.module.module.foo.scaling["default"] == 1.0 + assert model.module.module.barfoo.r["default"] == 8 + assert model.module.module.barfoo.scaling["default"] == 1.0 + + def test_rank_and_alpha_pattern_no_matching_keys(self, model): + # sanity check for non-matching keys, no rank or alpha pattern + config = LoraConfig(target_modules="all-linear", rank_pattern={"bla": 4, "oof": 6}, alpha_pattern={"baz": 3}) + model = get_peft_model(model, config).base_model.model + # r is the default rank and alpha, thus scaling is 1.0 + assert model.foo.r["default"] == 8 + assert model.foo.scaling["default"] == 1.0 + assert model.bar.r["default"] == 8 + assert model.bar.scaling["default"] == 1.0 + assert model.module.foo.r["default"] == 8 + assert model.module.foo.scaling["default"] == 1.0 + assert model.module.foobar.r["default"] == 8 + assert model.module.foobar.scaling["default"] == 1.0 + assert model.module.module.foo.r["default"] == 8 + assert 
model.module.module.foo.scaling["default"] == 1.0 + assert model.module.module.barfoo.r["default"] == 8 + assert model.module.module.barfoo.scaling["default"] == 1.0 + + # below, we test all permutations for rank_pattern of targeting outer, middle, and inner foo layers: + + def test_rank_pattern_target_all(self, model): + config = LoraConfig(target_modules="all-linear", rank_pattern={"foo": 16}) + model = get_peft_model(model, config).base_model.model + assert model.foo.r["default"] == 16 + assert model.bar.r["default"] == 8 + assert model.module.foo.r["default"] == 16 + assert model.module.foobar.r["default"] == 8 + assert model.module.module.foo.r["default"] == 16 + assert model.module.module.barfoo.r["default"] == 8 + + def test_rank_pattern_target_outer(self, model): + config = LoraConfig(target_modules="all-linear", rank_pattern={"^foo": 16}) + model = get_peft_model(model, config).base_model.model + assert model.foo.r["default"] == 16 + assert model.bar.r["default"] == 8 + assert model.module.foo.r["default"] == 8 + assert model.module.foobar.r["default"] == 8 + assert model.module.module.foo.r["default"] == 8 + assert model.module.module.barfoo.r["default"] == 8 + + def test_rank_pattern_target_middle(self, model): + config = LoraConfig(target_modules="all-linear", rank_pattern={"^module.foo": 16}) + model = get_peft_model(model, config).base_model.model + assert model.foo.r["default"] == 8 + assert model.bar.r["default"] == 8 + assert model.module.foo.r["default"] == 16 + assert model.module.foobar.r["default"] == 8 + assert model.module.module.foo.r["default"] == 8 + assert model.module.module.barfoo.r["default"] == 8 + + def test_rank_pattern_target_inner(self, model): + config = LoraConfig(target_modules="all-linear", rank_pattern={"module.module.foo": 16}) + model = get_peft_model(model, config).base_model.model + assert model.foo.r["default"] == 8 + assert model.bar.r["default"] == 8 + assert model.module.foo.r["default"] == 8 + assert 
model.module.foobar.r["default"] == 8 + assert model.module.module.foo.r["default"] == 16 + assert model.module.module.barfoo.r["default"] == 8 + + def test_rank_pattern_target_inner_with_caret(self, model): + # same as before, but using the caret in the regex should also work + config = LoraConfig(target_modules="all-linear", rank_pattern={"^module.module.foo": 16}) + model = get_peft_model(model, config).base_model.model + assert model.foo.r["default"] == 8 + assert model.bar.r["default"] == 8 + assert model.module.foo.r["default"] == 8 + assert model.module.foobar.r["default"] == 8 + assert model.module.module.foo.r["default"] == 16 + assert model.module.module.barfoo.r["default"] == 8 + + def test_rank_pattern_target_middle_inner(self, model): + config = LoraConfig(target_modules="all-linear", rank_pattern={"module.foo": 16}) + model = get_peft_model(model, config).base_model.model + assert model.foo.r["default"] == 8 + assert model.bar.r["default"] == 8 + assert model.module.foo.r["default"] == 16 + assert model.module.foobar.r["default"] == 8 + assert model.module.module.foo.r["default"] == 16 + assert model.module.module.barfoo.r["default"] == 8 + + def test_rank_pattern_target_middle_inner_different_ranks(self, model): + # same layers targeted as in previous test, but with different ranks + config = LoraConfig(target_modules="all-linear", rank_pattern={"^module.foo": 16, "^module.module.foo": 24}) + model = get_peft_model(model, config).base_model.model + assert model.foo.r["default"] == 8 + assert model.bar.r["default"] == 8 + assert model.module.foo.r["default"] == 16 + assert model.module.foobar.r["default"] == 8 + assert model.module.module.foo.r["default"] == 24 + assert model.module.module.barfoo.r["default"] == 8 + + def test_rank_pattern_target_outer_middle(self, model): + config = LoraConfig(target_modules="all-linear", rank_pattern={"^foo": 16, "^module.foo": 24}) + model = get_peft_model(model, config).base_model.model + assert 
model.foo.r["default"] == 16 + assert model.bar.r["default"] == 8 + assert model.module.foo.r["default"] == 24 + assert model.module.foobar.r["default"] == 8 + assert model.module.module.foo.r["default"] == 8 + assert model.module.module.barfoo.r["default"] == 8 + + def test_rank_pattern_target_outer_inner(self, model): + config = LoraConfig(target_modules="all-linear", rank_pattern={"^foo": 16, "module.module.foo": 24}) + model = get_peft_model(model, config).base_model.model + assert model.foo.r["default"] == 16 + assert model.bar.r["default"] == 8 + assert model.module.foo.r["default"] == 8 + assert model.module.foobar.r["default"] == 8 + assert model.module.module.foo.r["default"] == 24 + assert model.module.module.barfoo.r["default"] == 8 + + def test_rank_pattern_target_outer_inner_with_caret(self, model): + # same as before, but using the caret in the regex should also work + config = LoraConfig(target_modules="all-linear", rank_pattern={"^foo": 16, "^module.module.foo": 24}) + model = get_peft_model(model, config).base_model.model + assert model.foo.r["default"] == 16 + assert model.bar.r["default"] == 8 + assert model.module.foo.r["default"] == 8 + assert model.module.foobar.r["default"] == 8 + assert model.module.module.foo.r["default"] == 24 + assert model.module.module.barfoo.r["default"] == 8 + + def test_rank_pattern_target_outer_middle_inner_with_caret(self, model): + # indicate each layer with a different rank and use the caret in the regex + config = LoraConfig( + target_modules="all-linear", rank_pattern={"^foo": 16, "^module.foo": 24, "^module.module.foo": 32} + ) + model = get_peft_model(model, config).base_model.model + assert model.foo.r["default"] == 16 + assert model.bar.r["default"] == 8 + assert model.module.foo.r["default"] == 24 + assert model.module.foobar.r["default"] == 8 + assert model.module.module.foo.r["default"] == 32 + assert model.module.module.barfoo.r["default"] == 8 + + def 
test_rank_pattern_target_outer_middle_inner_with_caret_dict_order(self, model): + # same as before, but change the order of the rank_pattern dict + config = LoraConfig( + target_modules="all-linear", rank_pattern={"^module.module.foo": 32, "^module.foo": 24, "^foo": 16} + ) + model = get_peft_model(model, config).base_model.model + assert model.foo.r["default"] == 16 + assert model.bar.r["default"] == 8 + assert model.module.foo.r["default"] == 24 + assert model.module.foobar.r["default"] == 8 + assert model.module.module.foo.r["default"] == 32 + assert model.module.module.barfoo.r["default"] == 8 + + # below, we test all permutations for alpha_pattern of targeting outer, middle, and inner foo layers: + # these tests are analogous to the rank_pattern tests above + + def test_alpha_pattern_target_all(self, model): + config = LoraConfig(target_modules="all-linear", alpha_pattern={"foo": 4}) + model = get_peft_model(model, config).base_model.model + assert model.foo.scaling["default"] == 0.5 + assert model.bar.scaling["default"] == 1.0 + assert model.module.foo.scaling["default"] == 0.5 + assert model.module.foobar.scaling["default"] == 1.0 + assert model.module.module.foo.scaling["default"] == 0.5 + assert model.module.module.barfoo.scaling["default"] == 1.0 + + def test_alpha_pattern_target_outer(self, model): + config = LoraConfig(target_modules="all-linear", alpha_pattern={"^foo": 4}) + model = get_peft_model(model, config).base_model.model + assert model.foo.scaling["default"] == 0.5 + assert model.bar.scaling["default"] == 1.0 + assert model.module.foo.scaling["default"] == 1.0 + assert model.module.foobar.scaling["default"] == 1.0 + assert model.module.module.foo.scaling["default"] == 1.0 + assert model.module.module.barfoo.scaling["default"] == 1.0 + + def test_alpha_pattern_target_middle(self, model): + config = LoraConfig(target_modules="all-linear", alpha_pattern={"^module.foo": 4}) + model = get_peft_model(model, config).base_model.model + assert 
model.foo.scaling["default"] == 1.0 + assert model.bar.scaling["default"] == 1.0 + assert model.module.foo.scaling["default"] == 0.5 + assert model.module.foobar.scaling["default"] == 1.0 + assert model.module.module.foo.scaling["default"] == 1.0 + assert model.module.module.barfoo.scaling["default"] == 1.0 + + def test_alpha_pattern_target_inner(self, model): + config = LoraConfig(target_modules="all-linear", alpha_pattern={"module.module.foo": 4}) + model = get_peft_model(model, config).base_model.model + assert model.foo.scaling["default"] == 1.0 + assert model.bar.scaling["default"] == 1.0 + assert model.module.foo.scaling["default"] == 1.0 + assert model.module.foobar.scaling["default"] == 1.0 + assert model.module.module.foo.scaling["default"] == 0.5 + assert model.module.module.barfoo.scaling["default"] == 1.0 + + def test_alpha_pattern_target_inner_with_caret(self, model): + # same as before, but using the caret in the regex should also work + config = LoraConfig(target_modules="all-linear", alpha_pattern={"^module.module.foo": 4}) + model = get_peft_model(model, config).base_model.model + assert model.foo.scaling["default"] == 1.0 + assert model.bar.scaling["default"] == 1.0 + assert model.module.foo.scaling["default"] == 1.0 + assert model.module.foobar.scaling["default"] == 1.0 + assert model.module.module.foo.scaling["default"] == 0.5 + assert model.module.module.barfoo.scaling["default"] == 1.0 + + def test_alpha_pattern_target_middle_inner(self, model): + config = LoraConfig(target_modules="all-linear", alpha_pattern={"module.foo": 4}) + model = get_peft_model(model, config).base_model.model + assert model.foo.scaling["default"] == 1.0 + assert model.bar.scaling["default"] == 1.0 + assert model.module.foo.scaling["default"] == 0.5 + assert model.module.foobar.scaling["default"] == 1.0 + assert model.module.module.foo.scaling["default"] == 0.5 + assert model.module.module.barfoo.scaling["default"] == 1.0 + + def 
test_alpha_pattern_target_middle_inner_different_alphas(self, model): + # same layers targeted as in previous test, but with different alphas + config = LoraConfig(target_modules="all-linear", alpha_pattern={"^module.foo": 4, "^module.module.foo": 2}) + model = get_peft_model(model, config).base_model.model + assert model.foo.scaling["default"] == 1.0 + assert model.bar.scaling["default"] == 1.0 + assert model.module.foo.scaling["default"] == 0.5 + assert model.module.foobar.scaling["default"] == 1.0 + assert model.module.module.foo.scaling["default"] == 0.25 + assert model.module.module.barfoo.scaling["default"] == 1.0 + + def test_alpha_pattern_target_outer_middle(self, model): + config = LoraConfig(target_modules="all-linear", alpha_pattern={"^foo": 4, "^module.foo": 2}) + model = get_peft_model(model, config).base_model.model + assert model.foo.scaling["default"] == 0.5 + assert model.bar.scaling["default"] == 1.0 + assert model.module.foo.scaling["default"] == 0.25 + assert model.module.foobar.scaling["default"] == 1.0 + assert model.module.module.foo.scaling["default"] == 1.0 + assert model.module.module.barfoo.scaling["default"] == 1.0 + + def test_alpha_pattern_target_outer_inner(self, model): + config = LoraConfig(target_modules="all-linear", alpha_pattern={"^foo": 4, "module.module.foo": 2}) + model = get_peft_model(model, config).base_model.model + assert model.foo.scaling["default"] == 0.5 + assert model.bar.scaling["default"] == 1.0 + assert model.module.foo.scaling["default"] == 1.0 + assert model.module.foobar.scaling["default"] == 1.0 + assert model.module.module.foo.scaling["default"] == 0.25 + assert model.module.module.barfoo.scaling["default"] == 1.0 + + def test_alpha_pattern_target_outer_inner_with_caret(self, model): + # same as before, but using the caret in the regex should also work + config = LoraConfig(target_modules="all-linear", alpha_pattern={"^foo": 4, "^module.module.foo": 2}) + model = get_peft_model(model, config).base_model.model 
+ assert model.foo.scaling["default"] == 0.5 + assert model.bar.scaling["default"] == 1.0 + assert model.module.foo.scaling["default"] == 1.0 + assert model.module.foobar.scaling["default"] == 1.0 + assert model.module.module.foo.scaling["default"] == 0.25 + assert model.module.module.barfoo.scaling["default"] == 1.0 + + def test_alpha_pattern_target_outer_middle_inner_with_caret(self, model): + # indicate each layer with a different alpha and use the caret in the regex + config = LoraConfig( + target_modules="all-linear", alpha_pattern={"^foo": 4, "^module.foo": 2, "^module.module.foo": 1} + ) + model = get_peft_model(model, config).base_model.model + assert model.foo.scaling["default"] == 0.5 + assert model.bar.scaling["default"] == 1.0 + assert model.module.foo.scaling["default"] == 0.25 + assert model.module.foobar.scaling["default"] == 1.0 + assert model.module.module.foo.scaling["default"] == 0.125 + assert model.module.module.barfoo.scaling["default"] == 1.0 + + def test_alpha_pattern_target_outer_middle_inner_with_caret_dict_order(self, model): + # same as before, but change the order of the alpha_pattern dict + config = LoraConfig( + target_modules="all-linear", alpha_pattern={"^module.module.foo": 1, "^module.foo": 2, "^foo": 4} + ) + model = get_peft_model(model, config).base_model.model + assert model.foo.scaling["default"] == 0.5 + assert model.bar.scaling["default"] == 1.0 + assert model.module.foo.scaling["default"] == 0.25 + assert model.module.foobar.scaling["default"] == 1.0 + assert model.module.module.foo.scaling["default"] == 0.125 + assert model.module.module.barfoo.scaling["default"] == 1.0 diff --git a/peft/tests/test_vblora.py b/peft/tests/test_vblora.py new file mode 100644 index 0000000000000000000000000000000000000000..4a4801cab7904e68660a92695d0f05563d37d672 --- /dev/null +++ b/peft/tests/test_vblora.py @@ -0,0 +1,269 @@ +# Copyright 2024-present the HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import pytest +import torch +from accelerate.utils.imports import is_bf16_available +from safetensors import safe_open +from torch import nn + +from peft import PeftModel, VBLoRAConfig, get_peft_model + + +class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.relu = nn.ReLU() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.lin1 = nn.Linear(20, 20, bias=bias) # lin1 and lin2 have same shape + self.lin2 = nn.Linear(20, 20, bias=bias) + self.lin3 = nn.Linear(20, 2, bias=bias) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = self.lin0(X) + X = self.relu(X) + X = self.lin1(X) + X = self.relu(X) + X = self.lin2(X) + X = self.relu(X) + X = self.lin3(X) + X = self.sm(X) + return X + + +class TestVBLoRA: + def get_mlp(self): + model = MLP() + return model + + def test_vblora_parameters(self): + mlp = self.get_mlp() + vector_length = 2 + num_vectors = 10 + config = VBLoRAConfig( + target_modules=["lin0", "lin1", "lin3"], vector_length=vector_length, num_vectors=num_vectors + ) + mlp_vblora = get_peft_model(mlp, config) + + vector_bank = mlp_vblora.vblora_vector_bank["default"] + + vblora_lin0_logits_B = mlp_vblora.lin0.vblora_logits_B["default"] + assert vblora_lin0_logits_B.shape == (mlp.lin0.out_features // vector_length, config.r, num_vectors) + + vblora_lin1_logits_A = mlp_vblora.lin1.vblora_logits_A["default"] + assert vblora_lin1_logits_A.shape == (config.r, 
mlp.lin1.in_features // vector_length, num_vectors) + + vblora_lin3_logits_A = mlp_vblora.lin3.vblora_logits_A["default"] + assert vblora_lin3_logits_A.shape == (config.r, mlp.lin3.in_features // vector_length, num_vectors) + + assert vector_bank.shape == (num_vectors, vector_length) + + # test if the vector bank is shared across the layers + assert ( + mlp_vblora.lin0.vblora_vector_bank["default"].data_ptr() + == mlp_vblora.lin3.vblora_vector_bank["default"].data_ptr() + ) + assert mlp_vblora.lin1.vblora_vector_bank["default"].data_ptr() == vector_bank.data_ptr() + + # should not raise + input = torch.randn(5, 10) + mlp_vblora(input) + + def test_save_with_topk_weights(self, tmp_path): + torch.manual_seed(0) + mlp = self.get_mlp() + vector_length = 2 + num_vectors = 10 + topk = 2 + config = VBLoRAConfig( + target_modules=["lin0", "lin3"], + topk=topk, + vector_length=vector_length, + num_vectors=num_vectors, + save_only_topk_weights=True, + ) + mlp_vblora = get_peft_model(mlp, config) + save_path = tmp_path / "vblora" + mlp_vblora.save_pretrained(save_path) + assert os.path.exists(save_path / "adapter_model.safetensors") + + adapter_model_dict = {} + with safe_open(save_path / "adapter_model.safetensors", framework="pt") as f: + for k in f.keys(): + adapter_model_dict[k] = f.get_tensor(k) + assert "base_model.model.lin0.vblora_logits_A_topk_indices" in adapter_model_dict + assert "base_model.model.lin0.vblora_logits_A_topk_weights" in adapter_model_dict + assert "base_model.model.lin3.vblora_logits_B_topk_indices" in adapter_model_dict + assert "base_model.model.lin3.vblora_logits_B_topk_weights" in adapter_model_dict + assert "base_model.model.lin0.vblora_logits_A" not in adapter_model_dict + assert "base_model.model.lin3.vblora_logits_B" not in adapter_model_dict + + assert adapter_model_dict["base_model.model.lin0.vblora_logits_B_topk_indices"].shape == ( + mlp.lin0.out_features // vector_length, + config.r, + topk, + ) + assert 
adapter_model_dict["base_model.model.lin0.vblora_logits_B_topk_weights"].shape == ( + mlp.lin0.out_features // vector_length, + config.r, + topk - 1, + ) + assert adapter_model_dict["base_model.model.lin3.vblora_logits_A_topk_indices"].shape == ( + config.r, + mlp.lin3.in_features // vector_length, + topk, + ) + assert adapter_model_dict["base_model.model.lin3.vblora_logits_A_topk_weights"].shape == ( + config.r, + mlp.lin3.in_features // vector_length, + topk - 1, + ) + + @pytest.mark.parametrize("save_only_topk_weights", [True, False]) + def test_save_load(self, save_only_topk_weights, tmp_path): + torch.manual_seed(0) + mlp = self.get_mlp() + config = VBLoRAConfig( + target_modules=["lin0", "lin1", "lin3"], + topk=2, + vector_length=2, + num_vectors=10, + save_only_topk_weights=save_only_topk_weights, + ) + mlp_vblora = get_peft_model(mlp, config) + save_path = tmp_path / "vblora" + mlp_vblora.save_pretrained(save_path) + assert os.path.exists(save_path / "adapter_config.json") + + del mlp + torch.manual_seed(0) # make sure the base model has the same weights + mlp = self.get_mlp() + mlp_vblora_loaded = PeftModel.from_pretrained(mlp, save_path) + + input = torch.randn(5, 10) + output = mlp_vblora(input) + output_loaded = mlp_vblora_loaded(input) + assert torch.allclose(output, output_loaded, atol=1e-8, rtol=1e-5) + + def test_resume_training_model_with_topk_weights(self, tmp_path): + torch.manual_seed(1) + mlp = self.get_mlp() + config = VBLoRAConfig( + target_modules=["lin0", "lin1", "lin3"], + topk=2, + vector_length=2, + num_vectors=10, + save_only_topk_weights=True, + ) + mlp_vblora = get_peft_model(mlp, config) + save_path = tmp_path / "vblora" + mlp_vblora.save_pretrained(save_path) + + input = torch.randn(5, 10) + mlp_vblora.train() + # should not raise + mlp_vblora(input) + + del mlp + torch.manual_seed(1) + mlp = self.get_mlp() + mlp_vblora_loaded = PeftModel.from_pretrained(mlp, save_path) + mlp_vblora_loaded.train() + msg = "Found infinity values in 
VB-LoRA logits. Ensure training was not resumed from a `save_only_topk_weights` model." + with pytest.raises(RuntimeError, match=msg): + mlp_vblora_loaded(input) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) + def test_vblora_dtypes(self, dtype): + mlp = self.get_mlp() + if dtype == torch.bfloat16: + if not is_bf16_available(): + pytest.skip("bfloat16 not supported on this system, skipping the test") + + config = VBLoRAConfig( + target_modules=["lin0", "lin1", "lin3"], vector_length=2, num_vectors=10, save_only_topk_weights=False + ) + mlp_vblora = get_peft_model(mlp.to(dtype), config) + inputs = torch.randn(5, 10).to(dtype) + output = mlp_vblora(inputs) # should not raise + assert output.dtype == dtype + + def test_vblora_nb_savable_params_only_topk_weights(self): + mlp = self.get_mlp() + vector_length = 2 + num_vectors = 10 + topk = 2 + r = 4 + config = VBLoRAConfig( + target_modules=["lin0", "lin1"], + vector_length=vector_length, + num_vectors=num_vectors, + topk=topk, + r=r, + save_only_topk_weights=True, + ) + mlp_vblora = get_peft_model(mlp, config) + + mlp_vblora.lin3.requires_grad_(True) # set lin3 to trainable + + adapter_params, other_params = mlp_vblora.get_nb_savable_parameters() + factor = 0.25 # dtype of index is uint8 + topk_indices_parameter = int( + (mlp.lin0.out_features + mlp.lin0.in_features + mlp.lin1.out_features + mlp.lin1.in_features) + / vector_length + * r + * topk + * factor + ) + topk_weights_parameter = int( + (mlp.lin0.out_features + mlp.lin0.in_features + mlp.lin1.out_features + mlp.lin1.in_features) + / vector_length + * r + * (topk - 1) + ) + vector_bank_parameter = num_vectors * vector_length + assert adapter_params == topk_indices_parameter + topk_weights_parameter + vector_bank_parameter + assert other_params == (mlp.lin3.in_features + 1) * mlp.lin3.out_features + + def test_vblora_nb_savable_params_all_logits(self): + mlp = self.get_mlp() + vector_length = 2 + num_vectors = 10 + topk = 2 
+ r = 4 + config = VBLoRAConfig( + target_modules=["lin0", "lin1"], + vector_length=vector_length, + num_vectors=num_vectors, + topk=topk, + r=r, + save_only_topk_weights=False, + ) + mlp_vblora = get_peft_model(mlp, config) + + mlp_vblora.lin3.requires_grad_(True) # set lin3 to trainable + + adapter_params, other_params = mlp_vblora.get_nb_savable_parameters() + logits_parameter = int( + (mlp.lin0.out_features + mlp.lin0.in_features + mlp.lin1.out_features + mlp.lin1.in_features) + / vector_length + * r + * num_vectors + ) + vector_bank_parameter = num_vectors * vector_length + assert adapter_params == logits_parameter + vector_bank_parameter + assert other_params == (mlp.lin3.in_features + 1) * mlp.lin3.out_features diff --git a/peft/tests/test_vera.py b/peft/tests/test_vera.py new file mode 100644 index 0000000000000000000000000000000000000000..717dfb270aa823deccb28e822b6655e637b5b6be --- /dev/null +++ b/peft/tests/test_vera.py @@ -0,0 +1,298 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This test file is for tests specific to VeRA, since VeRA has some specific challenges due to the shared weights. 
+ +import os + +import pytest +import torch +from accelerate.utils.imports import is_bf16_available +from safetensors import safe_open +from torch import nn + +from peft import PeftModel, VeraConfig, get_peft_model + + +class MLP(nn.Module): + def __init__(self, bias=True): + super().__init__() + self.relu = nn.ReLU() + self.lin0 = nn.Linear(10, 20, bias=bias) + self.lin1 = nn.Linear(20, 20, bias=bias) # lin1 and lin2 have same shape + self.lin2 = nn.Linear(20, 20, bias=bias) + self.lin3 = nn.Linear(20, 2, bias=bias) + self.sm = nn.LogSoftmax(dim=-1) + + def forward(self, X): + X = self.lin0(X) + X = self.relu(X) + X = self.lin1(X) + X = self.relu(X) + X = self.lin2(X) + X = self.relu(X) + X = self.lin3(X) + X = self.sm(X) + return X + + +class TestVera: + @pytest.fixture + def mlp(self): + torch.manual_seed(0) + model = MLP() + return model + + @pytest.fixture + def mlp_same_prng(self, mlp): + torch.manual_seed(0) + + config = VeraConfig(target_modules=["lin1", "lin2"], init_weights=False) + # creates a default VeRA adapter + peft_model = get_peft_model(mlp, config) + config2 = VeraConfig(target_modules=["lin1", "lin2"], init_weights=False) + peft_model.add_adapter("other", config2) + return peft_model + + def test_multiple_adapters_same_prng_weights(self, mlp_same_prng): + # we can have multiple adapters with the same prng key, in which case the weights should be shared + assert ( + mlp_same_prng.base_model.model.lin1.vera_A["default"] + is mlp_same_prng.base_model.model.lin1.vera_A["other"] + ) + assert ( + mlp_same_prng.base_model.model.lin1.vera_B["default"] + is mlp_same_prng.base_model.model.lin1.vera_B["other"] + ) + assert ( + mlp_same_prng.base_model.model.lin2.vera_A["default"] + is mlp_same_prng.base_model.model.lin2.vera_A["other"] + ) + assert ( + mlp_same_prng.base_model.model.lin2.vera_B["default"] + is mlp_same_prng.base_model.model.lin2.vera_B["other"] + ) + + input = torch.randn(5, 10) + mlp_same_prng.set_adapter("default") + output_default = 
mlp_same_prng(input) + mlp_same_prng.set_adapter("other") + output_other = mlp_same_prng(input) + assert not torch.allclose(output_default, output_other, atol=1e-3, rtol=1e-3) + + def test_multiple_adapters_different_prng_raises(self): + # we cannot have multiple adapters with different prng keys + model = MLP() + config = VeraConfig(target_modules=["lin1", "lin2"], init_weights=False) + # creates a default VeRA adapter + peft_model = get_peft_model(model, config) + config2 = VeraConfig(target_modules=["lin1", "lin2"], init_weights=False, projection_prng_key=123) + + msg = ( + r"Vera PRNG initialisation key must be the same for all adapters. Got config.projection_prng_key=123 but " + r"previous config had 0" + ) + with pytest.raises(ValueError, match=msg): + peft_model.add_adapter("other", config2) + + def test_multiple_adapters_save_load_save_projection_true(self, mlp_same_prng, tmp_path): + # check saving and loading works with multiple adapters and saved projection weights + torch.manual_seed(0) + input = torch.randn(5, 10) + mlp_same_prng.set_adapter("default") + output_default = mlp_same_prng(input) + mlp_same_prng.set_adapter("other") + output_other = mlp_same_prng(input) + + # sanity check + assert not torch.allclose(output_default, output_other, atol=1e-3, rtol=1e-3) + + save_path = tmp_path / "vera" + mlp_same_prng.save_pretrained(save_path) + assert os.path.exists(save_path / "adapter_config.json") + assert os.path.exists(save_path / "other" / "adapter_config.json") + + torch.manual_seed(0) + mlp = MLP() + peft_model = PeftModel.from_pretrained(mlp, save_path) + peft_model.load_adapter(save_path / "other", "other") + + peft_model.set_adapter("default") + output_default_loaded = peft_model(input) + peft_model.set_adapter("other") + output_other_loaded = peft_model(input) + + assert torch.allclose(output_default, output_default_loaded, atol=1e-3, rtol=1e-3) + assert torch.allclose(output_other, output_other_loaded, atol=1e-3, rtol=1e-3) + + def 
test_multiple_adapters_save_load_save_projection_false(self, mlp, tmp_path): + # check saving and loading works with multiple adapters without saved projection weights + torch.manual_seed(1) + config = VeraConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + # creates a default VeRA adapter + peft_model = get_peft_model(mlp, config, adapter_name="first") + config2 = VeraConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + peft_model.add_adapter("second", config2) + + input = torch.randn(5, 10) + peft_model.set_adapter("first") + output_first = peft_model(input) + peft_model.set_adapter("second") + output_second = peft_model(input) + + # sanity check + assert not torch.allclose(output_first, output_second, atol=1e-3, rtol=1e-3) + + save_path = tmp_path / "vera" + peft_model.save_pretrained(save_path) + assert os.path.exists(save_path / "first" / "adapter_config.json") + assert os.path.exists(save_path / "second" / "adapter_config.json") + + torch.manual_seed(0) + mlp = MLP() + peft_model = PeftModel.from_pretrained(mlp, save_path / "first", adapter_name="first") + peft_model.load_adapter(save_path / "second", "second") + + peft_model.set_adapter("first") + output_first_loaded = peft_model(input) + peft_model.set_adapter("second") + output_second_loaded = peft_model(input) + + assert torch.allclose(output_first, output_first_loaded, atol=1e-3, rtol=1e-3) + assert torch.allclose(output_second, output_second_loaded, atol=1e-3, rtol=1e-3) + + def test_multiple_adapters_save_projection_true_contains_vera_A_vera_B(self, mlp_same_prng, tmp_path): + # check that the state_dicts don't contain the projection weights + save_path = tmp_path / "vera" + mlp_same_prng.save_pretrained(save_path) + + sd_default = {} + with safe_open(save_path / "adapter_model.safetensors", framework="pt", device="cpu") as f: + for key in f.keys(): + sd_default[key] = f.get_tensor(key) + + assert any("vera_A" in key for key in 
sd_default) + assert any("vera_B" in key for key in sd_default) + # default rank for VeRA is 256 + assert sd_default["base_model.vera_A"].shape == (256, 20) + assert sd_default["base_model.vera_B"].shape == (20, 256) + + sd_other = {} + with safe_open(save_path / "other" / "adapter_model.safetensors", framework="pt", device="cpu") as f: + for key in f.keys(): + sd_other[key] = f.get_tensor(key) + + assert any("vera_A" in key for key in sd_other) + assert any("vera_B" in key for key in sd_other) + assert sd_other["base_model.vera_A"].shape == (256, 20) + assert sd_other["base_model.vera_B"].shape == (20, 256) + + def test_multiple_adapters_save_projection_false_contains_no_vera_A_vera_B(self, mlp, tmp_path): + torch.manual_seed(1) + config = VeraConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + # creates a default VeRA adapter + peft_model = get_peft_model(mlp, config, adapter_name="first") + config2 = VeraConfig(target_modules=["lin1", "lin2"], init_weights=False, save_projection=False) + peft_model.add_adapter("second", config2) + + save_path = tmp_path / "vera" + peft_model.save_pretrained(save_path) + + sd_default = {} + with safe_open(save_path / "first" / "adapter_model.safetensors", framework="pt", device="cpu") as f: + for key in f.keys(): + sd_default[key] = f.get_tensor(key) + + assert not any("vera_A" in key for key in sd_default) + assert not any("vera_B" in key for key in sd_default) + + sd_other = {} + with safe_open(save_path / "second" / "adapter_model.safetensors", framework="pt", device="cpu") as f: + for key in f.keys(): + sd_other[key] = f.get_tensor(key) + + assert not any("vera_A" in key for key in sd_other) + assert not any("vera_B" in key for key in sd_other) + + def test_vera_A_vera_B_share_memory(self, mlp_same_prng): + vera_A = mlp_same_prng.vera_A["default"] + vera_B = mlp_same_prng.vera_B["default"] + + # these tensors should share the same data + assert vera_A.data_ptr() == 
mlp_same_prng.base_model.model.lin1.vera_A["default"].data_ptr() + assert vera_B.data_ptr() == mlp_same_prng.base_model.model.lin1.vera_B["default"].data_ptr() + assert vera_A.data_ptr() == mlp_same_prng.base_model.model.lin2.vera_A["default"].data_ptr() + assert vera_B.data_ptr() == mlp_same_prng.base_model.model.lin2.vera_B["default"].data_ptr() + # sanity check: these tensors shouldn't share the same data + assert vera_A.data_ptr() != vera_B.data_ptr() + + def test_vera_lambda_dont_share_memory(self, mlp_same_prng): + # sanity check: these tensors shouldn't share the same data + assert ( + mlp_same_prng.base_model.model.lin1.vera_lambda_b["default"].data_ptr() + != mlp_same_prng.base_model.model.lin1.vera_lambda_b["other"].data_ptr() + ) + assert ( + mlp_same_prng.base_model.model.lin1.vera_lambda_b["default"].data_ptr() + != mlp_same_prng.base_model.model.lin2.vera_lambda_b["default"].data_ptr() + ) + assert ( + mlp_same_prng.base_model.model.lin1.vera_lambda_b["other"].data_ptr() + != mlp_same_prng.base_model.model.lin2.vera_lambda_b["other"].data_ptr() + ) + assert ( + mlp_same_prng.base_model.model.lin1.vera_lambda_d["default"].data_ptr() + != mlp_same_prng.base_model.model.lin1.vera_lambda_d["other"].data_ptr() + ) + assert ( + mlp_same_prng.base_model.model.lin1.vera_lambda_d["default"].data_ptr() + != mlp_same_prng.base_model.model.lin2.vera_lambda_d["default"].data_ptr() + ) + assert ( + mlp_same_prng.base_model.model.lin1.vera_lambda_d["other"].data_ptr() + != mlp_same_prng.base_model.model.lin2.vera_lambda_d["other"].data_ptr() + ) + + def test_vera_different_shapes(self, mlp): + config = VeraConfig(target_modules=["lin0", "lin3"], init_weights=False) + mlp_different_shapes = get_peft_model(mlp, config) + + vera_A = mlp_different_shapes.vera_A["default"] + vera_B = mlp_different_shapes.vera_B["default"] + + # sanity check + assert mlp.lin0.base_layer.weight.shape != mlp.lin3.base_layer.weight.shape + + # lin0 has the largest output dimension, lin3 has 
the largest input dimension + # vera_A should have the shape of (rank, largest_in), vera_B should have the shape of (largest_out, rank) + assert vera_A.shape == (config.r, mlp.lin3.in_features) + assert vera_B.shape == (mlp.lin0.out_features, config.r) + + # should not raise + input = torch.randn(5, 10) + mlp_different_shapes(input) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) + def test_vera_dtypes(self, dtype): + if dtype == torch.bfloat16: + # skip if bf16 is not supported on hardware, see #1872 + if not is_bf16_available(): + pytest.skip("bfloat16 not supported on this system, skipping the test") + + model = MLP().to(dtype) + config = VeraConfig(target_modules=["lin1", "lin2"], init_weights=False) + peft_model = get_peft_model(model, config) + inputs = torch.randn(5, 10).to(dtype) + output = peft_model(inputs) # should not raise + assert output.dtype == dtype diff --git a/peft/tests/test_vision_models.py b/peft/tests/test_vision_models.py new file mode 100644 index 0000000000000000000000000000000000000000..74e5d654de26f398f8ea2562f5471d70c3f6115c --- /dev/null +++ b/peft/tests/test_vision_models.py @@ -0,0 +1,156 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This is not a full on test suite of vision models, since we already run many tests on dummy models with Conv2d layers +# and on stable diffusion models. 
Instead, this file contains specific tests for bugs that have been found in the past. +import gc + +import numpy as np +import pytest +import torch +from accelerate.utils.memory import clear_device_cache +from safetensors.torch import load_file +from transformers import ( + AutoImageProcessor, + AutoModelForImageClassification, + AutoProcessor, + LlavaForConditionalGeneration, +) + +from peft import ( + HRAConfig, + LoHaConfig, + LoKrConfig, + LoraConfig, + OFTConfig, + PeftModel, + PrefixTuningConfig, + get_peft_model, +) + +from .testing_utils import load_cat_image + + +CONFIGS = { + "lora": LoraConfig(target_modules=["convolution"], modules_to_save=["classifier", "normalization"]), + "loha": LoHaConfig(target_modules=["convolution"], modules_to_save=["classifier", "normalization"]), + "lokr": LoKrConfig(target_modules=["convolution"], modules_to_save=["classifier", "normalization"]), + "oft": OFTConfig( + r=1, oft_block_size=0, target_modules=["convolution"], modules_to_save=["classifier", "normalization"] + ), + "hra": HRAConfig(target_modules=["convolution"], modules_to_save=["classifier", "normalization"]), + # TODO: cannot use BOFT because some convolutional kernel dimensions are even (64) and others odd (147). 
There is no + # common denominator for the boft_block_size except 1, but using 1 results in an error in the fbd_cuda kernel: + # > Error in forward_fast_block_diag_cuda_kernel: an illegal memory access was encountered + # "boft": BOFTConfig(target_modules=["convolution"], modules_to_save=["classifier", "normalization"], boft_block_size=2), +} + + +# Ensure that models like Llava that pass past_key_values automatically do not fail, see #1938 +class TestPastKV: + def test_past_kv(self): + model_id = "peft-internal-testing/tiny-LlavaForConditionalGeneration" + prompt = "USER: \nWhat are these?\nASSISTANT:" + + # prepare model and inputs + model = LlavaForConditionalGeneration.from_pretrained( + model_id, + low_cpu_mem_usage=True, + ) + processor = AutoProcessor.from_pretrained(model_id) + raw_image = np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8) + inputs = processor(text=prompt, images=raw_image, return_tensors="pt") + + # get peft model + peft_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20) + model = get_peft_model(model, peft_config) + # check that this does not raise + model(**inputs, output_hidden_states=True) + + +class TestResnet: + model_id = "hf-internal-testing/tiny-random-ResNetForImageClassification" + cat_image = load_cat_image() # for caching + + @pytest.fixture(autouse=True) + def teardown(self): + r""" + Efficient mechanism to free GPU memory after each test. 
Based on + https://github.com/huggingface/transformers/issues/21094 + """ + clear_device_cache(garbage_collection=True) + gc.collect() + + @pytest.fixture(scope="class") + def image_processor(self): + image_processor = AutoImageProcessor.from_pretrained(self.model_id) + return image_processor + + @pytest.fixture(scope="class") + def data(self, image_processor): + return image_processor(self.cat_image, return_tensors="pt") + + @pytest.mark.parametrize("config", CONFIGS.values(), ids=CONFIGS.keys()) + def test_model_with_batchnorm_reproducibility(self, config, tmp_path, data): + # see 1732 + torch.manual_seed(0) + model = AutoModelForImageClassification.from_pretrained(self.model_id) + model = get_peft_model(model, config) + + # record outputs before training + model.eval() + with torch.inference_mode(): + output_before = model(**data) + model.train() + + # train the model + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3) + batch_size = 4 + max_steps = 5 * batch_size + labels = torch.zeros(1, 3) + labels[0, 1] = 1 + for i in range(0, max_steps, batch_size): + optimizer.zero_grad() + outputs = model(**data, labels=labels) + loss = outputs.loss + loss.backward() + optimizer.step() + + # record outputs after training + model.eval() + with torch.inference_mode(): + output_after = model(**data) + assert torch.isfinite(output_after.logits).all() + atol, rtol = 1e-4, 1e-4 + # sanity check: model was updated + assert not torch.allclose(output_before.logits, output_after.logits, atol=atol, rtol=rtol) + + # check saving the model and loading it + model.save_pretrained(tmp_path) + del model + + torch.manual_seed(0) + model = AutoModelForImageClassification.from_pretrained(self.model_id) + model = PeftModel.from_pretrained(model, tmp_path).eval() + with torch.inference_mode(): + output_loaded = model(**data) + assert torch.allclose(output_after.logits, output_loaded.logits, atol=atol, rtol=rtol) + + # ensure that the checkpoint file contains the buffers + 
model_running_mean = len([k for k in model.state_dict().keys() if "running_mean" in k]) + state_dict = load_file(tmp_path / "adapter_model.safetensors") + checkpoint_running_mean = len([k for k in state_dict.keys() if "running_mean" in k]) + # note that the model has twice as many "running_mean", as there is one copy per ModulesToSaveWrapper, we need + # to multiply by 2 to get the same number + assert model_running_mean == checkpoint_running_mean * 2 diff --git a/peft/tests/test_xlora.py b/peft/tests/test_xlora.py new file mode 100644 index 0000000000000000000000000000000000000000..724e7782cbfd508f3cb59c414a8e85f9bc438a15 --- /dev/null +++ b/peft/tests/test_xlora.py @@ -0,0 +1,426 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from functools import wraps + +import huggingface_hub +import pytest +import torch +from safetensors.torch import load_file +from transformers import AutoModelForCausalLM, AutoTokenizer + +from peft import LoraConfig, PeftType, TaskType, XLoraConfig, get_peft_model +from peft.peft_model import PeftModel +from peft.tuners.xlora.layer import XLoraLayer +from peft.utils import infer_device + + +def flaky(num_tries: int): + """Decorator for test functions that are flaky""" + + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + for _ in range(num_tries): + try: + return func(*args, **kwargs) + except AssertionError as e: + print(f"Failed test {func.__name__} with error: {e}") + continue + raise AssertionError(f"Failed test {func.__name__} after {num_tries} tries") + + return wrapper + + return decorator + + +class TestXlora: + torch_device = infer_device() + + model_id = "facebook/opt-125m" + num_loras = 4 + + @pytest.fixture(scope="class") + def lora_dir(self, tmp_path_factory): + return tmp_path_factory.mktemp("lora") + + @pytest.fixture(scope="class") + def lora_embedding_dir(self, tmp_path_factory): + return tmp_path_factory.mktemp("lora_embedding") + + @pytest.fixture(scope="class") + def saved_lora_adapters(self, lora_dir): + file_names = [] + + lora_configs = [ + LoraConfig(task_type="CAUSAL_LM", target_modules=["q_proj", "v_proj"], init_lora_weights=False) + for _ in range(self.num_loras) + ] + # have 1 LoRA with different target modules + lora_configs[-1] = LoraConfig( + task_type="CAUSAL_LM", target_modules=["k_proj", "q_proj", "v_proj"], init_lora_weights=False + ) + + for i, lora_config in enumerate(lora_configs, start=1): + torch.manual_seed(i) + model = AutoModelForCausalLM.from_pretrained(self.model_id) + peft_model = get_peft_model(model, lora_config) + file_name = os.path.join(lora_dir, f"checkpoint-{i}") + peft_model.save_pretrained(file_name) + file_names.append(file_name) + return file_names + + 
@pytest.fixture(scope="class") + def saved_lora_embedding_adapters(self, lora_embedding_dir): + file_names = [] + for i in range(1, self.num_loras + 1): + torch.manual_seed(i) + lora_config = LoraConfig(task_type="CAUSAL_LM", init_lora_weights=False, target_modules=["embed_tokens"]) + model = AutoModelForCausalLM.from_pretrained(self.model_id) + peft_model = get_peft_model(model, lora_config) + file_name = os.path.join(lora_embedding_dir, f"checkpoint-{i}") + peft_model.save_pretrained(file_name) + file_names.append(file_name) + return file_names + + @pytest.fixture(scope="class") + def tokenizer(self): + tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True, device_map=self.torch_device) + return tokenizer + + @pytest.fixture(scope="function") + def embedding_model(self, saved_lora_embedding_adapters): + model = AutoModelForCausalLM.from_pretrained(self.model_id) + model.config.use_cache = False + adapters = {str(i): file_name for i, file_name in enumerate(saved_lora_embedding_adapters)} + + peft_config = XLoraConfig( + task_type=TaskType.CAUSAL_LM, + peft_type=PeftType.XLORA, + hidden_size=model.config.hidden_size, + xlora_depth=8, + adapters=adapters, + ) + model = get_peft_model(model, peft_config).to(self.torch_device) + return model + + @pytest.fixture(scope="function") + def model(self, saved_lora_adapters): + model = AutoModelForCausalLM.from_pretrained(self.model_id) + model.config.use_cache = False + adapters = {str(i): file_name for i, file_name in enumerate(saved_lora_adapters)} + + peft_config = XLoraConfig( + task_type=TaskType.CAUSAL_LM, + peft_type=PeftType.XLORA, + hidden_size=model.config.hidden_size, + xlora_depth=8, + adapters=adapters, + ) + model = get_peft_model(model, peft_config).to(self.torch_device) + return model + + @pytest.fixture(scope="function") + def model_layerwise(self, saved_lora_adapters): + model = AutoModelForCausalLM.from_pretrained(self.model_id) + model.config.use_cache = False + adapters = 
{str(i): file_name for i, file_name in enumerate(saved_lora_adapters)} + + peft_config = XLoraConfig( + task_type=TaskType.CAUSAL_LM, + peft_type=PeftType.XLORA, + hidden_size=model.config.hidden_size, + xlora_depth=8, + adapters=adapters, + layerwise_scalings=True, + ) + model = get_peft_model(model, peft_config).to(self.torch_device) + return model + + def test_functional(self, tokenizer, model): + model.enable_scalings_logging() + inputs = tokenizer.encode("Python is a", add_special_tokens=False, return_tensors="pt") + outputs = model.generate( + input_ids=inputs.to(self.torch_device), + max_new_tokens=32, + ) + assert torch.isfinite(outputs[: inputs.shape[1] :]).all() + + def test_forward_hooks_are_cleaned_up(self, tokenizer, model): + # There was an issue that forward hooks would accumulate during generation, since one hook per forward step was + # being registered and generate would call forward multiple times. This is already undesirable, but to make it + # worse, only the last hook was removed, resulting in hooks accumulating. 
+ # See https://github.com/huggingface/peft/issues/1472#issuecomment-3235817807 + inputs = tokenizer.encode("Python is a", add_special_tokens=False, return_tensors="pt") + model.generate(input_ids=inputs.to(self.torch_device), max_new_tokens=10) + num_hooks_gen1 = len(model.base_model.model.model.decoder.layers[0].self_attn.k_proj._forward_pre_hooks) + + model.generate(input_ids=inputs.to(self.torch_device), max_new_tokens=10) + num_hooks_gen2 = len(model.base_model.model.model.decoder.layers[0].self_attn.k_proj._forward_pre_hooks) + assert num_hooks_gen1 == num_hooks_gen2 == 0 + + def test_scalings_logging_methods(self, tokenizer, model): + model.enable_scalings_logging() + + inputs = tokenizer.encode("Python is a", add_special_tokens=False, return_tensors="pt") + outputs = model.generate( + input_ids=inputs.to(self.torch_device), + max_new_tokens=32, + ) + assert torch.isfinite(outputs[: inputs.shape[1] :]).all() + + _ = model.get_latest_scalings() + # 32 is the numeber of max scalings. 3 is the number of prompt tokens. 
+ assert 32 + 3 >= len(model.get_scalings_log()) > 0 + + model.disable_scalings_logging() + + inputs = tokenizer.encode("Python is a", add_special_tokens=False, return_tensors="pt") + outputs = model.generate( + input_ids=inputs.to(self.torch_device), + max_new_tokens=32, + ) + assert torch.isfinite(outputs[: inputs.shape[1] :]).all() + + assert 32 >= len(model.get_scalings_log()) > 0 + + bucketed = model.get_bucketed_scalings_log() + keys = bucketed.keys() + # Once bucket for each token as we aren't using cache + assert len(bucketed) == 32 == len(keys) + seq_len = inputs.shape[1] + for key in keys: + assert len(bucketed[key][0]) == 1 + assert len(bucketed[key][1]) == 1 + assert bucketed[key][0][0] == key - seq_len + + model.clear_scalings_log() + assert len(model.get_scalings_log()) == 0 + + def test_misc_methods(self, tokenizer, model): + model.set_global_scaling_weight(1.5) + assert model.internal_xlora_classifier.config.global_scaling_weight == 1.5 + assert model.get_global_scaling_weight() == 1.5 + + inputs = tokenizer.encode("Python is a", add_special_tokens=False, return_tensors="pt") + outputs = model.generate( + input_ids=inputs.to(self.torch_device), + max_new_tokens=32, + ) + assert torch.isfinite(outputs[: inputs.shape[1] :]).all() + + assert str(model) is not None + + # On CI (but not locally), this test is flaky since transformers v4.45.0. 
+ @flaky(num_tries=5) + def test_save_load_functional(self, tokenizer, model, tmp_path): + inputs = tokenizer.encode("Python is a", add_special_tokens=False, return_tensors="pt") + outputs = model.generate( + input_ids=inputs.to(self.torch_device), + max_new_tokens=32, + ) + before_logits = outputs[: inputs.shape[1] :] + assert torch.isfinite(before_logits).all() + + model.save_pretrained(save_directory=tmp_path) + + del model + + model = AutoModelForCausalLM.from_pretrained(self.model_id) + model.config.use_cache = False + model = PeftModel.from_pretrained(model=model, model_id=tmp_path).to(self.torch_device) + + inputs = tokenizer.encode("Python is a", add_special_tokens=False, return_tensors="pt") + outputs = model.generate( + input_ids=inputs.to(self.torch_device), + max_new_tokens=32, + ) + after_logits = outputs[: inputs.shape[1] :] + assert torch.isfinite(after_logits).all() + assert torch.equal(after_logits, before_logits) + + def test_save_load_functional_pt(self, tokenizer, model, tmp_path): + inputs = tokenizer.encode("Python is a", add_special_tokens=False, return_tensors="pt") + outputs = model.generate( + input_ids=inputs.to(self.torch_device), + max_new_tokens=32, + ) + before_logits = outputs[: inputs.shape[1] :] + assert torch.isfinite(before_logits).all() + + model.save_pretrained(save_directory=tmp_path, safe_serialization=False) + + del model + + model = AutoModelForCausalLM.from_pretrained(self.model_id) + model.config.use_cache = False + model = PeftModel.from_pretrained(model=model, model_id=tmp_path, safe_serialization=False).to( + self.torch_device + ) + + inputs = tokenizer.encode("Python is a", add_special_tokens=False, return_tensors="pt") + outputs = model.generate( + input_ids=inputs.to(self.torch_device), + max_new_tokens=32, + ) + after_logits = outputs[: inputs.shape[1] :] + assert torch.isfinite(after_logits).all() + assert torch.equal(after_logits, before_logits), (after_logits, before_logits) + + def test_topk_lora(self, 
tokenizer, model): + model.set_topk_lora(2) + assert model.internal_xlora_classifier.config.top_k_lora == 2 + + inputs = tokenizer.encode("Python is a", add_special_tokens=False, return_tensors="pt") + outputs = model.generate( + input_ids=inputs.to(self.torch_device), + max_new_tokens=32, + ) + assert torch.isfinite(outputs[: inputs.shape[1] :]).all() + + def test_softmax_topk(self, tokenizer, model): + # Just reach in to set the config + model.internal_xlora_classifier.config.top_k_lora = 2 + model.internal_xlora_classifier.config.enable_softmax = False + model.internal_xlora_classifier.config.enable_softmax_topk = True + + inputs = tokenizer.encode("Python is a", add_special_tokens=False, return_tensors="pt") + outputs = model.generate( + input_ids=inputs.to(self.torch_device), + max_new_tokens=32, + ) + assert torch.isfinite(outputs[: inputs.shape[1] :]).all() + + def test_set_override_scaling_pass_value(self, model): + # Defaults to 0 + assert model.internal_xlora_classifier.override_scaling_pass_value == 0.0 + + # Set it to 2 and make sure it actually is + model.set_scaling_pass_value(2) + assert model.internal_xlora_classifier.override_scaling_pass_value == 2 + assert model.internal_xlora_classifier.config.scaling_pass_value == 2 + + # Set it to None and make sure it is 1/n + model.set_scaling_pass_value(None) + assert model.internal_xlora_classifier.override_scaling_pass_value == 1 / self.num_loras + assert model.internal_xlora_classifier.config.scaling_pass_value == 1 / self.num_loras + + def test_functional_layerwise(self, tokenizer, model_layerwise): + model_layerwise.enable_scalings_logging() + inputs = tokenizer.encode("Python is a", add_special_tokens=False, return_tensors="pt") + outputs = model_layerwise.generate( + input_ids=inputs.to(self.torch_device), + max_new_tokens=32, + ) + assert torch.isfinite(outputs[: inputs.shape[1] :]).all() + + def test_disable_adapter(self, tokenizer, model): + model.enable_scalings_logging() + inputs = 
tokenizer.encode("Python is a", add_special_tokens=False, return_tensors="pt") + with model.disable_adapter(): + outputs_disabled = model.generate( + input_ids=inputs.to(self.torch_device), + max_new_tokens=32, + ) + outputs = model.generate( + input_ids=inputs.to(self.torch_device), + max_new_tokens=32, + ) + assert torch.isfinite(outputs_disabled[: inputs.shape[1] :]).all() + assert torch.isfinite(outputs[: inputs.shape[1] :]).all() + assert not torch.equal(outputs, outputs_disabled) + + def test_functional_embedding(self, tokenizer, embedding_model): + inputs = tokenizer.encode("Python is a", add_special_tokens=False, return_tensors="pt") + outputs = embedding_model.generate( + input_ids=inputs.to(self.torch_device), + max_new_tokens=32, + ) + assert torch.isfinite(outputs[: inputs.shape[1] :]).all() + + def test_xlora_loading_valid(self): + # This test also simulatenously tests the loading-from-hub functionality! + torch.manual_seed(123) + + model_id = "facebook/opt-125m" + model = AutoModelForCausalLM.from_pretrained(model_id) + model.config.use_cache = False + + adapters = [ + "peft-internal-testing/opt-125m-dummy-lora", + "peft-internal-testing/opt-125m-dummy-lora", + ] + adapters = {str(i): file_name for i, file_name in enumerate(adapters)} + + peft_config = XLoraConfig( + task_type=TaskType.CAUSAL_LM, + peft_type=PeftType.XLORA, + hidden_size=model.config.hidden_size, + adapters=adapters, + xlora_depth=8, + xlora_size=2048, + layerwise_scalings=True, + xlora_dropout_p=0.2, + ) + model = get_peft_model(model, peft_config) + + downloaded = huggingface_hub.hf_hub_download(repo_id=adapters["0"], filename="adapter_model.safetensors") + sd = load_file(downloaded) + w0 = model.base_model.model.model.decoder.layers[0].self_attn.q_proj.lora_A["0"].weight + w1 = sd["base_model.model.model.decoder.layers.0.self_attn.q_proj.lora_A.weight"] + + assert torch.allclose(w0, w1) + + def test_scalings_storage(self, tokenizer, model): + model.enable_scalings_logging() + 
inputs = tokenizer.encode("Python is a", add_special_tokens=False, return_tensors="pt") + outputs = model.generate( + input_ids=inputs.to(self.torch_device), + max_new_tokens=10, + ) + + latest_scalings = model.get_latest_scalings() + assert latest_scalings is not None, "get_latest_scalings() should not return None after generation" + assert isinstance(latest_scalings, torch.Tensor) + assert torch.isfinite(latest_scalings).all(), "Scalings should contain finite values" + + def test_per_token_normalization_with_softmax_topk(self, tokenizer, model, monkeypatch): + model.internal_xlora_classifier.config.top_k_lora = 2 + model.internal_xlora_classifier.config.enable_softmax = False + model.internal_xlora_classifier.config.enable_softmax_topk = True + + captured_data = [] + orig_get_maybe_topk_scalings = XLoraLayer.get_maybe_topk_scalings + + def mock_get_maybe_topk_scalings(self, scalings): + result = orig_get_maybe_topk_scalings(self, scalings) + if getattr(model, "internal_xlora_scalings", None) is not None: + captured_data.append(result) + return result + + monkeypatch.setattr(XLoraLayer, "get_maybe_topk_scalings", mock_get_maybe_topk_scalings) + + model.enable_scalings_logging() + inputs = tokenizer.encode("Test per token normalization", add_special_tokens=False, return_tensors="pt") + outputs = model.generate( + input_ids=inputs.to(self.torch_device), + max_new_tokens=1, + ) + + for scaling in captured_data: + weight_sums = scaling.sum(dim=-1) + assert torch.allclose(weight_sums, torch.ones_like(weight_sums), atol=1e-5), ( + "Per-token scaling weights are not normalized to sum to 1." + ) diff --git a/peft/tests/testing_common.py b/peft/tests/testing_common.py new file mode 100644 index 0000000000000000000000000000000000000000..9c49119bf2612946528ab5f779b1df76dace952b --- /dev/null +++ b/peft/tests/testing_common.py @@ -0,0 +1,1982 @@ +# Copyright 2023-present the HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import json +import os +import pickle +import platform +import re +import shutil +import tempfile +import warnings +from dataclasses import replace +from operator import attrgetter + +import pytest +import torch +import yaml +from diffusers import StableDiffusionPipeline +from packaging import version +from safetensors.torch import save_file + +from peft import ( + AdaLoraConfig, + BOFTConfig, + BoneConfig, + CPTConfig, + FourierFTConfig, + HRAConfig, + IA3Config, + LNTuningConfig, + LoHaConfig, + LoKrConfig, + LoraConfig, + MissConfig, + OFTConfig, + PeftModel, + PeftType, + PrefixTuningConfig, + PromptEncoderConfig, + PromptLearningConfig, + PromptTuningConfig, + RandLoraConfig, + VBLoRAConfig, + VeraConfig, + get_peft_model, + get_peft_model_state_dict, + inject_adapter_in_model, + prepare_model_for_kbit_training, +) +from peft.tuners._buffer_dict import BufferDict +from peft.tuners.lora import LoraLayer +from peft.tuners.tuners_utils import BaseTunerLayer +from peft.utils import ( + AuxiliaryTrainingWrapper, + ModulesToSaveWrapper, + TrainableTokensWrapper, + _get_submodules, + infer_device, +) + +from .testing_utils import get_state_dict, hub_online_once + + +CONFIG_TESTING_KWARGS = ( + # IA³ + { + "target_modules": None, + "feedforward_modules": None, + }, + # LoRA + { + "r": 8, + "lora_alpha": 32, + "target_modules": None, + "lora_dropout": 0.05, + "bias": "none", + }, + # prefix tuning + { + 
"num_virtual_tokens": 10, + }, + # prompt encoder + { + "num_virtual_tokens": 10, + "encoder_hidden_size": 32, + }, + # prompt tuning + { + "num_virtual_tokens": 10, + }, + # AdaLoRA + { + "target_modules": None, + "total_step": 1, + }, + # BOFT + { + "target_modules": None, + }, + # VeRA + { + "r": 8, + "target_modules": None, + "vera_dropout": 0.05, + "projection_prng_key": 0xFF, + "d_initial": 0.1, + "save_projection": True, + "bias": "none", + }, + # FourierFT + { + "n_frequency": 10, + "target_modules": None, + }, + # HRA + { + "target_modules": None, + }, + # VBLoRA + {"target_modules": None, "vblora_dropout": 0.05, "vector_length": 1, "num_vectors": 2}, + # OFT + { + "target_modules": None, + }, + # Bone + { + "target_modules": None, + "r": 2, + }, + # MiSS + { + "target_modules": None, + "r": 2, + }, + # LoRA + trainable_tokens + { + "r": 8, + "lora_alpha": 32, + "target_modules": None, + "lora_dropout": 0.05, + "bias": "none", + "trainable_token_indices": [0, 1, 3], + }, + # RandLoRA + { + "r": 32, + "randlora_alpha": 64, + "target_modules": None, + "randlora_dropout": 0.05, + "projection_prng_key": 0xFF, + "save_projection": True, + "bias": "none", + }, + # CPT tuninig + { + "cpt_token_ids": [0, 1, 2, 3, 4, 5, 6, 7], # Example token IDs for testing + "cpt_mask": [1, 1, 1, 1, 1, 1, 1, 1], + "cpt_tokens_type_mask": [1, 2, 2, 2, 3, 3, 4, 4], + }, +) + +CLASSES_MAPPING = { + "ia3": (IA3Config, CONFIG_TESTING_KWARGS[0]), + "lora": (LoraConfig, CONFIG_TESTING_KWARGS[1]), + "prefix_tuning": (PrefixTuningConfig, CONFIG_TESTING_KWARGS[2]), + "prompt_encoder": (PromptEncoderConfig, CONFIG_TESTING_KWARGS[3]), + "prompt_tuning": (PromptTuningConfig, CONFIG_TESTING_KWARGS[4]), + "adalora": (AdaLoraConfig, CONFIG_TESTING_KWARGS[5]), + "boft": (BOFTConfig, CONFIG_TESTING_KWARGS[6]), + "vera": (VeraConfig, CONFIG_TESTING_KWARGS[7]), + "fourierft": (FourierFTConfig, CONFIG_TESTING_KWARGS[8]), + "hra": (HRAConfig, CONFIG_TESTING_KWARGS[9]), + "vblora": (VBLoRAConfig, 
CONFIG_TESTING_KWARGS[10]), + "oft": (OFTConfig, CONFIG_TESTING_KWARGS[11]), + "bone": (BoneConfig, CONFIG_TESTING_KWARGS[12]), + "miss": (MissConfig, CONFIG_TESTING_KWARGS[12]), + "lora+trainable_tokens": (LoraConfig, CONFIG_TESTING_KWARGS[13]), + "randlora": (RandLoraConfig, CONFIG_TESTING_KWARGS[14]), +} + +DECODER_MODELS_EXTRA = {"cpt": (CPTConfig, CONFIG_TESTING_KWARGS[15])} + + +class PeftCommonTester: + r""" + A large testing suite for testing common functionality of the PEFT models. + + Attributes: + torch_device (`torch.device`): + The device on which the tests will be run. + transformers_class (`transformers.PreTrainedModel`): + The transformers class that is being tested. + """ + + torch_device = infer_device() + transformers_class = None + + def prepare_inputs_for_common(self): + raise NotImplementedError + + def check_modelcard(self, tmp_dirname, model): + # check the generated README.md + filename = os.path.join(tmp_dirname, "README.md") + assert os.path.exists(filename) + with open(filename, encoding="utf-8") as f: + readme = f.read() + metainfo = re.search(r"---\n(.*?)\n---", readme, re.DOTALL).group(1) + dct = yaml.safe_load(metainfo) + assert dct["library_name"] == "peft" + + if hasattr(model, "config"): + assert dct["base_model"] == model.config.to_dict()["_name_or_path"] + else: # a custom model + assert "base_model" not in dct + + # The Hub expects the lora tag to be set for PEFT LoRA models since they + # have explicit support for things like inference. 
+ if model.active_peft_config.peft_type.value == "LORA": + assert "lora" in dct["tags"] + + def check_config_json(self, tmp_dirname, model): + # check the generated config.json + filename = os.path.join(tmp_dirname, "adapter_config.json") + assert os.path.exists(filename) + with open(filename, encoding="utf-8") as f: + config = json.load(f) + + if hasattr(model, "config"): # custom models don't have a config attribute + assert config["base_model_name_or_path"] == model.config.to_dict()["_name_or_path"] + + def perturb_trainable_token_weights_if_used(self, model, config_kwargs, adapter_name="default", scale=1.0): + """TrainableTokensLayer is initialized to be a no-op by default. Since there's currently no way to pass + `init_weights=False` to the trainable tokens layer when used in conjunction with LoRA, we have to do it like + this to make sure that it is *not* a no-op (essentially simulating "training" of the adapter). + """ + if "trainable_token_indices" not in config_kwargs: + return + + token_wrapper = None + + if hasattr(model, "get_input_embeddings"): + token_wrapper = model.get_input_embeddings() + else: + for module in model.modules(): + if isinstance(module, TrainableTokensWrapper): + token_wrapper = module + break + + # for a model with trainable_token_indices there should always be a trainable token wrapper somewhere. + # if not, then there's something broken. 
+ assert token_wrapper is not None + + token_wrapper.token_adapter.trainable_tokens_delta[adapter_name].data = ( + torch.rand_like(token_wrapper.token_adapter.trainable_tokens_delta[adapter_name].data) * scale + ) + + def _test_model_attr(self, model_id, config_cls, config_kwargs): + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + + assert hasattr(model, "save_pretrained") + assert hasattr(model, "from_pretrained") + assert hasattr(model, "push_to_hub") + + def _test_adapter_name(self, model_id, config_cls, config_kwargs): + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config, adapter_name="test-adapter") + correctly_converted = False + for n, _ in model.named_parameters(): + if "test-adapter" in n: + correctly_converted = True + break + + assert correctly_converted + + def _test_prepare_for_training(self, model_id, config_cls, config_kwargs): + if config_kwargs.get("trainable_token_indices", None) is not None: + # incompatible because trainable tokens is marking embeddings as trainable + self.skipTest("Trainable tokens is incompatible with this test.") + + # some tests require specific tokenizers, make sure that they can be fetched as well + with hub_online_once(model_id + config_kwargs.get("tokenizer_name_or_path", "")): + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + + dummy_input = self.prepare_inputs_for_testing() + dummy_output = model.get_input_embeddings()(dummy_input["input_ids"]) + + assert not dummy_output.requires_grad + + # load with `prepare_model_for_kbit_training` + model 
= self.transformers_class.from_pretrained(model_id).to(self.torch_device) + model = prepare_model_for_kbit_training(model) + + for param in model.parameters(): + assert not param.requires_grad + + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + + # For backward compatibility + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + dummy_input = self.prepare_inputs_for_testing() + dummy_output = model.get_input_embeddings()(dummy_input["input_ids"]) + + assert dummy_output.requires_grad + + def _test_load_model_low_cpu_mem_usage(self, model_id, config_cls, config_kwargs): + # Ensure that low_cpu_mem_usage=True works for from_pretrained and load_adapter and that the resulting model's + # parameters are on the correct device. 
+ with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + + # note: not using the context manager here because it fails on Windows CI for some reason + tmp_dirname = tempfile.mkdtemp() + try: + model.save_pretrained(tmp_dirname) + + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + model = PeftModel.from_pretrained( + model, tmp_dirname, torch_device=self.torch_device, low_cpu_mem_usage=True + ) + assert {p.device.type for p in model.parameters()} == {self.torch_device} + + model.load_adapter(tmp_dirname, adapter_name="other", low_cpu_mem_usage=True) + assert {p.device.type for p in model.parameters()} == {self.torch_device} + finally: + try: + shutil.rmtree(tmp_dirname) + except PermissionError: + # windows error + pass + + # also test injecting directly + del model + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + inject_adapter_in_model(config, model, low_cpu_mem_usage=True) # check that there is no error + + if not isinstance(config, LNTuningConfig): + # LN tuning does not add adapter layers that could be on meta device, it only changes the requires_grad. + # Therefore, there is no meta device for LN tuning. 
+ assert "meta" in {p.device.type for p in model.parameters()} + + def _test_save_pretrained(self, model_id, config_cls, config_kwargs, safe_serialization=True): + # ensure that the weights are randomly initialized + if issubclass(config_cls, LoraConfig): + config_kwargs = config_kwargs.copy() + config_kwargs["init_lora_weights"] = False + if issubclass(config_cls, IA3Config): + config_kwargs = config_kwargs.copy() + config_kwargs["init_ia3_weights"] = False + if hasattr(config_cls, "init_weights"): + config_kwargs = config_kwargs.copy() + config_kwargs["init_weights"] = False + + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + with tempfile.TemporaryDirectory() as tmp_dirname: + if safe_serialization: + model.save_pretrained(tmp_dirname) + else: + model.save_pretrained(tmp_dirname, safe_serialization=False) + + model_from_pretrained = self.transformers_class.from_pretrained(model_id) + with warnings.catch_warnings(record=True) as recs: + model_from_pretrained = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname) + # ensure that there is no warning + assert not any("Found missing adapter keys" in str(rec.message) for rec in recs) + + # check if the state dicts are equal + if issubclass(config_cls, PromptEncoderConfig): + # For prompt encoding, when loading the whole state_dict, there are differences, therefore, only load + # adapter-specific weights for comparison. + # TODO: is this expected? 
+ state_dict = get_peft_model_state_dict(model, unwrap_compiled=True) + state_dict_from_pretrained = get_peft_model_state_dict(model_from_pretrained, unwrap_compiled=True) + else: + state_dict = get_state_dict(model, unwrap_compiled=True) + state_dict_from_pretrained = get_state_dict(model_from_pretrained, unwrap_compiled=True) + + # check if tensors equal + for key in state_dict.keys(): + assert torch.allclose( + state_dict[key].to(self.torch_device), state_dict_from_pretrained[key].to(self.torch_device) + ) + + target_adapter_filename = "adapter_model.safetensors" if safe_serialization else "adapter_model.bin" + + # check if `adapter_model.safetensors` is present + assert os.path.exists(os.path.join(tmp_dirname, target_adapter_filename)) + + # check if `adapter_config.json` is present + assert os.path.exists(os.path.join(tmp_dirname, "adapter_config.json")) + + # check if `model.safetensors` is not present + assert not os.path.exists(os.path.join(tmp_dirname, "model.safetensors")) + + # check if `config.json` is not present + assert not os.path.exists(os.path.join(tmp_dirname, "config.json")) + + self.check_modelcard(tmp_dirname, model) + self.check_config_json(tmp_dirname, model) + + def _test_save_pretrained_selected_adapters(self, model_id, config_cls, config_kwargs, safe_serialization=True): + if issubclass(config_cls, AdaLoraConfig): + # AdaLora does not support adding more than 1 adapter + return pytest.skip(f"Test not applicable for {config_cls}") + + # ensure that the weights are randomly initialized + if issubclass(config_cls, LoraConfig): + config_kwargs = config_kwargs.copy() + config_kwargs["init_lora_weights"] = False + elif issubclass(config_cls, IA3Config): + config_kwargs = config_kwargs.copy() + config_kwargs["init_ia3_weights"] = False + elif hasattr(config_cls, "init_weights"): + config_kwargs["init_weights"] = False + + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + config = config_cls( + 
base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + new_adapter_config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + + model.add_adapter("new_adapter", new_adapter_config) + + with tempfile.TemporaryDirectory() as tmp_dirname: + if safe_serialization: + model.save_pretrained(tmp_dirname) + else: + model.save_pretrained(tmp_dirname, safe_serialization=False) + + model_from_pretrained = self.transformers_class.from_pretrained(model_id) + model_from_pretrained = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname) + + new_adapter_dir = os.path.join(tmp_dirname, "new_adapter") + model_from_pretrained.load_adapter(new_adapter_dir, "new_adapter") + + # check if the state dicts are equal + if issubclass(config_cls, PromptEncoderConfig): + # For prompt encoding, when loading the whole state_dict, there are differences, therefore, only load + # adapter-specific weights for comparison. + # TODO: is this expected? 
+ state_dict = get_peft_model_state_dict(model, unwrap_compiled=True) + state_dict_from_pretrained = get_peft_model_state_dict(model_from_pretrained, unwrap_compiled=True) + else: + state_dict = get_state_dict(model, unwrap_compiled=True) + state_dict_from_pretrained = get_state_dict(model_from_pretrained, unwrap_compiled=True) + + # check if same keys + assert state_dict.keys() == state_dict_from_pretrained.keys() + + # check if tensors equal + for key in state_dict.keys(): + assert torch.allclose( + state_dict[key].to(self.torch_device), state_dict_from_pretrained[key].to(self.torch_device) + ) + + target_adapter_filename = "adapter_model.safetensors" if safe_serialization else "adapter_model.bin" + + # check if `adapter_model.safetensors` is present + assert os.path.exists(os.path.join(tmp_dirname, target_adapter_filename)) + assert os.path.exists(os.path.join(new_adapter_dir, target_adapter_filename)) + + # check if `adapter_config.json` is present + assert os.path.exists(os.path.join(tmp_dirname, "adapter_config.json")) + assert os.path.exists(os.path.join(new_adapter_dir, "adapter_config.json")) + + # check if `model.safetensors` is not present + assert not os.path.exists(os.path.join(tmp_dirname, "model.safetensors")) + assert not os.path.exists(os.path.join(new_adapter_dir, "model.safetensors")) + + # check if `config.json` is not present + assert not os.path.exists(os.path.join(tmp_dirname, "config.json")) + assert not os.path.exists(os.path.join(new_adapter_dir, "config.json")) + + self.check_modelcard(tmp_dirname, model) + self.check_config_json(tmp_dirname, model) + + with tempfile.TemporaryDirectory() as tmp_dirname: + model.save_pretrained(tmp_dirname, selected_adapters=["default"]) + + model_from_pretrained = self.transformers_class.from_pretrained(model_id) + model_from_pretrained = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname) + + assert "default" in model_from_pretrained.peft_config.keys() + assert "new_adapter" not in 
model_from_pretrained.peft_config.keys() + + def _test_from_pretrained_config_construction(self, model_id, config_cls, config_kwargs): + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + config = config_cls(base_model_name_or_path=model_id, **config_kwargs) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + with tempfile.TemporaryDirectory() as tmp_dirname: + model.save_pretrained(tmp_dirname) + + model_from_pretrained = self.transformers_class.from_pretrained(model_id) + model_from_pretrained = PeftModel.from_pretrained( + model_from_pretrained, tmp_dirname, is_trainable=False, config=config + ) + + assert model_from_pretrained.peft_config["default"].inference_mode + assert model_from_pretrained.peft_config["default"] is config + + def _test_load_multiple_adapters(self, model_id, config_cls, config_kwargs): + # just ensure that this works and raises no error + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + + with tempfile.TemporaryDirectory() as tmp_dirname: + model.save_pretrained(tmp_dirname) + del model + + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + model = PeftModel.from_pretrained(model, tmp_dirname, torch_device=self.torch_device) + load_result1 = model.load_adapter(tmp_dirname, adapter_name="other") + load_result2 = model.load_adapter(tmp_dirname, adapter_name="yet-another") + + # VBLoRA uses a shared "vblora_vector_bank" across all layers, causing it to appear + # in the missing keys list, which leads to failed test cases. So + # skipping the missing keys check for VBLoRA. 
+ if config.peft_type != "VBLORA": + assert load_result1.missing_keys == [] + assert load_result2.missing_keys == [] + + def _test_merge_layers_fp16(self, model_id, config_cls, config_kwargs): + if ( + config_cls not in (LoraConfig, IA3Config, AdaLoraConfig, LoHaConfig, LoKrConfig, VBLoRAConfig) + or config_kwargs.get("alora_invocation_tokens") is not None + ): + # Merge layers only supported for LoRA and IA³, and not for Activated LoRA (aLoRA) + if config_kwargs.get("alora_invocation_tokens") is None: + return pytest.skip(f"Test not applicable for {config_cls}") + else: + return pytest.skip("Test not applicable for Activated LoRA") + if ("gpt2" in model_id.lower()) and (config_cls != LoraConfig): + self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") + + if (self.torch_device in ["cpu"]) and (version.parse(torch.__version__) <= version.parse("2.1")): + self.skipTest("PyTorch 2.1 not supported for Half of addmm_impl_cpu_ ") + + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id, torch_dtype=torch.float16) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + model = model.to(device=self.torch_device, dtype=torch.float16) + + model.eval() + + # This should simply work + _ = model.merge_and_unload() + + def _test_merge_layers_nan(self, model_id, config_cls, config_kwargs): + if ( + config_cls + not in ( + LoraConfig, + IA3Config, + AdaLoraConfig, + LoHaConfig, + LoKrConfig, + VeraConfig, + FourierFTConfig, + ) + or config_kwargs.get("alora_invocation_tokens") is not None + ): + # Merge layers only supported for LoRA and IA³, and not for Activated LoRA (aLoRA) + return + if ("gpt2" in model_id.lower()) and (config_cls != LoraConfig): + self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") + + if "gemma" in model_id.lower(): + # TODO: could be related to tied weights + self.skipTest("Merging currently fails with gemma") + + with 
hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + self.perturb_trainable_token_weights_if_used(model, config_kwargs) + + dummy_input = self.prepare_inputs_for_testing() + + model.eval() + + # This should work + logits_unmerged = model(**dummy_input)[0] + + model = model.merge_and_unload() + logits_merged = model(**dummy_input)[0] + + assert torch.allclose(logits_unmerged, logits_merged, atol=1e-3, rtol=1e-3) + + model = self.transformers_class.from_pretrained(model_id) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + for name, module in model.named_parameters(): + if ( + "lora_A" in name + or "ia3" in name + or "lora_E" in name + or "lora_B" in name + or "vera_lambda" in name + or "fourierft_spectrum" in name + ): + module.data[0] = torch.nan + + with pytest.raises( + ValueError, match="NaNs detected in the merged weights. The adapter default seems to be broken" + ): + model = model.merge_and_unload(safe_merge=True) + + for name, module in model.named_parameters(): + if ( + "lora_A" in name + or "ia3" in name + or "lora_E" in name + or "lora_B" in name + or "vera_lambda" in name + or "fourierft_spectrum" in name + ): + module.data[0] = torch.inf + + with pytest.raises( + ValueError, match="NaNs detected in the merged weights. 
The adapter default seems to be broken" + ): + model = model.merge_and_unload(safe_merge=True) + + def _test_merge_layers(self, model_id, config_cls, config_kwargs): + if issubclass(config_cls, PromptLearningConfig): + return pytest.skip(f"Test not applicable for {config_cls}") + + if issubclass(config_cls, (OFTConfig, BOFTConfig)): + return pytest.skip(f"Test not applicable for {config_cls}") + + if config_kwargs.get("alora_invocation_tokens") is not None: + return pytest.skip("Merging not applicable to aLoRA") + + if ("gpt2" in model_id.lower()) and (config_cls != LoraConfig): + self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") + + if "gemma" in model_id.lower(): + # TODO: could be related to tied weights + self.skipTest("Merging currently fails with gemma") + + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + self.perturb_trainable_token_weights_if_used(model, config_kwargs) + + dummy_input = self.prepare_inputs_for_testing() + model.eval() + logits = model(**dummy_input)[0] + + model.merge_adapter() + logits_merged = model(**dummy_input)[0] + model.unmerge_adapter() + logits_unmerged = model(**dummy_input)[0] + + model = model.merge_and_unload() + + # check that PEFT layers are completely removed + assert not any(isinstance(module, BaseTunerLayer) for module in model.modules()) + logits_merged_unloaded = model(**dummy_input)[0] + + conv_ids = ["Conv2d", "Conv3d", "Conv2d2"] + atol, rtol = 1e-4, 1e-4 + if self.torch_device in ["mlu"]: + atol, rtol = 1e-3, 1e-3 # MLU + if config.peft_type == "ADALORA": + # AdaLoRA is a bit flaky on CI, but this cannot be reproduced locally + atol, rtol = 1e-2, 1e-2 + if (config.peft_type in {"IA3", "LORA"}) and (model_id in conv_ids): + # for some reason, the Conv introduces a larger error + atol, rtol = 
0.3, 0.01 + if model_id == "trl-internal-testing/tiny-Llama4ForCausalLM": + # also getting larger errors here, not exactly sure why + atol, rtol = 0.3, 0.01 + assert torch.allclose(logits, logits_merged, atol=atol, rtol=rtol) + assert torch.allclose(logits, logits_unmerged, atol=atol, rtol=rtol) + assert torch.allclose(logits, logits_merged_unloaded, atol=atol, rtol=rtol) + + # For this test to work, weights should not be initialized to identity transform (e.g. + # init_lora_weights should be False). + transformers_model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + logits_transformers = transformers_model(**dummy_input)[0] + assert not torch.allclose(logits_merged, logits_transformers, atol=1e-10, rtol=1e-10) + + # test that the logits are identical after a save-load-roundtrip + if hasattr(model, "save_pretrained"): + # model is a transformers model + tmp_dirname = tempfile.mkdtemp() + # note: not using the context manager here because it fails on Windows CI for some reason + try: + model.save_pretrained(tmp_dirname) + model_from_pretrained = self.transformers_class.from_pretrained(tmp_dirname).to(self.torch_device) + finally: + try: + shutil.rmtree(tmp_dirname) + except PermissionError: + # windows error + pass + else: + # model is not a transformers model + model_from_pretrained = pickle.loads(pickle.dumps(model)) + + logits_merged_from_pretrained = model_from_pretrained(**dummy_input)[0] + assert torch.allclose(logits_merged, logits_merged_from_pretrained, atol=atol, rtol=rtol) + + def _test_merge_layers_multi(self, model_id, config_cls, config_kwargs): + supported_peft_types = [ + PeftType.LORA, + PeftType.LOHA, + PeftType.LOKR, + PeftType.IA3, + PeftType.OFT, + PeftType.BOFT, + PeftType.HRA, + PeftType.BONE, + PeftType.MISS, + ] + + if ("gpt2" in model_id.lower()) and (config_cls == IA3Config): + self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") + + if config_kwargs.get("trainable_token_indices", None) is not 
None: + self.skipTest( + "Merging two adapters with trainable tokens is tested elsewhere since adapters with " + "the same token indices cannot be merged." + ) + + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + + if config.peft_type not in supported_peft_types or config_kwargs.get("alora_invocation_tokens") is not None: + return + + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + dummy_input = self.prepare_inputs_for_testing() + model.eval() + + with torch.inference_mode(): + logits_adapter_1 = model(**dummy_input)[0] + + model.add_adapter("adapter-2", config) + model.set_adapter("adapter-2") + model.eval() + + # sanity check: each adapter layer with a 'default' adapter should also have 'adapter-2' + containers = (torch.nn.ModuleDict, torch.nn.ParameterDict, BufferDict) + num_default = len([m for m in model.modules() if isinstance(m, containers) and "default" in m]) + num_adapter2 = len([m for m in model.modules() if isinstance(m, containers) and "adapter-2" in m]) + assert num_default > 0 + assert num_default == num_adapter2 + + with torch.inference_mode(): + logits_adapter_2 = model(**dummy_input)[0] + + assert not torch.allclose(logits_adapter_1, logits_adapter_2, atol=1e-3, rtol=1e-3) + + model.set_adapter("default") + + with torch.inference_mode(): + logits_adapter_1_after_set = model(**dummy_input)[0] + + assert torch.allclose(logits_adapter_1_after_set, logits_adapter_1, atol=1e-3, rtol=1e-3) + + model_copy = copy.deepcopy(model) + model_copy_2 = copy.deepcopy(model) + model_merged_all = model.merge_and_unload(adapter_names=["adapter-2", "default"]) + + with torch.inference_mode(): + logits_merged_all = model_merged_all(**dummy_input)[0] + + assert not torch.allclose(logits_merged_all, logits_adapter_2, atol=1e-3, rtol=1e-3) + assert not torch.allclose(logits_merged_all, logits_adapter_1, atol=1e-3, 
rtol=1e-3) + + model_merged_adapter_2 = model_copy.merge_and_unload(adapter_names=["adapter-2"]) + + with torch.inference_mode(): + logits_merged_adapter_2 = model_merged_adapter_2(**dummy_input)[0] + + assert torch.allclose(logits_merged_adapter_2, logits_adapter_2, atol=1e-3, rtol=1e-3) + + model_merged_adapter_default = model_copy_2.merge_and_unload(adapter_names=["default"]) + + with torch.inference_mode(): + logits_merged_adapter_default = model_merged_adapter_default(**dummy_input)[0] + + assert torch.allclose(logits_merged_adapter_default, logits_adapter_1, atol=1e-3, rtol=1e-3) + + def _test_merge_layers_is_idempotent(self, model_id, config_cls, config_kwargs): + if config_kwargs.get("alora_invocation_tokens") is not None: + # Merging not supported for Activated LoRA (aLoRA) + return pytest.skip("Test not applicable for Activated LoRA (aLoRA)") + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + model.eval() + torch.manual_seed(0) + model.merge_adapter() + logits_0 = model(**self.prepare_inputs_for_testing())[0] + + # merging again should not change anything + # also check warning: + with pytest.warns(UserWarning, match="All adapters are already merged, nothing to do"): + model.merge_adapter() + logits_1 = model(**self.prepare_inputs_for_testing())[0] + + assert torch.allclose(logits_0, logits_1, atol=1e-6, rtol=1e-6) + + def _test_safe_merge(self, model_id, config_cls, config_kwargs): + if config_kwargs.get("alora_invocation_tokens") is not None: + # Merging not supported for Activated LoRA (aLoRA) + return pytest.skip("Test not applicable for Activated LoRA (aLoRA)") + torch.manual_seed(0) + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + config = config_cls( + base_model_name_or_path=model_id, + 
**config_kwargs, + ) + model = model.to(self.torch_device).eval() + + inputs = self.prepare_inputs_for_testing() + logits_base = model(**inputs)[0] + + model = get_peft_model(model, config).eval() + logits_peft = model(**inputs)[0] + + atol, rtol = 1e-6, 1e-6 # default + # Initializing with LN tuning cannot be configured to change the outputs (unlike init_lora_weights=False) + if not issubclass(config_cls, LNTuningConfig): + # sanity check that the logits are different + assert not torch.allclose(logits_base, logits_peft, atol=atol, rtol=rtol) + + model_unloaded = model.merge_and_unload(safe_merge=True) + logits_unloaded = model_unloaded(**inputs)[0] + + if self.torch_device in ["mlu"]: + atol, rtol = 1e-3, 1e-3 # MLU + + conv_ids = ["Conv2d", "Conv3d", "Conv2d2"] + if issubclass(config_cls, (IA3Config, LoraConfig)) and model_id in conv_ids: # more instability with Conv + atol, rtol = 1e-3, 1e-3 + + # check that the logits are the same after unloading + assert torch.allclose(logits_peft, logits_unloaded, atol=atol, rtol=rtol) + + # Ensure that serializing with safetensors works, there was an error when weights were not contiguous + with tempfile.TemporaryDirectory() as tmp_dirname: + # serializing with torch.save works + torch.save(model_unloaded.state_dict(), os.path.join(tmp_dirname, "model.bin")) + + # serializing with safetensors works + save_file(model_unloaded.state_dict(), os.path.join(tmp_dirname, "model.safetensors")) + + def _test_mixed_adapter_batches(self, model_id, config_cls, config_kwargs): + # Test for mixing different adapters in a single batch by passing the adapter_names argument + if config_cls not in (LoraConfig,): + return pytest.skip(f"Mixed adapter batches not supported for {config_cls}") + + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + + torch.manual_seed(0) + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + model = get_peft_model(model, config, 
adapter_name="adapter0").eval() + model.add_adapter("adapter1", config) + model = model.to(self.torch_device).eval() + + self.perturb_trainable_token_weights_if_used(model, config_kwargs, adapter_name="adapter0") + self.perturb_trainable_token_weights_if_used(model, config_kwargs, adapter_name="adapter1") + + dummy_input = self.prepare_inputs_for_testing() + # ensure that we have at least 3 samples for this test + dummy_input = {k: torch.cat([v for _ in range(3)]) for k, v in dummy_input.items()} + with torch.inference_mode(): + with model.disable_adapter(): + output_base = model(**dummy_input)[0] + logits_base = model.generate(**dummy_input, return_dict_in_generate=True, output_scores=True).scores[0] + + model.set_adapter("adapter0") + with torch.inference_mode(): + output_adapter0 = model(**dummy_input)[0] + logits_adapter0 = model.generate(**dummy_input, return_dict_in_generate=True, output_scores=True).scores[0] + + model.set_adapter("adapter1") + with torch.inference_mode(): + output_adapter1 = model(**dummy_input)[0] + logits_adapter1 = model.generate(**dummy_input, return_dict_in_generate=True, output_scores=True).scores[0] + + atol, rtol = 1e-4, 1e-4 + # sanity check that there are enough outputs and that they are different + assert len(output_base) == len(output_adapter0) == len(output_adapter1) >= 3 + assert len(logits_base) == len(logits_adapter0) == len(logits_adapter1) >= 3 + assert not torch.allclose(output_base, output_adapter0, atol=atol, rtol=rtol) + assert not torch.allclose(output_base, output_adapter1, atol=atol, rtol=rtol) + assert not torch.allclose(output_adapter0, output_adapter1, atol=atol, rtol=rtol) + assert not torch.allclose(logits_base, logits_adapter0, atol=atol, rtol=rtol) + assert not torch.allclose(logits_base, logits_adapter1, atol=atol, rtol=rtol) + assert not torch.allclose(logits_adapter0, logits_adapter1, atol=atol, rtol=rtol) + + # alternate between base model, adapter0, and adapter1 + adapters = ["__base__", "adapter0", 
"adapter1"] + dummy_input["adapter_names"] = [adapters[i % 3] for i in (range(len(dummy_input["input_ids"])))] + with torch.inference_mode(): + output_mixed = model(**dummy_input)[0] + logits_mixed = model.generate(**dummy_input, return_dict_in_generate=True, output_scores=True).scores[0] + + assert torch.allclose(output_base[::3], output_mixed[::3], atol=atol, rtol=rtol) + assert torch.allclose(output_adapter0[1::3], output_mixed[1::3], atol=atol, rtol=rtol) + assert torch.allclose(output_adapter1[2::3], output_mixed[2::3], atol=atol, rtol=rtol) + assert torch.allclose(logits_base[::3], logits_mixed[::3], atol=atol, rtol=rtol) + assert torch.allclose(logits_adapter0[1::3], logits_mixed[1::3], atol=atol, rtol=rtol) + assert torch.allclose(logits_adapter1[2::3], logits_mixed[2::3], atol=atol, rtol=rtol) + + def _test_generate_with_mixed_adapter_batches_and_beam_search(self, model_id, config_cls, config_kwargs): + # Test generating with beam search and with mixing different adapters in a single batch by passing the + # adapter_names argument. See #2283. + if config_cls not in (LoraConfig,): + return pytest.skip(f"Mixed adapter batches not supported for {config_cls}") + if config_kwargs.get("alora_invocation_tokens") is not None: + return pytest.skip("Beam search not yet supported for aLoRA") # beam search not yet fully supported + if config_kwargs.get("trainable_token_indices", None) is not None: + # for some configurations this test will fail since the adapter values don't differ. + # this is probably a problem with the test setup and not with the implementation. 
+ return pytest.skip("Trainable token indices is not supported here (yet).") + + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + + torch.manual_seed(0) + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + model = get_peft_model(model, config, adapter_name="adapter0").eval() + model.add_adapter("adapter1", config) + + # In contrast to forward, for generate, it can sometimes happen that we get the same results as the base model + # even with LoRA applied because the impact of LoRA is not big enough. Therefore, use this "trick" to make LoRA + # stronger. + for name, param in model.named_parameters(): + if model.base_model.prefix in name: + param.data.mul_(10.0) + + model = model.to(self.torch_device).eval() + + dummy_input = self.prepare_inputs_for_testing() + # ensure that we have at least 3 samples for this test + dummy_input = {k: torch.cat([v for _ in range(3)]) for k, v in dummy_input.items()} + gen_kwargs = {**dummy_input, "max_length": 20, "num_beams": 10, "early_stopping": True} + with torch.inference_mode(): + with model.disable_adapter(): + gen_base = model.generate(**gen_kwargs) + + model.set_adapter("adapter0") + with torch.inference_mode(): + gen_adapter0 = model.generate(**gen_kwargs) + + model.set_adapter("adapter1") + with torch.inference_mode(): + gen_adapter1 = model.generate(**gen_kwargs) + + def remove_padding(seq, pad_value): + lst = list(seq) + while lst and (lst[-1] == pad_value): + lst.pop() + return lst + + def gens_are_same(gen0, gen1): + # Special function to compare generations. We cannot use torch.allclose because it will raise an error when sequence + # lengths differ. Moreover, we need to remove the padding from the sequences. 
This is because, even though + # normally identical sequences should have the same length, when we do mixed adapter batches, each sample + # will be padded to the longest sequence in that mixed batch, which can be different from the longest + # sequence without mixed adapter batches. + pad_value = model.config.eos_token_id + for sample0, sample1 in zip(gen0, gen1): + sample0 = remove_padding(sample0, pad_value) + sample1 = remove_padding(sample1, pad_value) + if (len(sample0) != len(sample1)) or (sample0 != sample1): + # at least one sample differs, the generations are not identical + return False + return True + + # sanity check that there are enough outputs and that they are different + assert len(gen_base) == len(gen_adapter0) == len(gen_adapter1) + assert len(gen_adapter1) >= 3 + assert not gens_are_same(gen_base, gen_adapter0) + assert not gens_are_same(gen_base, gen_adapter1) + assert not gens_are_same(gen_adapter0, gen_adapter1) + + # alternate between base model, adapter0, and adapter1 + adapters = ["__base__", "adapter0", "adapter1"] + gen_kwargs["adapter_names"] = [adapters[i % 3] for i in (range(len(dummy_input["input_ids"])))] + + with torch.inference_mode(): + gen_mixed = model.generate(**gen_kwargs) + + assert gens_are_same(gen_base[::3], gen_mixed[::3]) + assert gens_are_same(gen_adapter0[1::3], gen_mixed[1::3]) + assert gens_are_same(gen_adapter1[2::3], gen_mixed[2::3]) + + def _test_generate(self, model_id, config_cls, config_kwargs): + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + inputs = self.prepare_inputs_for_testing() + + # check if `generate` works + _ = model.generate(**inputs) + + def _test_generate_pos_args(self, model_id, config_cls, config_kwargs, raises_err: bool): + with hub_online_once(model_id): + model = 
self.transformers_class.from_pretrained(model_id) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + inputs = self.prepare_inputs_for_testing() + if raises_err: + with pytest.raises(TypeError): + # check if `generate` raises an error if positional arguments are passed + _ = model.generate(inputs["input_ids"]) + else: + # check if `generate` works if positional arguments are passed + _ = model.generate(inputs["input_ids"]) + + def _test_generate_half_prec(self, model_id, config_cls, config_kwargs): + if config_cls not in (IA3Config, LoraConfig, PrefixTuningConfig): + return pytest.skip(f"Test not applicable for {config_cls}") + + if self.torch_device == "mps": # BFloat16 is not supported on MPS + return pytest.skip("BFloat16 is not supported on MPS") + + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id, torch_dtype=torch.bfloat16) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + input_ids = torch.LongTensor([[1, 1, 1], [2, 1, 2]]).to(self.torch_device) + attention_mask = torch.LongTensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) + + # check if `generate` works + _ = model.generate(input_ids=input_ids, attention_mask=attention_mask) + + def _test_prefix_tuning_half_prec_conversion(self, model_id, config_cls, config_kwargs): + if config_cls not in (PrefixTuningConfig,): + return pytest.skip(f"Test not applicable for {config_cls}") + + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + model = get_peft_model(model, config) + model = model.half() + + assert model.base_model_torch_dtype == torch.float16 + + def _test_training(self, model_id, config_cls, config_kwargs): + if 
issubclass(config_cls, PromptLearningConfig): + return pytest.skip(f"Test not applicable for {config_cls}") + if (config_cls == AdaLoraConfig) and ("roberta" in model_id.lower()): + # TODO: no gradients on the "dense" layer, other layers work, not sure why + self.skipTest("AdaLora with RoBERTa does not work correctly") + + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + inputs = self.prepare_inputs_for_testing() + + # check if `training` works + output = model(**inputs)[0] + loss = output.sum() + loss.backward() + parameter_prefix = model.prefix + for n, param in model.named_parameters(): + if (parameter_prefix in n) or ("modules_to_save" in n) or ("token_adapter.trainable_tokens" in n): + assert param.grad is not None + else: + assert param.grad is None + + def _test_inference_safetensors(self, model_id, config_cls, config_kwargs): + if (config_cls == PrefixTuningConfig) and ("deberta" in model_id.lower()): + # TODO: raises an error: + # TypeError: DebertaModel.forward() got an unexpected keyword argument 'past_key_values' + self.skipTest("DeBERTa with PrefixTuning does not work correctly") + + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + inputs = self.prepare_inputs_for_testing() + + # check if `training` works + output = model(**inputs)[0] + logits = output[0] + + loss = output.sum() + loss.backward() + + # set to eval mode, since things like dropout can affect the output otherwise + model.eval() + logits = model(**inputs)[0][0] + + with tempfile.TemporaryDirectory() as tmp_dirname: + model.save_pretrained(tmp_dirname, safe_serialization=True) + 
assert "adapter_model.safetensors" in os.listdir(tmp_dirname) + assert "adapter_model.bin" not in os.listdir(tmp_dirname) + + model_from_pretrained = self.transformers_class.from_pretrained(model_id) + model_from_pretrained = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname).to( + self.torch_device + ) + + logits_from_pretrained = model_from_pretrained(**inputs)[0][0] + assert torch.allclose(logits, logits_from_pretrained, atol=1e-4, rtol=1e-4) + + def _test_training_layer_indexing(self, model_id, config_cls, config_kwargs): + if config_cls not in (LoraConfig,): + return pytest.skip(f"Test not applicable for {config_cls}") + + config = config_cls( + base_model_name_or_path=model_id, + layers_to_transform=[0], + **config_kwargs, + ) + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + inputs = self.prepare_inputs_for_testing() + + # check if `training` works + output = model(**inputs)[0] + logits = output[0] + + loss = output.sum() + loss.backward() + + has_trainable_tokens = config_kwargs.get("trainable_token_indices", None) is not None + nb_trainable = 0 + + for n, param in model.named_parameters(): + if model.prefix in n or (has_trainable_tokens and "trainable_tokens" in n): + assert param.grad is not None + nb_trainable += 1 + else: + assert param.grad is None + + with tempfile.TemporaryDirectory() as tmp_dirname: + model.save_pretrained(tmp_dirname) + + model_from_pretrained = self.transformers_class.from_pretrained(model_id) + model_from_pretrained = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname).to( + self.torch_device + ) + + logits_from_pretrained = model_from_pretrained(**inputs)[0][0] + assert torch.allclose(logits, logits_from_pretrained, atol=1e-4, rtol=1e-4) + + # check the nb of trainable params again but without layers_to_transform + model = self.transformers_class.from_pretrained(model_id) + config = 
config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + nb_trainable_all = 0 + + for n, param in model.named_parameters(): + if model.prefix in n or (has_trainable_tokens and "trainable_tokens" in n): + nb_trainable_all += 1 + + mod_list = next((m for m in model.modules() if isinstance(m, torch.nn.ModuleList)), None) + if mod_list and len(mod_list) == 1: + # there is only a single layer + assert nb_trainable == nb_trainable_all + else: + # more than 1 layer, i.e. setting layers_to_transform=[0] should target fewer layers + assert nb_trainable < nb_trainable_all + + def _test_training_gradient_checkpointing(self, model_id, config_cls, config_kwargs): + if config_cls == PrefixTuningConfig: + return pytest.skip(f"Test not applicable for {config_cls}") + + if (config_cls == AdaLoraConfig) and ("roberta" in model_id.lower()): + # TODO: no gradients on the "dense" layer, other layers work, not sure why + self.skipTest("AdaLora with RoBERTa does not work correctly") + + if (config_cls == OFTConfig) and ("deberta" in model_id.lower()): + # TODO: no gradients on the "dense" layer, other layers work, not sure why + self.skipTest("OFT with Deberta does not work correctly") + + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + + if not getattr(model, "supports_gradient_checkpointing", False): + return pytest.skip(f"Model {model_id} does not support gradient checkpointing") + + model.gradient_checkpointing_enable() + + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + inputs = self.prepare_inputs_for_testing() + + # check if `training` works + output = model(**inputs)[0] + + loss = output.sum() + loss.backward() + + for n, param in model.named_parameters(): + if "prompt_encoder." 
in n: # prompt tuning methods + if not issubclass(config_cls, CPTConfig): + assert param.grad is not None + elif ( + "delta_embedding" in n + ): # delta_embedding is the embedding that should be updated with grads in CPT + assert param.grad is not None + elif hasattr(model, "prefix") and (model.prefix in n): # non-prompt tuning methods + assert param.grad is not None + elif "trainable_tokens_" in n: # trainable tokens layer + assert param.grad is not None + else: + assert param.grad is None + + def _test_peft_model_device_map(self, model_id, config_cls, config_kwargs): + if config_cls not in (LoraConfig, VBLoRAConfig): + return pytest.skip(f"Test not applicable for {config_cls}") + + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + with tempfile.TemporaryDirectory() as tmp_dirname: + model.save_pretrained(tmp_dirname) + + model_from_pretrained = self.transformers_class.from_pretrained(model_id) + _ = PeftModel.from_pretrained(model_from_pretrained, tmp_dirname, device_map={"": "cpu"}).to( + self.torch_device + ) + + def _test_training_prompt_learning_tasks(self, model_id, config_cls, config_kwargs): + if not issubclass(config_cls, PromptLearningConfig): + return pytest.skip(f"Test not applicable for {config_cls}") + + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + inputs = self.prepare_inputs_for_testing() + + # check if `training` works + output = model(**inputs)[0] + loss = output.sum() + loss.backward() + + if issubclass(config_cls, CPTConfig): + parameters = [] + for name, param in model.prompt_encoder.named_parameters(): + if name != 
"default.embedding.weight": + parameters.append(param) + else: + parameters = model.prompt_encoder.parameters() + + # check that prompt encoder has grads + for param in parameters: + assert param.grad is not None + + def _test_delete_adapter(self, model_id, config_cls, config_kwargs): + supported_peft_types = [ + PeftType.LORA, + PeftType.LOHA, + PeftType.LOKR, + PeftType.IA3, + PeftType.OFT, + PeftType.BOFT, + PeftType.VERA, + PeftType.FOURIERFT, + PeftType.HRA, + PeftType.VBLORA, + PeftType.BONE, + PeftType.MISS, + ] + # IA3 does not support deleting adapters yet, but it just needs to be added + # AdaLora does not support multiple adapters + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + if config.peft_type not in supported_peft_types: + return pytest.skip(f"Test not applicable for {config.peft_type}") + + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + adapter_to_delete = "delete_me" + model = get_peft_model(model, config) + model.add_adapter(adapter_to_delete, config) + model.set_adapter(adapter_to_delete) + model = model.to(self.torch_device) + model.delete_adapter(adapter_to_delete) + assert adapter_to_delete not in model.peft_config + assert model.active_adapters == ["default"] + + key_list = [key for key, _ in model.named_modules()] + for key in key_list: + _, target, _ = _get_submodules(model, key) + attributes_to_check = getattr(target, "adapter_layer_names", []) + getattr( + target, "other_param_names", [] + ) + for attr in attributes_to_check: + assert adapter_to_delete not in attrgetter(attr)(target) + + # check auxiliary modules + for module in model.modules(): + if isinstance(module, AuxiliaryTrainingWrapper): + assert adapter_to_delete not in module._adapters + assert module.active_adapters == ["default"] + if isinstance(module, ModulesToSaveWrapper): + assert adapter_to_delete not in module.modules_to_save + elif isinstance(module, TrainableTokensWrapper): + assert 
adapter_to_delete not in module.token_adapter.trainable_tokens_delta + assert adapter_to_delete not in module.token_adapter.trainable_tokens_original + + # check that we can also delete the last remaining adapter + model.delete_adapter("default") + assert "default" not in model.peft_config + assert model.active_adapters == [] + + for module in model.modules(): + if isinstance(module, AuxiliaryTrainingWrapper): + assert "default" not in module._adapters + assert module.active_adapters == [] + if isinstance(module, ModulesToSaveWrapper): + assert "default" not in module.modules_to_save + elif isinstance(module, TrainableTokensWrapper): + assert "default" not in module.token_adapter.trainable_tokens_delta + assert "default" not in module.token_adapter.trainable_tokens_original + + input = self.prepare_inputs_for_testing() + # note: we cannot call model(**input) because PeftModel always expects there to be at least one adapter + model.base_model(**input) # should not raise an error + + def _test_delete_inactive_adapter(self, model_id, config_cls, config_kwargs): + # same as test_delete_adapter, but this time an inactive adapter is deleted + supported_peft_types = [ + PeftType.LORA, + PeftType.LOHA, + PeftType.LOKR, + PeftType.IA3, + PeftType.OFT, + PeftType.BOFT, + PeftType.FOURIERFT, + PeftType.HRA, + PeftType.VBLORA, + PeftType.BONE, + PeftType.MISS, + ] + # IA3 does not support deleting adapters yet, but it just needs to be added + # AdaLora does not support multiple adapters + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + if config.peft_type not in supported_peft_types: + return pytest.skip(f"Test not applicable for {config.peft_type}") + + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + adapter_to_delete = "delete_me" + model = get_peft_model(model, config) + model.add_adapter(adapter_to_delete, config) + # "delete_me" is added but not activated + model = model.to(self.torch_device) 
+ model.delete_adapter(adapter_to_delete) + assert adapter_to_delete not in model.peft_config + assert model.active_adapters == ["default"] + + key_list = [key for key, _ in model.named_modules()] + for key in key_list: + _, target, _ = _get_submodules(model, key) + attributes_to_check = getattr(target, "adapter_layer_names", []) + getattr( + target, "other_param_names", [] + ) + for attr in attributes_to_check: + assert adapter_to_delete not in attrgetter(attr)(target) + + # check auxiliary modules + for module in model.modules(): + if isinstance(module, AuxiliaryTrainingWrapper): + assert adapter_to_delete not in module._adapters + assert module.active_adapters == ["default"] + if isinstance(module, ModulesToSaveWrapper): + assert adapter_to_delete not in module.modules_to_save + elif isinstance(module, TrainableTokensWrapper): + assert adapter_to_delete not in module.token_adapter.trainable_tokens_delta + assert adapter_to_delete not in module.token_adapter.trainable_tokens_original + + # check that we can also delete the last remaining adapter + model.delete_adapter("default") + assert "default" not in model.peft_config + assert model.active_adapters == [] + + for module in model.modules(): + if isinstance(module, AuxiliaryTrainingWrapper): + assert "default" not in module._adapters + assert module.active_adapters == [] + if isinstance(module, ModulesToSaveWrapper): + assert "default" not in module.modules_to_save + elif isinstance(module, TrainableTokensWrapper): + assert "default" not in module.token_adapter.trainable_tokens_delta + assert "default" not in module.token_adapter.trainable_tokens_original + + input = self.prepare_inputs_for_testing() + # note: we cannot call model(**input) because PeftModel always expects there to be at least one adapter + model.base_model(**input) # should not raise an error + + def _test_delete_unknown_adapter_raises(self, model_id, config_cls, config_kwargs): + # Check that we get a nice error message when trying to delete an 
adapter that does not exist. + config = config_cls(base_model_name_or_path=model_id, **config_kwargs) + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + adapter_to_delete = "delete_me" + model = get_peft_model(model, config) + + msg = "Adapter unknown-adapter does not exist" + with pytest.raises(ValueError, match=msg): + model.delete_adapter("unknown-adapter") + + def _test_unload_adapter(self, model_id, config_cls, config_kwargs): + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + num_params_base = len(model.state_dict()) + dummy_input = self.prepare_inputs_for_testing() + with torch.inference_mode(): + logits_transformers = model(**dummy_input)[0] + + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config) + model = model.to(self.torch_device) + + if isinstance(config, PromptLearningConfig): + # prompt learning does not support unloading + with pytest.raises(AttributeError): + model = model.unload() + else: + self.perturb_trainable_token_weights_if_used(model, config_kwargs) + with torch.inference_mode(): + logits_with_adapter = model(**dummy_input)[0] + + model.eval() + model = model.unload() + num_params_unloaded = len(model.state_dict()) + with torch.inference_mode(): + logits_unload = model(**dummy_input)[0] + + # check that PEFT layers are completely removed + assert not any(isinstance(module, BaseTunerLayer) for module in model.modules()) + assert not torch.allclose(logits_with_adapter, logits_unload, atol=1e-10, rtol=1e-10) + assert torch.allclose(logits_transformers, logits_unload, atol=1e-4, rtol=1e-4) + assert num_params_base == num_params_unloaded + + def _test_weighted_combination_of_adapters_lora(self, model, config, adapter_list, weight_list): + model.add_adapter(adapter_list[1], config) + model.add_adapter(adapter_list[2], replace(config, r=20)) + model = 
model.to(self.torch_device) + + # test re-weighting single adapter + model.add_weighted_adapter([adapter_list[0]], [weight_list[0]], "single_adapter_reweighting") + + # test svd re-weighting with multiple adapters + model.add_weighted_adapter(adapter_list[1:], weight_list[1:], "multi_adapter_svd_reweighting") + + # test ties_svd re-weighting with multiple adapters + model.add_weighted_adapter( + adapter_list[1:], + weight_list[1:], + "multi_adapter_ties_svd_reweighting", + combination_type="ties_svd", + density=0.5, + ) + + # test dare_linear_svd re-weighting with multiple adapters + model.add_weighted_adapter( + adapter_list[1:], + weight_list[1:], + "multi_adapter_dare_linear_svd_reweighting", + combination_type="dare_linear_svd", + density=0.5, + ) + + # test dare_ties_svd re-weighting with multiple adapters + model.add_weighted_adapter( + adapter_list[1:], + weight_list[1:], + "multi_adapter_dare_ties_svd_reweighting", + combination_type="dare_ties_svd", + density=0.5, + ) + + # test magnitude_prune_svd re-weighting with multiple adapters + model.add_weighted_adapter( + adapter_list[1:], + weight_list[1:], + "multi_adapter_magnitude_prune_svd_reweighting", + combination_type="magnitude_prune_svd", + density=0.5, + ) + + # test cat re-weighting with multiple adapters + model.add_weighted_adapter( + adapter_list[1:], weight_list[1:], "multi_adapter_cat_reweighting", combination_type="cat" + ) + + # test linear re-weighting with multiple adapters + model.add_weighted_adapter( + adapter_list[:2], weight_list[:2], "multi_adapter_linear_reweighting", combination_type="linear" + ) + + # test ties re-weighting with multiple adapters + model.add_weighted_adapter( + adapter_list[:2], weight_list[:2], "multi_adapter_ties_reweighting", combination_type="ties", density=0.5 + ) + + # test dare_linear re-weighting with multiple adapters + model.add_weighted_adapter( + adapter_list[:2], + weight_list[:2], + "multi_adapter_dare_linear_reweighting", + 
combination_type="dare_linear", + density=0.5, + ) + + # test dare_ties re-weighting with multiple adapters + model.add_weighted_adapter( + adapter_list[:2], + weight_list[:2], + "multi_adapter_dare_ties_reweighting", + combination_type="dare_ties", + density=0.5, + ) + + # test magnitude_prune re-weighting with multiple adapters + model.add_weighted_adapter( + adapter_list[:2], + weight_list[:2], + "multi_adapter_magnitude_prune_reweighting", + combination_type="magnitude_prune", + density=0.5, + ) + + # test linear re-weighting with multiple adapters with only first adapter having non zero weight + model.add_weighted_adapter( + adapter_list[:2], + [weight_list[0], 0], + "multi_adapter_linear_reweighting_single_enabled", + combination_type="linear", + ) + + with pytest.raises(ValueError): + model.add_weighted_adapter( + adapter_list[1:], + weight_list[1:], + "multi_adapter_linear_reweighting_uneven_r", + combination_type="linear", + ) + + with pytest.raises(ValueError): + model.add_weighted_adapter( + adapter_list[1:], + weight_list[1:], + "multi_adapter_ties_reweighting_uneven_r", + combination_type="ties", + density=0.5, + ) + + with pytest.raises(ValueError): + model.add_weighted_adapter( + adapter_list[1:], + weight_list[1:], + "multi_adapter_dare_linear_reweighting_uneven_r", + combination_type="dare_linear", + density=0.5, + ) + + with pytest.raises(ValueError): + model.add_weighted_adapter( + adapter_list[1:], + weight_list[1:], + "multi_adapter_dare_ties_reweighting_uneven_r", + combination_type="dare_ties", + density=0.5, + ) + + with pytest.raises(ValueError): + model.add_weighted_adapter( + adapter_list[1:], + weight_list[1:], + "multi_adapter_magnitude_prune_reweighting_uneven_r", + combination_type="magnitude_prune", + density=0.5, + ) + + new_adapters = [ + "single_adapter_reweighting", + "multi_adapter_svd_reweighting", + "multi_adapter_ties_svd_reweighting", + "multi_adapter_dare_linear_svd_reweighting", + "multi_adapter_dare_ties_svd_reweighting", 
+ "multi_adapter_magnitude_prune_svd_reweighting", + "multi_adapter_cat_reweighting", + "multi_adapter_linear_reweighting", + "multi_adapter_linear_reweighting_single_enabled", + "multi_adapter_ties_reweighting", + "multi_adapter_dare_linear_reweighting", + "multi_adapter_dare_ties_reweighting", + "multi_adapter_magnitude_prune_reweighting", + ] + for new_adapter in new_adapters: + assert new_adapter in model.peft_config + + key_list = [key for key, _ in model.named_modules()] + for key in key_list: + _, target, _ = _get_submodules(model, key) + if isinstance(target, LoraLayer): + for adapter_name in new_adapters: + if "single" in adapter_name: + new_delta_weight = target.get_delta_weight(adapter_name) + weighted_original_delta_weights = target.get_delta_weight(adapter_list[0]) * weight_list[0] + sign = 1 if weight_list[0] > 0 else -1 + weighted_original_delta_weights = sign * weighted_original_delta_weights + assert torch.allclose(new_delta_weight, weighted_original_delta_weights, atol=1e-4, rtol=1e-4) + elif "svd" in adapter_name: + assert target.r[adapter_name] == 20 + elif "linear" in adapter_name: + assert target.r[adapter_name] == 8 + elif "cat" in adapter_name: + assert target.r[adapter_name] == 28 + + dummy_input = self.prepare_inputs_for_testing() + model.eval() + for adapter_name in new_adapters: + # ensuring new adapters pass the forward loop + model.set_adapter(adapter_name) + assert model.active_adapter == adapter_name + assert model.active_adapters == [adapter_name] + model(**dummy_input)[0] + + def _test_weighted_combination_of_adapters_ia3(self, model, config, adapter_list, weight_list): + model.add_adapter(adapter_list[1], config) + model.add_adapter(adapter_list[2], config) + model = model.to(self.torch_device) + + # test re-weighting single adapter + model.add_weighted_adapter([adapter_list[0]], [weight_list[0]], "single_adapter_reweighting") + + # test re-weighting with multiple adapters + model.add_weighted_adapter(adapter_list[1:], 
weight_list[1:], "multi_adapter_reweighting") + + new_adapters = [ + "single_adapter_reweighting", + "multi_adapter_reweighting", + ] + for new_adapter in new_adapters: + assert new_adapter in model.peft_config + + dummy_input = self.prepare_inputs_for_testing() + model.eval() + for adapter_name in new_adapters: + # ensuring new adapters pass the forward loop + model.set_adapter(adapter_name) + assert model.active_adapter == adapter_name + assert model.active_adapters == [adapter_name] + model(**dummy_input)[0] + + def _test_weighted_combination_of_adapters(self, model_id, config_cls, config_kwargs): + if issubclass(config_cls, AdaLoraConfig): + # AdaLora does not support adding more than 1 adapter + return pytest.skip(f"Test not applicable for {config_cls}") + if model_id.endswith("qwen2"): + # Qwen2 fails with weighted adapter combinations using SVD + return pytest.skip(f"Test does not work with model {model_id}") + if "gemma" in model_id.lower(): + return pytest.skip("Combining Gemma adapters with SVD is currently failing") + + adapter_list = ["adapter1", "adapter_2", "adapter_3"] + weight_list = [0.5, 1.5, 1.5] + negative_weight_list = [-0.5, -0.8, -1.2] + # Initialize the config + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + + if not isinstance(config, (LoraConfig, IA3Config)): + # This test is only applicable for Lora and IA3 configs + return pytest.skip(f"Test not applicable for {config}") + + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + model = get_peft_model(model, config, adapter_list[0]) + + if isinstance(config, LoraConfig): + self._test_weighted_combination_of_adapters_lora(model, config, adapter_list, weight_list) + self._test_weighted_combination_of_adapters_lora(model, config, adapter_list, negative_weight_list) + elif isinstance(config, IA3Config): + self._test_weighted_combination_of_adapters_ia3(model, config, adapter_list, weight_list) + 
self._test_weighted_combination_of_adapters_ia3(model, config, adapter_list, negative_weight_list) + else: + pytest.skip(f"Test not applicable for {config}") + + def _test_disable_adapter(self, model_id, config_cls, config_kwargs): + task_type = config_kwargs.get("task_type") + if (task_type == "SEQ_2_SEQ_LM") and (config_cls in (PromptTuningConfig, PromptEncoderConfig)): + self.skipTest("Seq2Seq + prompt tuning/prompt encoder does not work with disabling adapters") + + def get_output(model): + # helper function that works with different model types + torch.manual_seed(0) + + if hasattr(model, "generate"): + # let's check the scores, not the output ids, since the latter can easily be identical even if the + # weights are slightly changed + output = model.generate(**input, return_dict_in_generate=True, output_scores=True).scores[0] + # take element 0, as output is a tuple + else: + output = model(**input) + + if hasattr(output, "images"): # for SD + import numpy as np + + img = output.images[0] + return torch.from_numpy(np.array(img)) + + return output + + # initialize model + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) + + # output from BASE MODEL + input = self.prepare_inputs_for_testing() + output_before = get_output(model) + + # output from PEFT MODEL + if hasattr(self, "instantiate_sd_peft"): + # SD models are instantiated differently + peft_model = self.instantiate_sd_peft(model_id, config_cls, config_kwargs) + else: + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + peft_model = get_peft_model(model, config) + + # trainable_token_indices doesn't have support for `init_weights` so we have to do this manually + self.perturb_trainable_token_weights_if_used(model, config_kwargs) + + output_peft = get_output(peft_model) + + # first check trivial case is not true that peft does not affect the output; for this to work, init_weight + # must be False (if the config 
supports it) + if isinstance(peft_model, StableDiffusionPipeline): + # for SD, check that most pixels have different values + assert (output_before != output_peft).float().mean() > 0.8 + else: + assert not torch.allclose(output_before, output_peft) + + # output with DISABLED ADAPTER + if isinstance(peft_model, StableDiffusionPipeline): + with peft_model.unet.disable_adapter(): + with peft_model.text_encoder.disable_adapter(): + output_peft_disabled = get_output(peft_model) + # for SD, very rarely, a pixel can differ + assert (output_before != output_peft_disabled).float().mean() < 1e-4 + else: + atol, rtol = 1e-6, 1e-6 + if (platform.system() == "Windows") and (model_id == "trl-internal-testing/tiny-Llama4ForCausalLM"): + # for some reason, Windows CI fails with stricter tolerance + atol, rtol = 1e-5, 1e-5 + + with peft_model.disable_adapter(): + output_peft_disabled = get_output(peft_model) + assert torch.allclose(output_before, output_peft_disabled, atol=atol, rtol=rtol) + + # after leaving the disable_adapter context, the output should be the same as with enabled adapter again + # see #1501 + output_peft_after_disabled = get_output(peft_model) + assert torch.allclose(output_peft, output_peft_after_disabled, atol=atol, rtol=rtol) + + # TODO: add tests to check if disabling adapters works after calling merge_adapter + + def _test_adding_multiple_adapters_with_bias_raises(self, model_id, config_cls, config_kwargs): + # When trying to add multiple adapters with bias in Lora, AdaLora or BOFTConfig, an error should be + # raised. Also, the peft model should not be left in a half-initialized state. 
+ if not issubclass(config_cls, (LoraConfig, AdaLoraConfig, BOFTConfig)): + return pytest.skip(f"Test not applicable for {config_cls}") + + with hub_online_once(model_id): + config_kwargs = config_kwargs.copy() + config_kwargs["bias"] = "all" + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + + model = self.transformers_class.from_pretrained(model_id) + model = get_peft_model(model, config, "adapter0") + + if config_cls == LoraConfig or config_cls == AdaLoraConfig: + with pytest.raises(ValueError): + model.add_adapter("adapter1", replace(config, r=20)) + + if config_cls == BOFTConfig: + with pytest.raises(ValueError): + model.add_adapter("adapter1", replace(config, boft_block_num=1, boft_block_size=0)) + + # (superficial) test that the model is not left in a half-initialized state when adding an adapter fails + assert "adapter1" not in model.peft_config + assert "adapter1" not in model.base_model.peft_config + + def _test_passing_input_embeds_works(self, test_name, model_id, config_cls, config_kwargs): + # https://github.com/huggingface/peft/issues/727 + with hub_online_once(model_id): + model = self.transformers_class.from_pretrained(model_id) + config = config_cls( + base_model_name_or_path=model_id, + **config_kwargs, + ) + model = get_peft_model(model, config, adapter_name="test-adapter").to(self.torch_device) + dummy_input = self.prepare_inputs_for_testing() + inputs_embeds = model.get_input_embeddings()(dummy_input["input_ids"]) + # just check that no error is raised + model.forward(inputs_embeds=inputs_embeds) diff --git a/peft/tests/testing_utils.py b/peft/tests/testing_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ec06fb49e87622a926cbaa0d89771f7f2919f3b8 --- /dev/null +++ b/peft/tests/testing_utils.py @@ -0,0 +1,305 @@ +# Copyright 2023-present the HuggingFace Inc. team. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
from contextlib import contextmanager
from functools import lru_cache, wraps
from unittest import mock

import numpy as np
import pytest
import torch
from accelerate.test_utils.testing import get_backend
from datasets import load_dataset

from peft import (
    AdaLoraConfig,
    IA3Config,
    LNTuningConfig,
    LoraConfig,
    PromptLearningConfig,
    VBLoRAConfig,
)
from peft.import_utils import (
    is_aqlm_available,
    is_auto_awq_available,
    is_auto_gptq_available,
    is_eetq_available,
    is_gptqmodel_available,
    is_hqq_available,
    is_optimum_available,
    is_torchao_available,
)


# Globally shared model cache used by `hub_online_once`. Maps a model id (cache key) to the number of times it was
# requested again after the first (online) access.
_HUB_MODEL_ACCESSES = {}


torch_device, device_count, memory_allocated_func = get_backend()


def require_non_cpu(test_case):
    """
    Decorator marking a test that requires a hardware accelerator backend. These tests are skipped when there are no
    hardware accelerator available.
    """
    return unittest.skipUnless(torch_device != "cpu", "test requires a hardware accelerator")(test_case)


def require_non_xpu(test_case):
    """
    Decorator marking a test that should be skipped for XPU.
    """
    return unittest.skipUnless(torch_device != "xpu", "test requires a non-XPU")(test_case)


def require_torch_gpu(test_case):
    """
    Decorator marking a test that requires a GPU. Will be skipped when no GPU is available.
    """
    if not torch.cuda.is_available():
        return unittest.skip("test requires GPU")(test_case)
    else:
        return test_case


def require_torch_multi_gpu(test_case):
    """
    Decorator marking a test that requires multiple GPUs. Will be skipped when less than 2 GPUs are available.
    """
    if not torch.cuda.is_available() or torch.cuda.device_count() < 2:
        return unittest.skip("test requires multiple GPUs")(test_case)
    else:
        return test_case


def require_torch_multi_accelerator(test_case):
    """
    Decorator marking a test that requires multiple hardware accelerators. These tests are skipped on a machine without
    multiple accelerators.
    """
    return unittest.skipUnless(
        torch_device != "cpu" and device_count > 1, "test requires multiple hardware accelerators"
    )(test_case)


def require_bitsandbytes(test_case):
    """
    Decorator marking a test that requires the bitsandbytes library. Will be skipped when the library is not installed.

    When the library can be imported, the test is additionally tagged with the `bitsandbytes` pytest marker.
    """
    try:
        import bitsandbytes  # noqa: F401

        test_case = pytest.mark.bitsandbytes(test_case)
    except ImportError:
        test_case = pytest.mark.skip(reason="test requires bitsandbytes")(test_case)
    return test_case


def require_auto_gptq(test_case):
    """
    Decorator marking a test that requires auto-gptq. These tests are skipped when neither auto-gptq nor gptqmodel is
    installed.
    """
    return unittest.skipUnless(is_gptqmodel_available() or is_auto_gptq_available(), "test requires auto-gptq")(
        test_case
    )


def require_gptqmodel(test_case):
    """
    Decorator marking a test that requires gptqmodel. These tests are skipped when gptqmodel isn't installed.
    """
    return unittest.skipUnless(is_gptqmodel_available(), "test requires gptqmodel")(test_case)


def require_aqlm(test_case):
    """
    Decorator marking a test that requires aqlm. These tests are skipped when aqlm isn't installed.
    """
    return unittest.skipUnless(is_aqlm_available(), "test requires aqlm")(test_case)


def require_hqq(test_case):
    """
    Decorator marking a test that requires hqq. These tests are skipped when hqq isn't installed.
    """
    return unittest.skipUnless(is_hqq_available(), "test requires hqq")(test_case)


def require_auto_awq(test_case):
    """
    Decorator marking a test that requires auto-awq. These tests are skipped when auto-awq isn't installed.
    """
    return unittest.skipUnless(is_auto_awq_available(), "test requires auto-awq")(test_case)


def require_eetq(test_case):
    """
    Decorator marking a test that requires eetq. These tests are skipped when eetq isn't installed.
    """
    return unittest.skipUnless(is_eetq_available(), "test requires eetq")(test_case)


def require_optimum(test_case):
    """
    Decorator marking a test that requires optimum. These tests are skipped when optimum isn't installed.
    """
    return unittest.skipUnless(is_optimum_available(), "test requires optimum")(test_case)


def require_torchao(test_case):
    """
    Decorator marking a test that requires torchao. These tests are skipped when torchao isn't installed.
    """
    return unittest.skipUnless(is_torchao_available(), "test requires torchao")(test_case)


def require_deterministic_for_xpu(test_case):
    """
    Decorator that runs the test with `torch.use_deterministic_algorithms(True)` when the detected device is XPU. The
    previous setting is restored afterwards, even if the test raises. On all other devices the test runs unchanged.
    """

    @wraps(test_case)
    def wrapper(*args, **kwargs):
        if torch_device == "xpu":
            original_state = torch.are_deterministic_algorithms_enabled()
            try:
                torch.use_deterministic_algorithms(True)
                return test_case(*args, **kwargs)
            finally:
                torch.use_deterministic_algorithms(original_state)
        else:
            return test_case(*args, **kwargs)

    return wrapper


@contextmanager
def temp_seed(seed: int):
    """Temporarily set the random seed.

    This works for numpy and pytorch (including the CUDA generators when CUDA is available). The previous RNG states
    are restored when the context exits. Python's built-in `random` module is not touched.
    """
    # Query CUDA availability once so that saving and restoring the CUDA RNG states always agree.
    has_cuda = torch.cuda.is_available()

    np_state = np.random.get_state()
    np.random.seed(seed)

    torch_state = torch.random.get_rng_state()
    torch.random.manual_seed(seed)

    if has_cuda:
        torch_cuda_states = torch.cuda.get_rng_state_all()
        torch.cuda.manual_seed_all(seed)

    try:
        yield
    finally:
        np.random.set_state(np_state)

        torch.random.set_rng_state(torch_state)
        if has_cuda:
            torch.cuda.set_rng_state_all(torch_cuda_states)


def get_state_dict(model, unwrap_compiled=True):
    """
    Get the state dict of a model. If the model is compiled, unwrap it first.

    Unwrapping means using the model's `_orig_mod` attribute when it exists; pass `unwrap_compiled=False` to call
    `state_dict()` on the given object directly.
    """
    if unwrap_compiled:
        model = getattr(model, "_orig_mod", model)
    return model.state_dict()


@lru_cache
def load_dataset_english_quotes():
    # can't use pytest fixtures for now because of unittest style tests
    data = load_dataset("ybelkada/english_quotes_copy")
    return data


@lru_cache
def load_cat_image():
    # can't use pytest fixtures for now because of unittest style tests
    dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
    image = dataset["test"]["image"][0]
    return image


def set_init_weights_false(config_cls, kwargs):
    # helper function that sets the config kwargs such that the model is *not* initialized as an identity transform
    # note: works on a copy, the dict passed by the caller is not modified
    kwargs = kwargs.copy()

    # these config classes are returned unchanged, no init flag is set for them
    if issubclass(config_cls, PromptLearningConfig):
        return kwargs
    if config_cls in (LNTuningConfig, VBLoRAConfig):
        return kwargs

    if config_cls in (LoraConfig, AdaLoraConfig):
        kwargs["init_lora_weights"] = False
    elif config_cls == IA3Config:
        kwargs["init_ia3_weights"] = False
    else:
        kwargs["init_weights"] = False
    return kwargs


@contextmanager
def hub_online_once(model_id: str):
    """Set env[HF_HUB_OFFLINE]=1 (and patch transformers/huggingface_hub to think that it was always that way) for
    model ids that were already seen, to avoid contacting the hub twice for the same model id in the context. The
    global variable `_HUB_MODEL_ACCESSES` tracks the number of hits per model id between `hub_online_once` calls.

    The reason for doing a context manager and not patching specific methods (e.g., `from_pretrained`) is that there
    are a lot of places (`PeftConfig.from_pretrained`, `get_peft_state_dict`, `load_adapter`, ...) that possibly
    communicate with the hub to download files / check versions / etc.

    Note that using this context manager can cause problems when used in code sections that access different resources.
    Example:

    ```
    def test_something(model_id, config_kwargs):
        with hub_online_once(model_id):
            model = ...from_pretrained(model_id)
            self.do_something_specific_with_model(model)
    ```
    It is assumed that `do_something_specific_with_model` is an abstract method that is implemented by several tests.
    Imagine the first test simply does `model.generate([1,2,3])`. The second call from another test suite however uses
    a tokenizer (`AutoTokenizer.from_pretrained(model_id)`) - this will fail since the first pass was online but didn't
    use the tokenizer and we're now in offline mode and cannot fetch the tokenizer. The recommended workaround is to
    extend the cache key (`model_id` passed to `hub_online_once` in this case) by something in case the tokenizer is
    used, so that these tests don't share a cache pool with the tests that don't use a tokenizer.

    It is best to avoid using this context manager in *yield* fixtures (normal fixtures are fine) as this is equivalent
    to wrapping the whole test in the context manager without explicitly writing it out, leading to unexpected
    `HF_HUB_OFFLINE` behavior in the test body.
    """
    # No `global` statement needed: `_HUB_MODEL_ACCESSES` is only mutated in place, never rebound.
    override = {}

    try:
        if model_id in _HUB_MODEL_ACCESSES:
            # seen before: count the hit and go offline for the duration of the context
            override = {"HF_HUB_OFFLINE": "1"}
            _HUB_MODEL_ACCESSES[model_id] += 1
        else:
            # first access: stay online and register the model id with zero cached hits
            _HUB_MODEL_ACCESSES[model_id] = 0

        is_offline = override.get("HF_HUB_OFFLINE", False) == "1"
        with (
            # strictly speaking it is not necessary to set the environment variable since most code that's out there
            # is evaluating it at import time and we'd have to reload the modules for it to take effect. It's
            # probably still a good idea to have it if there's some dynamic code that checks it.
            mock.patch.dict(os.environ, override),
            mock.patch("huggingface_hub.constants.HF_HUB_OFFLINE", is_offline),
            mock.patch("transformers.utils.hub._is_offline_mode", is_offline),
        ):
            yield
    except Exception:
        # in case of an error we have to assume that we didn't access the model properly from the hub
        # for the first time, so the next call cannot be considered cached.
        if _HUB_MODEL_ACCESSES.get(model_id) == 0:
            del _HUB_MODEL_ACCESSES[model_id]
        raise